Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add cols to mapcols and mapcols! #3386

Merged
merged 12 commits into from
Oct 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@
column names only to a subset of the columns specified by the `cols`
keyword argument
([#3380](https://github.com/JuliaData/DataFrames.jl/pull/3380))
* `mapcols` and `mapcols!` now allow applying a column-transforming
function only to the subset of columns specified by the `cols`
keyword argument
([#3386](https://github.com/JuliaData/DataFrames.jl/pull/3386))

## Bug fixes

Expand Down
78 changes: 61 additions & 17 deletions src/abstractdataframe/iteration.jl
Original file line number Diff line number Diff line change
Expand Up @@ -107,20 +107,20 @@ as a `DataFrameRows` over a view of rows of parent of `dfr`.
julia> collect(Iterators.partition(eachrow(DataFrame(x=1:5)), 2))
3-element Vector{DataFrames.DataFrameRows{SubDataFrame{DataFrame, DataFrames.Index, UnitRange{Int64}}}}:
2×1 DataFrameRows
Row │ x
│ Int64
Row │ x
│ Int64
─────┼───────
1 │ 1
2 │ 2
2×1 DataFrameRows
Row │ x
│ Int64
Row │ x
│ Int64
─────┼───────
1 │ 3
2 │ 4
1×1 DataFrameRows
Row │ x
│ Int64
Row │ x
│ Int64
─────┼───────
1 │ 5
```
Expand Down Expand Up @@ -408,12 +408,17 @@ Base.show(dfcs::DataFrameColumns;
summary=summary, eltypes=eltypes, truncate=truncate, kwargs...)

"""
mapcols(f::Union{Function, Type}, df::AbstractDataFrame)
mapcols(f::Union{Function, Type}, df::AbstractDataFrame; cols=All())

Return a `DataFrame` where each column of `df` selected by `cols` (by default, all columns)
is transformed using function `f`.
Columns not selected by `cols` are copied.

Return a `DataFrame` where each column of `df` is transformed using function `f`.
`f` must return `AbstractVector` objects all with the same length or scalars
(all values other than `AbstractVector` are considered to be a scalar).

The `cols` column selector can be any value accepted as a column selector by the `names` function.

Note that `mapcols` guarantees not to reuse the columns from `df` in the returned
`DataFrame`. If `f` returns its argument then it gets copied before being stored.

Expand All @@ -440,15 +445,32 @@ julia> mapcols(x -> x.^2, df)
2 │ 4 144
3 │ 9 169
4 │ 16 196

julia> mapcols(x -> x.^2, df, cols=r"y")
4×2 DataFrame
Row │ x y
│ Int64 Int64
─────┼──────────────
1 │ 1 121
2 │ 2 144
3 │ 3 169
4 │ 4 196
```
"""
function mapcols(f::Union{Function, Type}, df::AbstractDataFrame)
function mapcols(f::Union{Function, Type}, df::AbstractDataFrame; cols=All())
if cols === All() || cols === Colon()
apply = Iterators.repeated(true)
else
picked = Set(names(df, cols))
nalimilan marked this conversation as resolved.
Show resolved Hide resolved
apply = Bool[name in picked for name in names(df)]
end

# note: `f` must return a consistent length
vs = AbstractVector[]
seenscalar = false
seenvector = false
for v in eachcol(df)
fv = f(v)
for (v, doapply) in zip(eachcol(df), apply)
fv = doapply ? f(v) : copy(v)
if fv isa AbstractVector
if seenscalar
throw(ArgumentError("mixing scalars and vectors in mapcols not allowed"))
Expand All @@ -470,9 +492,12 @@ function mapcols(f::Union{Function, Type}, df::AbstractDataFrame)
end

"""
mapcols!(f::Union{Function, Type}, df::DataFrame)
mapcols!(f::Union{Function, Type}, df::DataFrame; cols=All())

Update a `DataFrame` in-place where each column of `df` selected by `cols` (by default, all columns)
is transformed using function `f`.
Columns not selected by `cols` are left unchanged.

Update a `DataFrame` in-place where each column of `df` is transformed using function `f`.
`f` must return `AbstractVector` objects all with the same length or scalars
(all values other than `AbstractVector` are considered to be a scalar).

Expand Down Expand Up @@ -503,20 +528,39 @@ julia> df
2 │ 4 144
3 │ 9 169
4 │ 16 196

julia> mapcols!(x -> 2 * x, df, cols=r"x");

julia> df
4×2 DataFrame
Row │ x y
│ Int64 Int64
─────┼──────────────
1 │ 2 121
2 │ 8 144
3 │ 18 169
4 │ 32 196
```
"""
function mapcols!(f::Union{Function, Type}, df::DataFrame)
# note: `f` must return a consistent length
function mapcols!(f::Union{Function,Type}, df::DataFrame; cols=All())
if ncol(df) == 0 # skip if no columns
_drop_all_nonnote_metadata!(df)
return df
end

if cols === All() || cols === Colon()
apply = Iterators.repeated(true)
else
picked = Set(names(df, cols))
apply = Bool[name in picked for name in names(df)]
end

# note: `f` must return a consistent length
vs = AbstractVector[]
seenscalar = false
seenvector = false
for v in eachcol(df)
fv = f(v)
for (v, doapply) in zip(eachcol(df), apply)
fv = doapply ? f(v) : v
if fv isa AbstractVector
if seenscalar
throw(ArgumentError("mixing scalars and vectors in mapcols not allowed"))
Expand Down
25 changes: 25 additions & 0 deletions test/iteration.jl
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,19 @@ end
df = mapcols(x -> 2:2, df)
@test df == DataFrame(a=2)
@test df.a isa Vector{Int}

df = DataFrame(a1=[1, 2], a2=[2, 3], b=[3, 4])
@test mapcols(x -> 2x, df, cols=r"a") == DataFrame(a1=[2, 4], a2=[4, 6], b=[3, 4])
@test mapcols(x -> 2x, df, cols="b") == DataFrame(a1=[1, 2], a2=[2, 3], b=[6, 8])
@test mapcols(x -> 2x, df, cols=Not(r"a")) == DataFrame(a1=[1, 2], a2=[2, 3], b=[6, 8])
@test mapcols(x -> 2x, df, cols=Int) == DataFrame(a1=[2, 4], a2=[4, 6], b=[6, 8])
@test mapcols(x -> 2x, df, cols=Not(All())) == DataFrame(a1=[1, 2], a2=[2, 3], b=[3, 4])
@test mapcols(x -> 2x, df, cols=:) == DataFrame(a1=[2, 4], a2=[4, 6], b=[6, 8])

df2 = mapcols(x -> 2x, df, cols="b")
@test df2.a1 == df.a1 && df2.a1 !== df.a1
@test df2.a2 == df.a2 && df2.a2 !== df.a2
@test df2.b == 2*df.b
end
bkamins marked this conversation as resolved.
Show resolved Hide resolved

@testset "mapcols!" begin
Expand Down Expand Up @@ -109,6 +122,18 @@ end
mapcols!(x -> 2:2, df)
@test df == DataFrame(a=2)
@test df.a isa Vector{Int}

df = DataFrame(a1=[1, 2], a2=[2, 3], b=[3, 4])
@test mapcols!(x -> 2x, copy(df), cols=r"a") == DataFrame(a1=[2, 4], a2=[4, 6], b=[3, 4])
@test mapcols!(x -> 2x, copy(df), cols="b") == DataFrame(a1=[1, 2], a2=[2, 3], b=[6, 8])
@test mapcols!(x -> 2x, copy(df), cols=Not(r"a")) == DataFrame(a1=[1, 2], a2=[2, 3], b=[6, 8])
@test mapcols!(x -> 2x, copy(df), cols=Int) == DataFrame(a1=[2, 4], a2=[4, 6], b=[6, 8])
@test mapcols!(x -> 2x, copy(df), cols=Not(All())) == DataFrame(a1=[1, 2], a2=[2, 3], b=[3, 4])
@test mapcols!(x -> 2x, copy(df), cols=:) == DataFrame(a1=[2, 4], a2=[4, 6], b=[6, 8])
a1, a2, b = eachcol(df)
mapcols!(x -> 2x, df, cols=Not(All()))
@test df == DataFrame(a1=[1, 2], a2=[2, 3], b=[3, 4])
@test df.a1 === a1 && df.a2 === a2 && df.b === b
end

@testset "SubDataFrame" begin
Expand Down
Loading