From 1a5da8a1d79f457f9f97080eef8cf2758ec63ca0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 17 Oct 2023 11:57:28 +0200 Subject: [PATCH] add cols to mapcols and mapcols! (#3386) --- NEWS.md | 4 ++ src/abstractdataframe/iteration.jl | 78 +++++++++++++++++++++++------- test/iteration.jl | 25 ++++++++++ 3 files changed, 90 insertions(+), 17 deletions(-) diff --git a/NEWS.md b/NEWS.md index 8a46730da..14b1f1fd1 100644 --- a/NEWS.md +++ b/NEWS.md @@ -9,6 +9,10 @@ column names only to a subset of the columns specified by the `cols` keyword argument ([#3380](https://github.com/JuliaData/DataFrames.jl/pull/3380)) +* `mapcols` and `mapcols!` now allow applying a function transforming + columns only to a subset of the columns specified by the `cols` + keyword argument + ([#3386](https://github.com/JuliaData/DataFrames.jl/pull/3386)) ## Bug fixes diff --git a/src/abstractdataframe/iteration.jl b/src/abstractdataframe/iteration.jl index 3fdc0523d..c81228fb1 100644 --- a/src/abstractdataframe/iteration.jl +++ b/src/abstractdataframe/iteration.jl @@ -107,20 +107,20 @@ as a `DataFrameRows` over a view of rows of parent of `dfr`. julia> collect(Iterators.partition(eachrow(DataFrame(x=1:5)), 2)) 3-element Vector{DataFrames.DataFrameRows{SubDataFrame{DataFrame, DataFrames.Index, UnitRange{Int64}}}}: 2×1 DataFrameRows - Row │ x - │ Int64 + Row │ x + │ Int64 ─────┼─────── 1 │ 1 2 │ 2 2×1 DataFrameRows - Row │ x - │ Int64 + Row │ x + │ Int64 ─────┼─────── 1 │ 3 2 │ 4 1×1 DataFrameRows - Row │ x - │ Int64 + Row │ x + │ Int64 ─────┼─────── 1 │ 5 ``` @@ -408,12 +408,17 @@ Base.show(dfcs::DataFrameColumns; summary=summary, eltypes=eltypes, truncate=truncate, kwargs...) """ - mapcols(f::Union{Function, Type}, df::AbstractDataFrame) + mapcols(f::Union{Function, Type}, df::AbstractDataFrame; cols=All()) + +Return a `DataFrame` where each column of `df` selected by `cols` (by default, all columns) +is transformed using function `f`. 
+Columns not selected by `cols` are copied. -Return a `DataFrame` where each column of `df` is transformed using function `f`. `f` must return `AbstractVector` objects all with the same length or scalars (all values other than `AbstractVector` are considered to be a scalar). +The `cols` column selector can be any value accepted as a column selector by the `names` function. + Note that `mapcols` guarantees not to reuse the columns from `df` in the returned `DataFrame`. If `f` returns its argument then it gets copied before being stored. @@ -440,15 +445,32 @@ julia> mapcols(x -> x.^2, df) 2 │ 4 144 3 │ 9 169 4 │ 16 196 + +julia> mapcols(x -> x.^2, df, cols=r"y") +4×2 DataFrame + Row │ x y + │ Int64 Int64 +─────┼────────────── + 1 │ 1 121 + 2 │ 2 144 + 3 │ 3 169 + 4 │ 4 196 ``` """ -function mapcols(f::Union{Function, Type}, df::AbstractDataFrame) +function mapcols(f::Union{Function, Type}, df::AbstractDataFrame; cols=All()) + if cols === All() || cols === Colon() + apply = Iterators.repeated(true) + else + picked = Set(names(df, cols)) + apply = Bool[name in picked for name in names(df)] + end + # note: `f` must return a consistent length vs = AbstractVector[] seenscalar = false seenvector = false - for v in eachcol(df) - fv = f(v) + for (v, doapply) in zip(eachcol(df), apply) + fv = doapply ? f(v) : copy(v) if fv isa AbstractVector if seenscalar throw(ArgumentError("mixing scalars and vectors in mapcols not allowed")) @@ -470,9 +492,12 @@ function mapcols(f::Union{Function, Type}, df::AbstractDataFrame) end """ - mapcols!(f::Union{Function, Type}, df::DataFrame) + mapcols!(f::Union{Function, Type}, df::DataFrame; cols=All()) + +Update a `DataFrame` in-place where each column of `df` selected by `cols` (by default, all columns) +is transformed using function `f`. +Columns not selected by `cols` are left unchanged. -Update a `DataFrame` in-place where each column of `df` is transformed using function `f`. 
`f` must return `AbstractVector` objects all with the same length or scalars (all values other than `AbstractVector` are considered to be a scalar). @@ -503,20 +528,39 @@ julia> df 2 │ 4 144 3 │ 9 169 4 │ 16 196 + +julia> mapcols!(x -> 2 * x, df, cols=r"x"); + +julia> df +4×2 DataFrame + Row │ x y + │ Int64 Int64 +─────┼────────────── + 1 │ 2 121 + 2 │ 8 144 + 3 │ 18 169 + 4 │ 32 196 ``` """ -function mapcols!(f::Union{Function, Type}, df::DataFrame) - # note: `f` must return a consistent length +function mapcols!(f::Union{Function,Type}, df::DataFrame; cols=All()) if ncol(df) == 0 # skip if no columns _drop_all_nonnote_metadata!(df) return df end + if cols === All() || cols === Colon() + apply = Iterators.repeated(true) + else + picked = Set(names(df, cols)) + apply = Bool[name in picked for name in names(df)] + end + + # note: `f` must return a consistent length vs = AbstractVector[] seenscalar = false seenvector = false - for v in eachcol(df) - fv = f(v) + for (v, doapply) in zip(eachcol(df), apply) + fv = doapply ? 
f(v) : v if fv isa AbstractVector if seenscalar throw(ArgumentError("mixing scalars and vectors in mapcols not allowed")) diff --git a/test/iteration.jl b/test/iteration.jl index 2202b33d9..4c1b9d0d1 100644 --- a/test/iteration.jl +++ b/test/iteration.jl @@ -78,6 +78,19 @@ end df = mapcols(x -> 2:2, df) @test df == DataFrame(a=2) @test df.a isa Vector{Int} + + df = DataFrame(a1=[1, 2], a2=[2, 3], b=[3, 4]) + @test mapcols(x -> 2x, df, cols=r"a") == DataFrame(a1=[2, 4], a2=[4, 6], b=[3, 4]) + @test mapcols(x -> 2x, df, cols="b") == DataFrame(a1=[1, 2], a2=[2, 3], b=[6, 8]) + @test mapcols(x -> 2x, df, cols=Not(r"a")) == DataFrame(a1=[1, 2], a2=[2, 3], b=[6, 8]) + @test mapcols(x -> 2x, df, cols=Int) == DataFrame(a1=[2, 4], a2=[4, 6], b=[6, 8]) + @test mapcols(x -> 2x, df, cols=Not(All())) == DataFrame(a1=[1, 2], a2=[2, 3], b=[3, 4]) + @test mapcols(x -> 2x, df, cols=:) == DataFrame(a1=[2, 4], a2=[4, 6], b=[6, 8]) + + df2 = mapcols(x -> 2x, df, cols="b") + @test df2.a1 == df.a1 && df2.a1 !== df.a1 + @test df2.a2 == df.a2 && df2.a2 !== df.a2 + @test df2.b == 2*df.b end @testset "mapcols!" 
begin @@ -109,6 +122,18 @@ end mapcols!(x -> 2:2, df) @test df == DataFrame(a=2) @test df.a isa Vector{Int} + + df = DataFrame(a1=[1, 2], a2=[2, 3], b=[3, 4]) + @test mapcols!(x -> 2x, copy(df), cols=r"a") == DataFrame(a1=[2, 4], a2=[4, 6], b=[3, 4]) + @test mapcols!(x -> 2x, copy(df), cols="b") == DataFrame(a1=[1, 2], a2=[2, 3], b=[6, 8]) + @test mapcols!(x -> 2x, copy(df), cols=Not(r"a")) == DataFrame(a1=[1, 2], a2=[2, 3], b=[6, 8]) + @test mapcols!(x -> 2x, copy(df), cols=Int) == DataFrame(a1=[2, 4], a2=[4, 6], b=[6, 8]) + @test mapcols!(x -> 2x, copy(df), cols=Not(All())) == DataFrame(a1=[1, 2], a2=[2, 3], b=[3, 4]) + @test mapcols!(x -> 2x, copy(df), cols=:) == DataFrame(a1=[2, 4], a2=[4, 6], b=[6, 8]) + a1, a2, b = eachcol(df) + mapcols!(x -> 2x, df, cols=Not(All())) + @test df == DataFrame(a1=[1, 2], a2=[2, 3], b=[3, 4]) + @test df.a1 === a1 && df.a2 === a2 && df.b === b end @testset "SubDataFrame" begin