From 103fe0d8c064f6c5acbef812baa10abed5517d80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Mon, 25 Sep 2023 10:25:23 +0200 Subject: [PATCH 1/2] Add JSS citation information (#3381) --- CITATION.bib | 12 ++++++++++++ README.md | 7 ++++++- docs/src/index.md | 3 ++- 3 files changed, 20 insertions(+), 2 deletions(-) create mode 100644 CITATION.bib diff --git a/CITATION.bib b/CITATION.bib new file mode 100644 index 000000000..dedc38de8 --- /dev/null +++ b/CITATION.bib @@ -0,0 +1,12 @@ +@article{JSSv107i04, + title={DataFrames.jl: Flexible and Fast Tabular Data in Julia}, + volume={107}, + url={https://www.jstatsoft.org/index.php/jss/article/view/v107i04}, + doi={10.18637/jss.v107.i04}, + abstract={DataFrames.jl is a package written for and in the Julia language offering flexible and efficient handling of tabular data sets in memory. Thanks to Julia’s unique strengths, it provides an appealing set of features: Rich support for standard data processing tasks and excellent flexibility and efficiency for more advanced and non-standard operations. We present the fundamental design of the package and how it compares with implementations of data frames in other languages, its main features, performance, and possible extensions. We conclude with a practical illustration of typical data processing operations.}, + number={4}, + journal={Journal of Statistical Software}, + author={Bouchet-Valat, Milan and Kamiński, Bogumił}, + year={2023}, + pages={1--32} +} \ No newline at end of file diff --git a/README.md b/README.md index b51e91f3c..1a660f02c 100644 --- a/README.md +++ b/README.md @@ -31,4 +31,9 @@ that is available on GitHub. [docs-stable-img]: https://img.shields.io/badge/docs-stable-blue.svg [docs-stable-url]: http://dataframes.juliadata.org/stable/ -**Citing**: For now, the best way of citing this package is using the [Zenodo link](https://doi.org/10.5281/zenodo.7632427). 
+**Citing**: We encourage you to cite our work if you have used the DataFrames.jl package. +Starring the DataFrames.jl repository on GitHub is also appreciated. + +The citation information may be found in the [CITATION.bib](CITATION.bib) file within the repository: + +> Bouchet-Valat, M., & Kamiński, B. (2023). DataFrames.jl: Flexible and Fast Tabular Data in Julia. Journal of Statistical Software, 107(4), 1–32. https://doi.org/10.18637/jss.v107.i04 diff --git a/docs/src/index.md b/docs/src/index.md index 1d7511908..66ed6f3e5 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -8,6 +8,7 @@ running with tabular data manipulation using the DataFrames.jl package. For more illustrations of DataFrames.jl usage, in particular in conjunction with other packages you can check-out the following resources (they are kept up to date with the released version of DataFrames.jl): +* [DataFrames.jl: Flexible and Fast Tabular Data in Julia](https://www.jstatsoft.org/article/view/v107i04) article published in the *Journal of Statistical Software* * [Data Wrangling with DataFrames.jl Cheat Sheet](https://www.ahsmart.com/pub/data-wrangling-with-data-frames-jl-cheat-sheet/) * [DataFrames Tutorial using Jupyter Notebooks](https://github.com/bkamins/Julia-DataFrames-Tutorial/) * [Julia Academy DataFrames.jl tutorial](https://github.com/JuliaAcademy/DataFrames) @@ -277,7 +278,7 @@ missing please kindly report an issue during which it is deprecated. 
The situations where such a breaking change might be allowed are (still such breaking changes will be avoided if possible): - + * the affected functionality was previously clearly identified in the documentation as being subject to changes (for example in DataFrames.jl 1.4 release propagation rules of `:note`-style metadata are documented as such); From 7aec87da8fec28013f54c19e23e4ba98350a80f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Mon, 25 Sep 2023 22:20:04 +0200 Subject: [PATCH 2/2] add cols kwarg to rename/rename! (#3380) --- NEWS.md | 9 +++++ src/abstractdataframe/abstractdataframe.jl | 42 +++++++++++++++------- src/other/index.jl | 2 -- test/dataframe.jl | 13 +++++++ test/index.jl | 6 +--- 5 files changed, 52 insertions(+), 20 deletions(-) diff --git a/NEWS.md b/NEWS.md index 13455915d..bc0606d74 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,12 @@ +# DataFrames.jl v1.7.0 Release Notes + +## New functionalities + +* `rename` and `rename!` now allow applying a function transforming + column names only to a subset of the columns specified by the `cols` + keyword argument + ([#3380](https://github.com/JuliaData/DataFrames.jl/pull/3380)) + # DataFrames.jl v1.6.1 Release Notes ## Bug fixes diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index a40627c6a..600601506 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -123,7 +123,7 @@ Compat.hasproperty(df::AbstractDataFrame, s::AbstractString) = haskey(index(df), rename!(df::AbstractDataFrame, (from => to)::Pair...) rename!(df::AbstractDataFrame, d::AbstractDict) rename!(df::AbstractDataFrame, d::AbstractVector{<:Pair}) - rename!(f::Function, df::AbstractDataFrame) + rename!(f::Function, df::AbstractDataFrame; cols=All()) Rename columns of `df` in-place. Each name is changed at most once. Permutation of names is allowed. 
@@ -132,8 +132,10 @@ Each name is changed at most once. Permutation of names is allowed. - `df` : the `AbstractDataFrame` - `d` : an `AbstractDict` or an `AbstractVector` of `Pair`s that maps the original names or column numbers to new names -- `f` : a function which for each column takes the old name as a `String` - and returns the new name that gets converted to a `Symbol` +- `f` : a function which for each column selected by the `cols` keyword argument + takes the old name as a `String` + and returns the new name that gets converted to a `Symbol`; the `cols` + column selector can be any value accepted as column selector by the `names` function - `vals` : new column names as a vector of `Symbol`s or `AbstractString`s of the same length as the number of columns in `df` - `makeunique` : if `false` (the default), an error will be raised @@ -194,6 +196,14 @@ julia> rename!(uppercase, df) │ Int64 Int64 Int64 ─────┼───────────────────── 1 │ 1 2 3 + +julia> rename!(lowercase, df, cols=contains('A')) +1×3 DataFrame + Row │ a B a_1 + │ Int64 Int64 Int64 +─────┼───────────────────── + 1 │ 1 2 3 + ``` """ function rename!(df::AbstractDataFrame, vals::AbstractVector{Symbol}; @@ -252,12 +262,8 @@ end rename!(df::AbstractDataFrame, args::Pair...) = rename!(df, collect(args)) -function rename!(f::Function, df::AbstractDataFrame) - rename!(f, index(df)) - # renaming columns of SubDataFrame has to clean non-note metadata in its parent - _drop_all_nonnote_metadata!(parent(df)) - return df -end +rename!(f::Function, df::AbstractDataFrame; cols=All()) = + rename!(df, [n => Symbol(f(n)) for n in names(df, cols)]) """ rename(df::AbstractDataFrame, vals::AbstractVector{Symbol}; @@ -267,7 +273,7 @@ end rename(df::AbstractDataFrame, (from => to)::Pair...) 
rename(df::AbstractDataFrame, d::AbstractDict) rename(df::AbstractDataFrame, d::AbstractVector{<:Pair}) - rename(f::Function, df::AbstractDataFrame) + rename(f::Function, df::AbstractDataFrame; cols=All()) Create a new data frame that is a copy of `df` with changed column names. Each name is changed at most once. Permutation of names is allowed. @@ -277,8 +283,10 @@ Each name is changed at most once. Permutation of names is allowed. only allowed if it was created using `:` as a column selector. - `d` : an `AbstractDict` or an `AbstractVector` of `Pair`s that maps the original names or column numbers to new names -- `f` : a function which for each column takes the old name as a `String` - and returns the new name that gets converted to a `Symbol` +- `f` : a function which for each column selected by the `cols` keyword argument + takes the old name as a `String` + and returns the new name that gets converted to a `Symbol`; the `cols` + column selector can be any value accepted as column selector by the `names` function - `vals` : new column names as a vector of `Symbol`s or `AbstractString`s of the same length as the number of columns in `df` - `makeunique` : if `false` (the default), an error will be raised @@ -350,6 +358,14 @@ julia> rename(uppercase, df) │ Int64 Int64 Int64 ─────┼───────────────────── 1 │ 1 2 3 + +julia> rename(uppercase, df, cols=contains('x')) +1×3 DataFrame + Row │ i X y + │ Int64 Int64 Int64 +─────┼───────────────────── + 1 │ 1 2 3 + ``` """ rename(df::AbstractDataFrame, vals::AbstractVector{Symbol}; @@ -357,7 +373,7 @@ rename(df::AbstractDataFrame, vals::AbstractVector{Symbol}; rename(df::AbstractDataFrame, vals::AbstractVector{<:AbstractString}; makeunique::Bool=false) = rename!(copy(df), vals, makeunique=makeunique) rename(df::AbstractDataFrame, args...) = rename!(copy(df), args...) 
-rename(f::Function, df::AbstractDataFrame) = rename!(f, copy(df)) +rename(f::Function, df::AbstractDataFrame; cols=All()) = rename!(f, copy(df); cols=cols) """ size(df::AbstractDataFrame[, dim]) diff --git a/src/other/index.jl b/src/other/index.jl index 51aa3a31c..ae9358d38 100644 --- a/src/other/index.jl +++ b/src/other/index.jl @@ -108,8 +108,6 @@ function rename!(x::Index, nms::AbstractVector{Pair{Symbol, Symbol}}) return x end -rename!(f::Function, x::Index) = rename!(x, [(n=>Symbol(f(string(n)))) for n in x.names]) - # we do not define keys on purpose; # use names to get keys as strings with copying # or _names to get keys as Symbols without copying diff --git a/test/dataframe.jl b/test/dataframe.jl index 971d7626d..fbc2ec0ca 100644 --- a/test/dataframe.jl +++ b/test/dataframe.jl @@ -1112,6 +1112,19 @@ end df = DataFrame(A=1) asview && (df=view(df, :, :)) @test rename(x -> 1, df) == DataFrame(Symbol("1") => 1) + + for cols in (:B, Not("A"), Cols(2), Char, contains('B')) + df = DataFrame(A=1:3, B='A':'C') + asview && (df = view(df, :, :)) + @test names(rename(lowercase, df, cols=cols)) == ["A", "b"] + @test names(df) == ["A", "B"] + rename!(lowercase, df, cols=cols) + @test names(df) == ["A", "b"] + end + df = DataFrame(A=1:3, B='A':'C') + asview && (df = view(df, :, :)) + @test names(rename(lowercase, df, cols=[:A, :B])) == ["a", "b"] + @test names(rename(lowercase, df, cols=Not(:))) == ["A", "B"] end sdf = view(DataFrame(ones(2, 3), :auto), 1:2, 1:3) diff --git a/test/index.jl b/test/index.jl index fc82540ea..263b90bbb 100644 --- a/test/index.jl +++ b/test/index.jl @@ -50,7 +50,7 @@ using DataFrames: Index, SubIndex, fuzzymatch @test_throws ArgumentError i[Not(:x)] @test_throws ArgumentError i[Not("x")] @test_throws BoundsError i[Not(1:3)] - + @test i[Not([1, 1])] == [2] @test i[Not([:A, :A])] == [2] @test i[Not(["A", "A"])] == [2] @@ -84,10 +84,6 @@ end @test rename!(copy(i), [:a => :A]) == Index([:A, :b]) @test rename!(copy(i), [:a => :a]) == Index([:a, 
:b]) @test rename!(copy(i), [:a => :b, :b => :a]) == Index([:b, :a]) - @test rename!(x -> Symbol(uppercase(string(x))), copy(i)) == Index([:A, :B]) - @test rename!(x -> Symbol(lowercase(string(x))), copy(i)) == Index([:a, :b]) - @test rename!(uppercase, copy(i)) == Index([:A, :B]) - @test rename!(lowercase, copy(i)) == Index([:a, :b]) @test delete!(i, :a) == Index([:b]) push!(i, :C)