From 831f010acb5343f16c3729902cb8e5670f654061 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 21 Oct 2023 19:29:39 +0200 Subject: [PATCH 01/13] fix nonunique bug (#3393) --- NEWS.md | 3 +++ src/abstractdataframe/unique.jl | 2 +- src/groupeddataframe/utils.jl | 6 +++++- test/duplicates.jl | 9 ++++++--- test/grouping.jl | 5 +++++ 5 files changed, 20 insertions(+), 5 deletions(-) diff --git a/NEWS.md b/NEWS.md index 14b1f1fd16..f6d51b1f32 100644 --- a/NEWS.md +++ b/NEWS.md @@ -20,6 +20,9 @@ instead of using the interactive thread pool when Julia was started with `-tM,N` with N > 0 ([#3385](https://github.com/JuliaData/DataFrames.jl/pull/3385)) +* Correctly return `Bool[]` in the `nonunique` function applied to a data frame + with a pulled column that has zero levels in the pool + ([#3393](https://github.com/JuliaData/DataFrames.jl/pull/3393)) # DataFrames.jl v1.6.1 Release Notes diff --git a/src/abstractdataframe/unique.jl b/src/abstractdataframe/unique.jl index fd94caac66..9441b825c0 100644 --- a/src/abstractdataframe/unique.jl +++ b/src/abstractdataframe/unique.jl @@ -87,7 +87,7 @@ function nonunique(df::AbstractDataFrame; keep::Symbol=:first) if !(keep in (:first, :last, :noduplicates)) throw(ArgumentError("`keep` must be :first, :last, or :noduplicates")) end - ncol(df) == 0 && return Bool[] + nrow(df) == 0 && return Bool[] res = fill(true, nrow(df)) cols = ntuple(i -> df[!, i], ncol(df)) if keep == :first diff --git a/src/groupeddataframe/utils.jl b/src/groupeddataframe/utils.jl index 1a58b7ec44..0e4b1d9840 100644 --- a/src/groupeddataframe/utils.jl +++ b/src/groupeddataframe/utils.jl @@ -337,7 +337,11 @@ function row_group_slots!(cols::NTuple{N, AbstractVector}, nt = max(1, lg ÷ 100_000) end # if there are few rows per group limit the number of threads used - nt = clamp(round(Int, (lg / 4) / ngroups - 2), 1, nt) + if ngroups == 0 + nt = 1 + else + nt = clamp(round(Int, (lg / 4) / ngroups - 2), 1, nt) + end seen = fill(false, 
ngroups) seen_vec = Vector{Vector{Bool}}(undef, nt) diff --git a/test/duplicates.jl b/test/duplicates.jl index 61c01874d2..d8c264962b 100644 --- a/test/duplicates.jl +++ b/test/duplicates.jl @@ -1,6 +1,6 @@ module TestDuplicates -using Test, DataFrames, CategoricalArrays, Random +using Test, DataFrames, CategoricalArrays, Random, PooledArrays const ≅ = isequal @testset "nonunique" begin @@ -30,8 +30,8 @@ const ≅ = isequal @test_throws ArgumentError unique!(df) @test_throws ArgumentError unique(df, true) - pdf = view(DataFrame(a=CategoricalArray(["a", "a", missing, missing, "b", missing, "a", missing]), - b=CategoricalArray(["a", "b", missing, missing, "b", "a", "a", "a"])), :, :) + pdf = view(DataFrame(a=CategoricalArray(["a", "a", missing, missing, "b", missing, "a", missing]), + b=CategoricalArray(["a", "b", missing, missing, "b", "a", "a", "a"])), :, :) updf = DataFrame(a=CategoricalArray(["a", "a", missing, "b", missing]), b=CategoricalArray(["a", "b", missing, "b", "a"])) @test nonunique(pdf) == [false, false, false, true, false, false, true, true] @@ -39,6 +39,9 @@ const ≅ = isequal @test updf ≅ unique(pdf) @test_throws ArgumentError unique!(pdf) @test_throws ArgumentError unique(pdf, true) + + @test isempty(nonunique(DataFrame(a=PooledArray(Int[])))) + @test typeof(nonunique(DataFrame(a=PooledArray(Int[])))) === Vector{Bool} end @testset "nonunique, nonunique, unique! 
with extra argument" begin diff --git a/test/grouping.jl b/test/grouping.jl index df9e79bd35..a1283c6568 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -4531,4 +4531,9 @@ end end end +@testset "no levels in pooled grouping bug #3393" begin + @test isempty(groupby_checked(DataFrame(x=PooledArray([missing])), :x, skipmissing=true)) + @test isempty(groupby_checked(DataFrame(x=categorical([missing])), :x, skipmissing=true)) +end + end # module From 8c32d537b1d5a2ae9b1cdd72928def18aeae0bf9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 21 Oct 2023 23:06:35 +0200 Subject: [PATCH 02/13] remove unnecessary @time (#3394) --- test/join.jl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/join.jl b/test/join.jl index 478cca98d3..0453d3b63f 100644 --- a/test/join.jl +++ b/test/join.jl @@ -84,7 +84,7 @@ anti = left[Bool[ismissing(x) for x in left.Job], [:ID, :Name]] @test_throws ArgumentError crossjoin(df1, df2, renamecols=(x -> "a") => x -> "a") @test crossjoin(df1, df2, renamecols=(x -> "a") => x -> "a", makeunique=true) == rename(cross, [:a, :a_1, :a_2]) - + # Cross joins handle naming collisions @test size(crossjoin(df1, df1, makeunique=true)) == (4, 4) @@ -2176,7 +2176,7 @@ end @test_throws ArgumentError outerjoin(df1, df2, on=:x, order=:x) end -@time @testset "randomized join tests with sort" begin +@testset "randomized join tests with sort" begin Random.seed!(1234) for lenl in 0:20, lenr in 0:20, rep in 1:10 df1 = DataFrame(x=rand(0:lenl, lenl), id1=1:lenl) @@ -2221,7 +2221,7 @@ end @testset "wide joins" begin Random.seed!(1234) # we need many repetitions to make sure we cover all cases - @time for _ in 1:1000, k in 2:4 + for _ in 1:1000, k in 2:4 dfs = [(n=rand(10:20); DataFrame("id" => randperm(n), "x$i" => 1:n)) for i in 1:4] @test issorted(innerjoin(dfs..., on="id", order=:left)[:, 2]) @@ -2232,9 +2232,9 @@ end dfs = [DataFrame("id" => 0, "x$i" => i) for i in 1:10000] res = innerjoin(dfs..., 
on="id") - @test res == DataFrame(["id" => 0; ["x$i" => i for i in 1:10000]]) + @test res == DataFrame(["id" => 0; ["x$i" => i for i in 1:10000]]) res = outerjoin(dfs..., on="id") - @test res == DataFrame(["id" => 0; ["x$i" => i for i in 1:10000]]) + @test res == DataFrame(["id" => 0; ["x$i" => i for i in 1:10000]]) end end # module From 6a450f8db5835867a36dc7055c1d5209e0a831db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Mon, 13 Nov 2023 11:05:28 +0100 Subject: [PATCH 03/13] fix first and last for negative row count (#3402) --- NEWS.md | 3 +++ src/abstractdataframe/abstractdataframe.jl | 18 ++++++++++++++---- test/dataframe.jl | 14 ++++++++++---- 3 files changed, 27 insertions(+), 8 deletions(-) diff --git a/NEWS.md b/NEWS.md index f6d51b1f32..b5ad2ff692 100644 --- a/NEWS.md +++ b/NEWS.md @@ -16,6 +16,9 @@ ## Bug fixes +* Correctly throw an error if negative number of rows is passed + to `first` or `last` + ([#3402](https://github.com/JuliaData/DataFrames.jl/pull/3402)) * Always use the default thread pool for multithreaded operations, instead of using the interactive thread pool when Julia was started with `-tM,N` with N > 0 diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index a812365ee9..2b99ce4623 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -558,14 +558,19 @@ Base.first(df::AbstractDataFrame) = df[1, :] first(df::AbstractDataFrame, n::Integer; view::Bool=false) Get a data frame with the `n` first rows of `df`. +Get all rows if `n` is greater than the number of rows in `df`. +Error if `n` is negative. If `view=false` a freshly allocated `DataFrame` is returned. If `view=true` then a `SubDataFrame` view into `df` is returned. $METADATA_FIXED """ -@inline Base.first(df::AbstractDataFrame, n::Integer; view::Bool=false) = - view ? 
Base.view(df, 1:min(n ,nrow(df)), :) : df[1:min(n, nrow(df)), :] +@inline function Base.first(df::AbstractDataFrame, n::Integer; view::Bool=false) + n < 0 && throw(ArgumentError("Number of elements must be nonnegative")) + r = min(n, nrow(df)) + return view ? Base.view(df, 1:r, :) : df[1:r, :] +end """ last(df::AbstractDataFrame) @@ -580,14 +585,19 @@ Base.last(df::AbstractDataFrame) = df[nrow(df), :] last(df::AbstractDataFrame, n::Integer; view::Bool=false) Get a data frame with the `n` last rows of `df`. +Get all rows if `n` is greater than the number of rows in `df`. +Error if `n` is negative. If `view=false` a freshly allocated `DataFrame` is returned. If `view=true` then a `SubDataFrame` view into `df` is returned. $METADATA_FIXED """ -@inline Base.last(df::AbstractDataFrame, n::Integer; view::Bool=false) = - view ? Base.view(df, max(1, nrow(df)-n+1):nrow(df), :) : df[max(1, nrow(df)-n+1):nrow(df), :] +@inline function Base.last(df::AbstractDataFrame, n::Integer; view::Bool=false) + n < 0 && throw(ArgumentError("Number of elements must be nonnegative")) + r = max(1, nrow(df) - n + 1) + return view ? 
Base.view(df, r:nrow(df), :) : df[r:nrow(df), :] +end """ describe(df::AbstractDataFrame; cols=:) diff --git a/test/dataframe.jl b/test/dataframe.jl index fbc2ec0ca8..7efa1ca48a 100644 --- a/test/dataframe.jl +++ b/test/dataframe.jl @@ -1180,10 +1180,16 @@ end @test_throws BoundsError first(DataFrame(x=[])) @test_throws BoundsError last(DataFrame(x=[])) - @test first(df, 6) == DataFrame(A=1:6) - @test first(df, 1) == DataFrame(A=1) - @test last(df, 6) == DataFrame(A=5:10) - @test last(df, 1) == DataFrame(A=10) + for v in (true, false) + @test first(df, 6, view=v) == DataFrame(A=1:6) + @test first(df, 1, view=v) == DataFrame(A=1) + @test first(df, 0, view=v) == DataFrame(A=Int[]) + @test_throws ArgumentError first(df, -1, view=v) + @test last(df, 6, view=v) == DataFrame(A=5:10) + @test last(df, 1, view=v) == DataFrame(A=10) + @test last(df, 0, view=v) == DataFrame(A=Int[]) + @test_throws ArgumentError last(df, -1, view=v) + end @inferred first(df, 6) @inferred last(df, 6) From 1b8830dd6ec059a7af2bcc6d18e5b493696c85f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Thu, 14 Dec 2023 20:34:12 +0100 Subject: [PATCH 04/13] add example of using Tables.dictcolumntable (#3387) --- docs/src/man/basics.md | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/docs/src/man/basics.md b/docs/src/man/basics.md index 9ddede8cf5..d1962262b4 100644 --- a/docs/src/man/basics.md +++ b/docs/src/man/basics.md @@ -175,6 +175,40 @@ julia> DataFrame([(a=1, b=0), (a=2, b=0)]) 2 │ 2 0 ``` +Sometimes your source data might have a heterogeneous set of columns for each observation. 
+Here is an example: + +``` +julia> source = [(type="circle", radius=10), (type="square", side=20)] +2-element Vector{NamedTuple{names, Tuple{String, Int64}} where names}: + (type = "circle", radius = 10) + (type = "square", side = 20) +``` + +If you want to create a data frame from such data containing all columns present in at least +one of the source observations, with a `missing` entry if some column is not present, then +you can use the `Tables.dictcolumntable` function to help you create the desired data frame: + +``` +julia> DataFrame(Tables.dictcolumntable(source)) +2×3 DataFrame + Row │ type radius side + │ String Int64? Int64? +─────┼────────────────────────── + 1 │ circle 10 missing + 2 │ square missing 20 +``` + +The role of `Tables.dictcolumntable` is to make sure that the `DataFrame` constructor gets information +about all columns present in the source data and properly instantiates them. If we did not use +this function the `DataFrame` constructor would assume that the first row of data contains the set +of columns present in the source, which would lead to an error in our example: + +``` +julia> DataFrame(source) +ERROR: type NamedTuple has no field radius +``` + Let us finish our review of constructors by showing how to create a `DataFrame` from a matrix. In this case you pass a matrix as a first argument. If the second argument is just `:auto` then column names `x1`, `x2`, ... will be auto generated. 
From 3e290274d3c201e8bfe903f0d326e78c38fc0fef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Mon, 8 Jan 2024 10:23:32 +0100 Subject: [PATCH 05/13] Fix eachrow and eachcol indexing with CartesianIndex (#3413) --- NEWS.md | 3 +++ src/abstractdataframe/iteration.jl | 3 +++ test/iteration.jl | 4 ++++ 3 files changed, 10 insertions(+) diff --git a/NEWS.md b/NEWS.md index b5ad2ff692..a454a7793c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -26,6 +26,9 @@ * Correctly return `Bool[]` in the `nonunique` function applied to a data frame with a pulled column that has zero levels in the pool ([#3393](https://github.com/JuliaData/DataFrames.jl/pull/3393)) +* Correctly index `eachrow` and `eachcol` with `CartesianIndex` + ([#3413](https://github.com/JuliaData/DataFrames.jl/issues/3413)) + # DataFrames.jl v1.6.1 Release Notes diff --git a/src/abstractdataframe/iteration.jl b/src/abstractdataframe/iteration.jl index c81228fb10..6f6abc6208 100644 --- a/src/abstractdataframe/iteration.jl +++ b/src/abstractdataframe/iteration.jl @@ -81,6 +81,7 @@ Base.IndexStyle(::Type{<:DataFrameRows}) = Base.IndexLinear() Base.size(itr::DataFrameRows) = (size(parent(itr), 1), ) Base.@propagate_inbounds Base.getindex(itr::DataFrameRows, i::Int) = parent(itr)[i, :] +Base.@propagate_inbounds Base.getindex(itr::DataFrameRows, i::CartesianIndex{1}) = itr[i[1]] Base.@propagate_inbounds Base.getindex(itr::DataFrameRows, idx) = eachrow(@view parent(itr)[idx isa AbstractVector && !(eltype(idx) <: Bool) ? copy(idx) : idx, :]) @@ -263,6 +264,8 @@ Base.iterate(itr::DataFrameColumns, i::Integer=1) = i <= length(itr) ? 
(itr[i], i + 1) : nothing Base.@propagate_inbounds Base.getindex(itr::DataFrameColumns, idx::ColumnIndex) = parent(itr)[!, idx] +Base.@propagate_inbounds Base.getindex(itr::DataFrameColumns, idx::CartesianIndex{1}) = + itr[idx[1]] Base.@propagate_inbounds Base.getindex(itr::DataFrameColumns, idx::MultiColumnIndex) = eachcol(parent(itr)[!, idx]) Base.:(==)(itr1::DataFrameColumns, itr2::DataFrameColumns) = diff --git a/test/iteration.jl b/test/iteration.jl index 4c1b9d0d1d..249677a02c 100644 --- a/test/iteration.jl +++ b/test/iteration.jl @@ -15,6 +15,8 @@ using Test, DataFrames @test sprint(summary, eachrow(df)) == "2-element DataFrameRows" @test Base.IndexStyle(eachrow(df)) == IndexLinear() @test eachrow(df)[1] == DataFrameRow(df, 1, :) + @test eachrow(df)[CartesianIndex(1)] == DataFrameRow(df, 1, :) + @test_throws MethodError eachrow(df)[CartesianIndex(1, 1)] @test collect(eachrow(df)) isa Vector{<:DataFrameRow} @test eltype(eachrow(df)) <: DataFrameRow for row in eachrow(df) @@ -35,6 +37,8 @@ using Test, DataFrames @test_throws ArgumentError size(eachcol(df), 2) @test_throws ArgumentError size(eachcol(df), 0) @test eachcol(df)[1] == df[:, 1] + @test eachcol(df)[CartesianIndex(1)] == df[:, 1] + @test_throws MethodError eachcol(df)[CartesianIndex(1, 1)] @test eachcol(df)[:A] === df[!, :A] @test eachcol(df)[All()] == eachcol(df) @test eachcol(df)[Cols(:)] == eachcol(df) From d45c99aad368ec365e04a45edd2f4c0514d3440e Mon Sep 17 00:00:00 2001 From: Yuto Horikawa Date: Fri, 19 Jan 2024 18:11:15 +0900 Subject: [PATCH 06/13] Update for Documenter.jl v1 and Julia v1.10 (#3416) --- docs/Project.toml | 2 +- docs/make.jl | 8 ++++---- docs/src/man/getting_started.md | 4 ++-- src/abstractdataframe/iteration.jl | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/Project.toml b/docs/Project.toml index ebe348a76b..f6a9f940ec 100755 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -11,4 +11,4 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" Tables = 
"bd369af6-aec1-5ad0-b16a-f7cc5008161c" [compat] -Documenter = "0.27" +Documenter = "1" diff --git a/docs/make.jl b/docs/make.jl index fa64782dac..a5d5a4f4e3 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -16,7 +16,8 @@ makedocs( format = Documenter.HTML( canonical = "https://juliadata.github.io/DataFrames.jl/stable/", assets = ["assets/favicon.ico"], - edit_link = "main" + edit_link = "main", + size_threshold_ignore = ["man/basics.md", "lib/functions.md"], ), pages = Any[ "Introduction" => "index.md", @@ -42,11 +43,10 @@ makedocs( hide("Internals" => "lib/internals.md"), ] ], - strict = true ) -# Deploy built documentation from Travis. -# ======================================= +# Deploy built documentation. +# =========================== deploydocs( # options diff --git a/docs/src/man/getting_started.md b/docs/src/man/getting_started.md index 7e1194f565..b13b6b1efc 100644 --- a/docs/src/man/getting_started.md +++ b/docs/src/man/getting_started.md @@ -443,7 +443,7 @@ A particular common case of a collection that supports the a vector of `NamedTuple`s: ```jldoctest dataframe julia> v = [(a=1, b=2), (a=3, b=4)] -2-element Vector{NamedTuple{(:a, :b), Tuple{Int64, Int64}}}: +2-element Vector{@NamedTuple{a::Int64, b::Int64}}: (a = 1, b = 2) (a = 3, b = 4) @@ -460,7 +460,7 @@ You can also easily convert a data frame back to a vector of `NamedTuple`s: julia> using Tables julia> Tables.rowtable(df) -2-element Vector{NamedTuple{(:a, :b), Tuple{Int64, Int64}}}: +2-element Vector{@NamedTuple{a::Int64, b::Int64}}: (a = 1, b = 2) (a = 3, b = 4) ``` diff --git a/src/abstractdataframe/iteration.jl b/src/abstractdataframe/iteration.jl index 6f6abc6208..22589d3c87 100644 --- a/src/abstractdataframe/iteration.jl +++ b/src/abstractdataframe/iteration.jl @@ -57,7 +57,7 @@ julia> eachrow(df) 4 │ 4 14 julia> copy.(eachrow(df)) -4-element Vector{NamedTuple{(:x, :y), Tuple{Int64, Int64}}}: +4-element Vector{@NamedTuple{x::Int64, y::Int64}}: (x = 1, y = 11) (x = 2, y = 12) (x = 3, y = 13) 
From a107659e10c2684cc942a87ab4c5617c81da62fc Mon Sep 17 00:00:00 2001 From: Yuto Horikawa Date: Sun, 21 Jan 2024 07:29:30 +0900 Subject: [PATCH 07/13] Update docs on Juliacon (#3420) --- docs/src/index.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/src/index.md b/docs/src/index.md index 66ed6f3e5f..64a943a06f 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -12,12 +12,13 @@ other packages you can check-out the following resources * [Data Wrangling with DataFrames.jl Cheat Sheet](https://www.ahsmart.com/pub/data-wrangling-with-data-frames-jl-cheat-sheet/) * [DataFrames Tutorial using Jupyter Notebooks](https://github.com/bkamins/Julia-DataFrames-Tutorial/) * [Julia Academy DataFrames.jl tutorial](https://github.com/JuliaAcademy/DataFrames) -* [JuliaCon 2019](https://github.com/bkamins/JuliaCon2019-DataFrames-Tutorial), - [JuliaCon 2020](https://github.com/bkamins/JuliaCon2020-DataFrames-Tutorial), - [JuliaCon 2021](https://github.com/bkamins/JuliaCon2021-DataFrames-Tutorial), +* [JuliaCon 2023](https://github.com/bkamins/JuliaCon2023-Tutorial), [JuliaCon 2022](https://github.com/bkamins/JuliaCon2022-DataFrames-Tutorial), - [PyData Global 2020](https://github.com/bkamins/PyDataGlobal2020), - and [ODSC Europe 2021](https://github.com/bkamins/ODSC-EUROPE-2021) tutorials + [JuliaCon 2021](https://github.com/bkamins/JuliaCon2021-DataFrames-Tutorial), + [JuliaCon 2020](https://github.com/bkamins/JuliaCon2020-DataFrames-Tutorial), + [JuliaCon 2019](https://github.com/bkamins/JuliaCon2019-DataFrames-Tutorial), + [ODSC Europe 2021](https://github.com/bkamins/ODSC-EUROPE-2021) tutorials, + and [PyData Global 2020](https://github.com/bkamins/PyDataGlobal2020) * [DataFrames.jl showcase](https://github.com/bkamins/DataFrames-Showcase) If you prefer to learn DataFrames.jl from a book you can consider reading: From 1683da5e3d99b77f67939bafa8197e1f8334e180 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: 
Thu, 25 Jan 2024 19:24:51 +0100 Subject: [PATCH 08/13] Change big to BigInt calls (#3419) --- NEWS.md | 2 ++ src/abstractdataframe/abstractdataframe.jl | 2 +- src/dataframe/dataframe.jl | 3 +-- src/groupeddataframe/utils.jl | 2 +- src/join/core.jl | 4 ++-- 5 files changed, 7 insertions(+), 6 deletions(-) diff --git a/NEWS.md b/NEWS.md index a454a7793c..75c4cc5a14 100644 --- a/NEWS.md +++ b/NEWS.md @@ -28,6 +28,8 @@ ([#3393](https://github.com/JuliaData/DataFrames.jl/pull/3393)) * Correctly index `eachrow` and `eachcol` with `CartesianIndex` ([#3413](https://github.com/JuliaData/DataFrames.jl/issues/3413)) +* Correctly handle non-standard integers when converting them to `BigInt` + ([#3419](https://github.com/JuliaData/DataFrames.jl/issues/3419)) # DataFrames.jl v1.6.1 Release Notes diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 2b99ce4623..e8f4e32ed0 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -1486,7 +1486,7 @@ function fillcombinations(df::AbstractDataFrame, indexcols; end # make sure we do not overflow in the target data frame size - target_rows = Int(prod(x -> big(length(x)), uniquevals)) + target_rows = Int(prod(x -> BigInt(length(x)), uniquevals)) if iszero(target_rows) @assert iszero(nrow(df)) cdf = copy(df) diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl index 3f4afafecf..b9a496ba9e 100755 --- a/src/dataframe/dataframe.jl +++ b/src/dataframe/dataframe.jl @@ -1546,7 +1546,7 @@ function allcombinations(::Type{DataFrame}, pairs::Pair{Symbol, <:Any}...) 
@assert length(colvalues) == length(colnames) @assert all(x -> x isa AbstractVector, colvalues) - target_rows = Int(prod(x -> big(length(x)), colvalues)) + target_rows = Int(prod(x -> BigInt(length(x)), colvalues)) out_df = DataFrame() inner = 1 for (val, cname) in zip(colvalues, colnames) @@ -1563,4 +1563,3 @@ function allcombinations(::Type{DataFrame}, pairs::Pair{Symbol, <:Any}...) end _try_select_no_copy(df::DataFrame, cols) = select(df, cols, copycols=false) - diff --git a/src/groupeddataframe/utils.jl b/src/groupeddataframe/utils.jl index 0e4b1d9840..c73e0730c3 100644 --- a/src/groupeddataframe/utils.jl +++ b/src/groupeddataframe/utils.jl @@ -156,7 +156,7 @@ function refpool_and_array(x::AbstractArray) else minval, maxval = extrema(x) end - ngroups = big(maxval) - big(minval) + 1 + ngroups = BigInt(maxval) - BigInt(minval) + 1 # Threshold chosen with the same rationale as the row_group_slots! refpool method: # refpool approach is faster but we should not allocate too much memory either # We also have to avoid overflow, including with ngroups + 1 for missing values diff --git a/src/join/core.jl b/src/join/core.jl index 87d94d8fef..89f89c989d 100644 --- a/src/join/core.jl +++ b/src/join/core.jl @@ -328,7 +328,7 @@ function _innerjoin_unsorted_int(left::AbstractVector{<:Union{Integer, Missing}} right::AbstractVector{<:Union{Integer, Missing}}) minv, maxv = extrema_missing(right) - val_range = big(maxv) - big(minv) + val_range = BigInt(maxv) - BigInt(minv) if val_range > typemax(Int) - 3 || val_range ÷ 2 > max(64, length(right)) || minv < typemin(Int) + 2 || maxv > typemax(Int) - 3 return _innerjoin_unsorted(left, right) @@ -648,7 +648,7 @@ function _semijoin_unsorted_int(left::AbstractVector{<:Union{Integer, Missing}}, right_shorter::Bool) minv, maxv = extrema_missing(right) - val_range = big(maxv) - big(minv) + val_range = BigInt(maxv) - BigInt(minv) if val_range > typemax(Int) - 3 || val_range ÷ 2 > max(64, length(right)) || minv < typemin(Int) + 2 || maxv > 
typemax(Int) - 3 return _semijoin_unsorted(left, right, seen_rows, right_shorter) From d7f27e6695c147ca065901f607cdf26559bea602 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Thu, 25 Jan 2024 19:26:05 +0100 Subject: [PATCH 09/13] Import groupby from DataAPI, remove by and aggregate (#3422) --- NEWS.md | 5 +++++ Project.toml | 2 +- src/DataFrames.jl | 1 + src/deprecated.jl | 8 -------- test/deprecated.jl | 5 ----- 5 files changed, 7 insertions(+), 14 deletions(-) diff --git a/NEWS.md b/NEWS.md index 75c4cc5a14..3b74a91e27 100644 --- a/NEWS.md +++ b/NEWS.md @@ -31,6 +31,11 @@ * Correctly handle non-standard integers when converting them to `BigInt` ([#3419](https://github.com/JuliaData/DataFrames.jl/issues/3419)) +## Removed deprecations + +* The `by` and `aggregate` functions that were deprecated before 1.0 + release are now removed. + ([#3422](https://github.com/JuliaData/DataFrames.jl/issues/3422)) # DataFrames.jl v1.6.1 Release Notes diff --git a/Project.toml b/Project.toml index df31d9fb62..f76100bdfa 100644 --- a/Project.toml +++ b/Project.toml @@ -31,7 +31,7 @@ Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" CategoricalArrays = "0.10.0" Combinatorics = "1.0.2" Compat = "4.2" -DataAPI = "1.15.0" +DataAPI = "1.16.0" DataStructures = "0.18" DataValues = "0.4.13" InlineStrings = "1.3.0" diff --git a/src/DataFrames.jl b/src/DataFrames.jl index 57809cbdb9..debd309f5c 100644 --- a/src/DataFrames.jl +++ b/src/DataFrames.jl @@ -20,6 +20,7 @@ import DataAPI, DataAPI.Between, DataAPI.Cols, DataAPI.describe, + DataAPI.groupby, DataAPI.innerjoin, DataAPI.outerjoin, DataAPI.rightjoin, diff --git a/src/deprecated.jl b/src/deprecated.jl index 19839c54a6..c93bc70d5d 100644 --- a/src/deprecated.jl +++ b/src/deprecated.jl @@ -1,11 +1,3 @@ -export by, aggregate - -# TODO: remove definitions in 2.0 release -by(args...; kwargs...) = throw(ArgumentError("by function was removed from DataFrames.jl. 
" * - "Use the `combine(groupby(...), ...)` or `combine(f, groupby(...))` instead.")) -aggregate(args...; kwargs...) = throw(ArgumentError("aggregate function was removed from DataFrames.jl. " * - "Use the `combine` function instead.")) - # TODO: remove deprecation in 2.0 release import Base.delete! @deprecate delete!(df::DataFrame, inds) deleteat!(df::DataFrame, inds) \ No newline at end of file diff --git a/test/deprecated.jl b/test/deprecated.jl index beaba2770b..7a09015b7b 100644 --- a/test/deprecated.jl +++ b/test/deprecated.jl @@ -4,11 +4,6 @@ using Test, DataFrames, CategoricalArrays const ≅ = isequal -@testset "by and aggregate" begin - @test_throws ArgumentError by() - @test_throws ArgumentError aggregate() -end - @testset "indicator in joins" begin name = DataFrame(ID=[1, 2, 3], Name=["John Doe", "Jane Doe", "Joe Blogs"]) job = DataFrame(ID=[1, 2, 4], Job=["Lawyer", "Doctor", "Farmer"]) From 3b4fcd8fac6a7df395144a4eb1037ca72f9fcaf4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 5 Apr 2024 23:09:13 +0200 Subject: [PATCH 10/13] disambiguate allunique signature (#3434) --- NEWS.md | 6 ++++++ src/abstractdataframe/unique.jl | 5 +++++ test/dataframe.jl | 1 + 3 files changed, 12 insertions(+) diff --git a/NEWS.md b/NEWS.md index 3b74a91e27..22d2045f06 100644 --- a/NEWS.md +++ b/NEWS.md @@ -37,6 +37,12 @@ release are now removed. 
([#3422](https://github.com/JuliaData/DataFrames.jl/issues/3422)) +## Julia compatibility change + +* Ensure that `allunique(::AbstractDataFrame, ::Any)` always gets + interpreted as test for uniqueness of rows in the first positional argument + ([#3434](https://github.com/JuliaData/DataFrames.jl/issues/3434)) + # DataFrames.jl v1.6.1 Release Notes ## Bug fixes diff --git a/src/abstractdataframe/unique.jl b/src/abstractdataframe/unique.jl index 9441b825c0..9e91d6a533 100644 --- a/src/abstractdataframe/unique.jl +++ b/src/abstractdataframe/unique.jl @@ -207,6 +207,11 @@ function Base.allunique(df::AbstractDataFrame, cols=:) Val(false), nothing, false, nothing, true)[1] == nrow(df) end +# avoid invoking Base.allunique(f, iterator) introduced in Julia 1.11 + +Base.allunique(df::AbstractDataFrame, cols::Tuple) = + invoke(Base.allunique, Tuple{AbstractDataFrame, Any}, df, cols) + """ unique(df::AbstractDataFrame; view::Bool=false, keep::Symbol=:first) unique(df::AbstractDataFrame, cols; view::Bool=false, keep::Symbol=:first) diff --git a/test/dataframe.jl b/test/dataframe.jl index 7efa1ca48a..940590852e 100644 --- a/test/dataframe.jl +++ b/test/dataframe.jl @@ -2331,6 +2331,7 @@ end @test allunique(df, []) @test allunique(df, x -> 1:4) @test allunique(df, [:a, :b] => ByRow(string)) + @test_throws ArgumentError allunique(df, ()) end end From 8c1d98e56cd13ca5bfbbb6772fd438209d23f1a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 10 Apr 2024 20:07:22 +0200 Subject: [PATCH 11/13] do not pass empty vector to Tables.columntable (#3435) --- NEWS.md | 3 +++ src/abstractdataframe/selection.jl | 7 ++++++- test/select.jl | 18 ++++++++++++++++++ 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 22d2045f06..1d7fd30504 100644 --- a/NEWS.md +++ b/NEWS.md @@ -42,6 +42,9 @@ * Ensure that `allunique(::AbstractDataFrame, ::Any)` always gets interpreted as test for uniqueness of rows in the first positional argument 
([#3434](https://github.com/JuliaData/DataFrames.jl/issues/3434)) +* Make sure that an empty vector of `Any` or of `AbstractVector` is treated as having + no columns when a data frame is being processed with `combine`/`select`/`transform`. + ([#3435](https://github.com/JuliaData/DataFrames.jl/issues/3435)) # DataFrames.jl v1.6.1 Release Notes diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl index a90f1203d0..81dae2da95 100644 --- a/src/abstractdataframe/selection.jl +++ b/src/abstractdataframe/selection.jl @@ -822,7 +822,12 @@ function select_transform!((nc,)::Ref{Any}, df::AbstractDataFrame, newdf::DataFr res = newres elseif !(res isa Union{AbstractDataFrame, NamedTuple, DataFrameRow, AbstractMatrix, Tables.AbstractRow}) - res = Tables.columntable(res) + if res isa Union{AbstractVector{Any}, AbstractVector{<:AbstractVector}} + @assert isempty(res) + res = DataFrame() + else + res = Tables.columntable(res) + end end end diff --git a/test/select.jl b/test/select.jl index 67f97df2fa..3a8ad3b23b 100644 --- a/test/select.jl +++ b/test/select.jl @@ -3024,4 +3024,22 @@ end @test_throws ArgumentError combine(gdf, :x => (x -> x[1] == 2 ? 
"x" : cr) => AsTable) end +@testset "empty vector" begin + df = DataFrame(a=1:3) + + @test_throws ArgumentError select(df, :a => (x -> Vector{Any}[])) + + for T in (Vector{Any}, Any, NamedTuple{(:x,),Tuple{Int64}}) + v = combine(df, :a => (x -> T[])).a_function + @test isempty(v) + @test eltype(v) === T + end + + @test size(combine(df, :a => (x -> Vector{Any}[]) => AsTable)) == (0, 0) + @test size(combine(df, :a => (x -> Any[]) => AsTable)) == (0, 0) + df2 = combine(df, :a => (x -> NamedTuple{(:x,),Tuple{Int64}}[]) => AsTable) + @test size(df2) == (0, 1) + @test eltype(df2.x) === Int +end + end # module From fcf761241f26917f9abf17ac46b6e7ed07bf3835 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 19 Apr 2024 09:48:58 +0200 Subject: [PATCH 12/13] Explain the role of querying frameworks for DataFrames.jl (#3438) --- docs/src/man/querying_frameworks.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/src/man/querying_frameworks.md b/docs/src/man/querying_frameworks.md index 47799c5d52..6eadb2b8ec 100644 --- a/docs/src/man/querying_frameworks.md +++ b/docs/src/man/querying_frameworks.md @@ -5,6 +5,9 @@ DataFramesMeta.jl, DataFrameMacros.jl and Query.jl. They implement a functionali [dplyr](https://dplyr.tidyverse.org/) or [LINQ](https://en.wikipedia.org/wiki/Language_Integrated_Query). +These frameworks are designed both to make it easier for new users to start working with data frames in Julia +and to allow advanced users to write more compact code. 
+ ## DataFramesMeta.jl The [DataFramesMeta.jl](https://github.com/JuliaStats/DataFramesMeta.jl) package From 027650418ab08bbe6b94f2cf42743839aa7a593e Mon Sep 17 00:00:00 2001 From: Nathan Boyer <65452054+nathanrboyer@users.noreply.github.com> Date: Fri, 19 Apr 2024 03:50:21 -0400 Subject: [PATCH 13/13] Typo fix (#3439) --- docs/src/man/querying_frameworks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/man/querying_frameworks.md b/docs/src/man/querying_frameworks.md index 6eadb2b8ec..abda7ec6f4 100644 --- a/docs/src/man/querying_frameworks.md +++ b/docs/src/man/querying_frameworks.md @@ -33,7 +33,7 @@ pipe the output of one transformation as an input to another, as with Below we present several selected examples of usage of the package. First we subset rows of the source data frame using a logical condition -and select its two columns, renaming one of them: +and select two of its columns, renaming one of them: ```jldoctest dataframesmeta julia> using DataFramesMeta