diff --git a/NEWS.md b/NEWS.md index 14b1f1fd1..f6d51b1f3 100644 --- a/NEWS.md +++ b/NEWS.md @@ -20,6 +20,9 @@ instead of using the interactive thread pool when Julia was started with `-tM,N` with N > 0 ([#3385](https://github.com/JuliaData/DataFrames.jl/pull/3385)) +* Correctly return `Bool[]` in the `nonunique` function applied to a data frame + with a pooled column that has zero levels in the pool + ([#3393](https://github.com/JuliaData/DataFrames.jl/pull/3393)) # DataFrames.jl v1.6.1 Release Notes diff --git a/src/abstractdataframe/unique.jl b/src/abstractdataframe/unique.jl index fd94caac6..9441b825c 100644 --- a/src/abstractdataframe/unique.jl +++ b/src/abstractdataframe/unique.jl @@ -87,7 +87,7 @@ function nonunique(df::AbstractDataFrame; keep::Symbol=:first) if !(keep in (:first, :last, :noduplicates)) throw(ArgumentError("`keep` must be :first, :last, or :noduplicates")) end - ncol(df) == 0 && return Bool[] + nrow(df) == 0 && return Bool[] res = fill(true, nrow(df)) cols = ntuple(i -> df[!, i], ncol(df)) if keep == :first diff --git a/src/groupeddataframe/utils.jl b/src/groupeddataframe/utils.jl index 1a58b7ec4..0e4b1d984 100644 --- a/src/groupeddataframe/utils.jl +++ b/src/groupeddataframe/utils.jl @@ -337,7 +337,11 @@ function row_group_slots!(cols::NTuple{N, AbstractVector}, nt = max(1, lg ÷ 100_000) end # if there are few rows per group limit the number of threads used - nt = clamp(round(Int, (lg / 4) / ngroups - 2), 1, nt) + if ngroups == 0 + nt = 1 + else + nt = clamp(round(Int, (lg / 4) / ngroups - 2), 1, nt) + end seen = fill(false, ngroups) seen_vec = Vector{Vector{Bool}}(undef, nt) diff --git a/test/duplicates.jl b/test/duplicates.jl index 61c01874d..d8c264962 100644 --- a/test/duplicates.jl +++ b/test/duplicates.jl @@ -1,6 +1,6 @@ module TestDuplicates -using Test, DataFrames, CategoricalArrays, Random +using Test, DataFrames, CategoricalArrays, Random, PooledArrays const ≅ = isequal @testset "nonunique" begin @@ -30,8 +30,8 @@ const ≅ = isequal 
@test_throws ArgumentError unique!(df) @test_throws ArgumentError unique(df, true) - pdf = view(DataFrame(a=CategoricalArray(["a", "a", missing, missing, "b", missing, "a", missing]), - b=CategoricalArray(["a", "b", missing, missing, "b", "a", "a", "a"])), :, :) + pdf = view(DataFrame(a=CategoricalArray(["a", "a", missing, missing, "b", missing, "a", missing]), + b=CategoricalArray(["a", "b", missing, missing, "b", "a", "a", "a"])), :, :) updf = DataFrame(a=CategoricalArray(["a", "a", missing, "b", missing]), b=CategoricalArray(["a", "b", missing, "b", "a"])) @test nonunique(pdf) == [false, false, false, true, false, false, true, true] @@ -39,6 +39,9 @@ const ≅ = isequal @test updf ≅ unique(pdf) @test_throws ArgumentError unique!(pdf) @test_throws ArgumentError unique(pdf, true) + + @test isempty(nonunique(DataFrame(a=PooledArray(Int[])))) + @test typeof(nonunique(DataFrame(a=PooledArray(Int[])))) === Vector{Bool} end @testset "nonunique, nonunique, unique! with extra argument" begin diff --git a/test/grouping.jl b/test/grouping.jl index df9e79bd3..a1283c656 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -4531,4 +4531,9 @@ end end end +@testset "no levels in pooled grouping bug #3393" begin + @test isempty(groupby_checked(DataFrame(x=PooledArray([missing])), :x, skipmissing=true)) + @test isempty(groupby_checked(DataFrame(x=categorical([missing])), :x, skipmissing=true)) +end + end # module