diff --git a/.appveyor.yml b/.appveyor.yml deleted file mode 100644 index 8b86a93..0000000 --- a/.appveyor.yml +++ /dev/null @@ -1,30 +0,0 @@ -environment: - matrix: - - julia_version: 1.3 - - julia_version: nightly - -platform: - - x86 # 32-bit - - x64 # 64-bit - -branches: - only: - - master - - /release-.*/ - -notifications: - - provider: Email - on_build_success: false - on_build_failure: false - on_build_status_changed: false - -install: - - ps: iex ((new-object net.webclient).DownloadString("https://raw.githubusercontent.com/JuliaCI/Appveyor.jl/version-1/bin/install.ps1")) - -build_script: - - echo "%JL_BUILD_SCRIPT%" - - C:\julia\bin\julia -e "%JL_BUILD_SCRIPT%" - -test_script: - - echo "%JL_TEST_SCRIPT%" - - C:\julia\bin\julia -e "%JL_TEST_SCRIPT%" diff --git a/Project.toml b/Project.toml index 8ac709d..e58ad5a 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "Impute" uuid = "f7bf1975-0170-51b9-8c5f-a992d46b9575" authors = ["Invenia Technical Computing"] -version = "0.5.1" +version = "0.6.0" [deps] Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" diff --git a/src/Impute.jl b/src/Impute.jl index 7a10090..2d6e158 100644 --- a/src/Impute.jl +++ b/src/Impute.jl @@ -324,32 +324,4 @@ julia> Impute.srs(df; rng=MersenneTwister(1234), context=Context(; limit=1.0)) ``` """ srs -""" - svd!(data::AbstractMatrix; limit=1.0) - -Utility method for `impute!(data, :svd; limit=limit)` -""" -svd!(data::AbstractMatrix; limit=1.0) = impute!(data, :svd; limit=limit) - -""" - svd(data::AbstractMatrix; limit=1.0) - -Utility method for `impute(data, :svd; limit=limit)` -""" -svd(data::AbstractMatrix; limit=1.0) = impute(data, :svd; limit=limit) - -""" - knn!(data::AbstractMatrix; limit=1.0) - -Utility method for `impute!(data, :knn; limit=limit)` -""" -knn!(data::AbstractMatrix; limit=1.0) = impute!(data, :knn; limit=limit) - -""" - knn(data::AbstractMatrix; limit=1.0) - -Utility method for `impute(data, :knn; limit=limit)` -""" -knn(data::AbstractMatrix; limit=1.0) = impute(data, :knn; limit=limit) - end # module diff --git a/src/deprecated.jl b/src/deprecated.jl index 105c3b4..e69de29 100644 --- a/src/deprecated.jl +++ b/src/deprecated.jl @@ -1,186 +0,0 @@ -############################################################################### -# Deprecations for calling impute on an Imputor with a custom AbstractContext # -############################################################################### -Base.@deprecate( - impute(imp::Imputor, context::AbstractContext, data; kwargs...), - impute(data, typeof(imp)(; context=context, kwargs...)) -) - -Base.@deprecate( - impute!(imp::Imputor, context::AbstractContext, data; kwargs...), - impute!(data, typeof(imp)(; context=context, kwargs...)) -) - -Base.@deprecate impute(imp::Imputor, data) impute(data, imp) -Base.@deprecate impute!(imp::Imputor, data) impute!(data, imp) - -##################################################################### -# Deprecate all impute calls where the first argument is an Imputor # -##################################################################### -""" - impute!(data, method::Symbol=:interp, args...; limit::Float64=0.1) - -Looks up the `Imputor` type for the `method`, creates it and calls -`impute!(data, imputor::Imputor)` with it. - -# Arguments -* `data`: the datset containing missing elements we should impute. -* `method::Symbol`: the imputation method to use - (options: [`:drop`, `:fill`, `:interp`, `:locf`, `:nocb`]) -* `args::Any...`: any arguments you should pass to the `Imputor` constructor. -* `limit::Float64`: missing data ratio limit/threshold (default: 0.1) -""" -function impute!(data, method::Symbol, args...; limit::Float64=0.1) - Base.depwarn( - """ - impute!(data, method) is deprecated. - Please use Impute.method!(data) or impute!(data, imputor::Imputor). - """, - :impute! - ) - imputor_type = imputation_methods[method] - imputor = if length(args) > 0 - imputor_type(args...; context=Context(; limit=limit)) - else - imputor_type(; context=Context(; limit=limit)) - end - - return impute!(data, imputor) -end - -""" - impute!(data, missing::Function, method::Symbol=:interp, args...; limit::Float64=0.1) - -Creates the appropriate `Imputor` type and `Context` (using `missing` function) in order to call -`impute!(data, imputor::Imputor)` with them. - -# Arguments -* `data`: the datset containing missing elements we should impute. -* `missing::Function`: the missing data function to use -* `method::Symbol`: the imputation method to use - (options: [`:drop`, `:fill`, `:interp`, `:locf`, `:nocb`]) -* `args::Any...`: any arguments you should pass to the `Imputor` constructor. -* `limit::Float64`: missing data ratio limit/threshold (default: 0.1) -""" -function impute!(data, missing::Function, method::Symbol, args...; limit::Float64=0.1) - Base.depwarn( - """ - impute!(data, missing, method) is deprecated. Please use impute!(data, imputor::Imputor). - """, - :impute! - ) - imputor_type = imputation_methods[method] - imputor = if length(args) > 0 - imputor_type(args...; context=Context(; is_missing=missing, limit=limit)) - else - imputor_type(; context=Context(; is_missing=missing, limit=limit)) - end - - return impute!(data, imputor) -end - -""" - impute(data, args...; kwargs...) - -Copies the `data` before calling `impute!(new_data, args...; kwargs...)` -""" -function impute(data, args...; kwargs...) - Base.depwarn( - """ - impute(data, args...; kwargs...) is deprecated. - Please use Impute.method(data) or impute(data, imputor::Imputor). - """, - :impute - ) - # Call `deepcopy` because we can trust that it's available for all types. - return impute!(deepcopy(data), args...; kwargs...) -end - -################################# -# Deprecate the chain functions # -################################# -""" - chain!(data, missing::Function, imputors::Imputor...; kwargs...) - -Creates a `Chain` with `imputors` and calls `impute!(imputor, missing, data; kwargs...)` -""" -function chain!(data, missing::Function, imputors::Imputor...; kwargs...) - Base.depwarn( - """ - chain!(data, missing, imputors...) is deprecated. - Please use data = imp1(data) |> imp2 |> imp3 - """, - :chain! - ) - return chain!(data, imputors...; is_missing=missing, kwargs...) -end - -""" - chain!(data, imputors::Imputor...; kwargs...) - -Creates a `Chain` with `imputors` and calls `impute!(data, imputor)` -""" -function chain!(data, imputors::Imputor...; kwargs...) - Base.depwarn( - """ - chain!(data, imputors...) is deprecated. - Please use data = imp1(data) |> imp2 |> imp3 - """, - :chain! - ) - ctx = Context(; kwargs...) - - for imputor in imputors - imp = typeof(imputor)( - (isa(x, AbstractContext) ? ctx : x for x in fieldvalues(imputor))... - ) - data = impute!(data, imp) - end - - return data -end - -""" - chain(data, args...; kwargs...) - -Copies the `data` before calling `chain!(data, args...; kwargs...)` -""" -function chain(data, args...; kwargs...) - Base.depwarn( - """ - chain(data, args...) is deprecated. - Please use result = imp1(data) |> imp2 |> imp3 - """, - :chain - ) - # Call `deepcopy` because we can trust that it's available for all types. - return chain!(deepcopy(data), args...; kwargs...) -end - -##################### -# Misc Deprecations # -##################### -Base.@deprecate Fill(val; kwargs...) Fill(; value=val, kwargs...) -Base.@deprecate_binding Drop DropObs false - -# This function is just used to support legacy behaviour and should be removed in a -# future release when we dropping accepting the limit kwarg to impute functions. -function _extract_context_kwargs(kwargs...) - d = Dict{Symbol, Any}(kwargs...) - limit = 1.0 - - if haskey(d, :limit) - limit = d[:limit] - @warn( - "Passing `limit` directly to impute functions is deprecated. " * - "Please pass `context=Context(; limit=$limit)` in the future." - ) - delete!(d, :limit) - end - - if !haskey(d, :context) - d[:context] = Context(; limit=limit) - end - - return d -end diff --git a/src/imputors.jl b/src/imputors.jl index 78dafa3..6beb1e0 100644 --- a/src/imputors.jl +++ b/src/imputors.jl @@ -54,12 +54,12 @@ end # Some utility methods for constructing imputors and imputing data in 1 call. # NOTE: This is only intended for internal use and is not part of the public API. function _impute(data, t::Type{T}, kwargs...) where T <: Imputor - imp, rem = splitkwargs(t, _extract_context_kwargs(kwargs...)...) + imp, rem = splitkwargs(t, kwargs...) return impute(data, imp; rem...) end function _impute!(data, t::Type{T}, kwargs...) where T <: Imputor - imp, rem = splitkwargs(t, _extract_context_kwargs(kwargs...)...) + imp, rem = splitkwargs(t, kwargs...) return impute!(data, imp; rem...) end diff --git a/test/deprecated.jl b/test/deprecated.jl index 961d86e..e69de29 100644 --- a/test/deprecated.jl +++ b/test/deprecated.jl @@ -1,174 +0,0 @@ -@testset "deprecated" begin - a = allowmissing(1.0:1.0:20.0) - a[[2, 3, 7]] .= missing - mask = map(!ismissing, a) - - @testset "Drop" begin - result = impute(a, :drop; limit=0.2) - expected = copy(a) - deleteat!(expected, [2, 3, 7]) - - @test result == expected - - # Mutating method - a2 = copy(a) - Impute.drop!(a2; limit=0.2) - @test a2 == expected - end - - @testset "Interpolate" begin - result = impute(a, :interp; limit=0.2) - @test result == collect(1.0:1.0:20) - @test result == interp(a) - - # Test in-place method - a2 = copy(a) - Impute.interp!(a2; limit=0.2) - @test a2 == result - - # Test interpolation between identical points - b = ones(Union{Float64, Missing}, 20) - b[[2, 3, 7]] .= missing - @test interp(b) == ones(Union{Float64, Missing}, 20) - - # Test interpolation at endpoints - b = ones(Union{Float64, Missing}, 20) - b[[1, 3, 20]] .= missing - result = interp(b) - @test ismissing(result[1]) - @test ismissing(result[20]) - end - - @testset "Fill" begin - @testset "Value" begin - fill_val = -1.0 - result = impute(a, :fill, fill_val; limit=0.2) - expected = copy(a) - expected[[2, 3, 7]] .= fill_val - - @test result == expected - end - - @testset "Mean" begin - result = impute(a, :fill; limit=0.2) - expected = copy(a) - expected[[2, 3, 7]] .= mean(a[mask]) - - @test result == expected - - a2 = copy(a) - Impute.fill!(a2; limit=0.2) - @test a2 == result - end - end - - @testset "LOCF" begin - result = impute(a, :locf; limit=0.2) - expected = copy(a) - expected[2] = 1.0 - expected[3] = 1.0 - expected[7] = 6.0 - - @test result == expected - a2 = copy(a) - impute!(a2, :locf; limit=0.2) - @test a2 == result - end - - @testset "NOCB" begin - result = impute(a, :nocb; limit=0.2) - expected = copy(a) - expected[2] = 4.0 - expected[3] = 4.0 - expected[7] = 8.0 - - @test result == expected - a2 = copy(a) - Impute.nocb!(a2; limit=0.2) - @test a2 == result - end - - @testset "DataFrame" begin - data = dataset("boot", "neuro") - df = impute(data, :interp; limit=1.0) - end - - @testset "Matrix" begin - data = Matrix(dataset("boot", "neuro")) - - @testset "Drop" begin - result = Iterators.drop(data) - @test size(result, 1) == 4 - end - - @testset "Fill" begin - result = impute(data, :fill, 0.0; limit=1.0) - @test size(result) == size(data) - end - end - - @testset "Not enough data" begin - @test_throws ImputeError impute(a, :drop) - end - - @testset "Chain" begin - orig = dataset("boot", "neuro") - - @testset "DataFrame" begin - result = chain( - orig, - Impute.Interpolate(), - Impute.LOCF(), - Impute.NOCB(); - limit=1.0 - ) - - @test size(result) == size(orig) - # Confirm that we don't have any more missing values - @test all(!ismissing, Matrix(result)) - end - - @testset "Column Table" begin - data = Tables.columntable(orig) - result = chain( - data, - Impute.Interpolate(), - Impute.LOCF(), - Impute.NOCB(); - limit=1.0 - ) |> Tables.matrix - - @test size(result) == size(orig) - # Confirm that we don't have any more missing values - @test all(!ismissing, result) - end - - @testset "Matrix" begin - data = Matrix(orig) - result = chain( - data, - Impute.Interpolate(), - Impute.LOCF(), - Impute.NOCB(); - limit=1.0 - ) - - @test size(result) == size(data) - # Confirm that we don't have any more missing values - @test all(!ismissing, result) - end - end - - @testset "Alternate missing functions" begin - data1 = dataset("boot", "neuro") # Missing values with `missing` - data2 = impute(data1, :fill, NaN; limit=1.0) # Missing values with `NaN` - - @test impute(data1, :drop; limit=1.0) == dropmissing(data1) - - result1 = chain(data1, Impute.Interpolate(), Impute.Drop(); limit=1.0) - result2 = chain(data2, isnan, Impute.Interpolate(), Impute.Drop(); limit=1.0) - @test result1 == result2 - - @test Impute.drop(data1; limit=1.0) == impute(data2, isnan, :drop; limit=1.0) - end -end diff --git a/test/runtests.jl b/test/runtests.jl index 5f2f8bb..4fbc049 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -17,7 +17,6 @@ using Impute using Impute: Impute, Imputor, - Drop, DropObs, DropVars, Interpolate, @@ -28,6 +27,8 @@ using Impute: Context, WeightedContext, ImputeError, + impute, + impute!, interp, chain @@ -414,19 +415,18 @@ end @test result == result3 @testset "GroupedDataFrame" begin - hod = repeat(1:24, 12 * 10) - obj = repeat(1:12, 24 * 10) - n = length(hod) - - df = DataFrame( - :hod => hod, - :obj => obj, - :val => allowmissing( - [sin(x) * cos(y) for (x, y) in zip(hod, obj)] - ), - ) - - df.val[rand(1:n, 20)] .= missing + T = NamedTuple{(:hod, :obj, :val), Tuple{Int, Int, Union{Float64, Missing}}} + + df = map(Iterators.product(1:24, 1:8, 0:19)) do t + hod, obj, x = t + # Deterministically return some `missing`s per hod/obj pair + return if x in (0, 5, 12, 19) + T((hod, obj, missing)) + else + T((hod, obj, sin(hod) * cos(x) + obj)) + end + end |> DataFrame + gdf1 = groupby(deepcopy(df), [:hod, :obj]) gdf2 = groupby(df, [:hod, :obj]) @@ -434,13 +434,19 @@ end f2 = Impute.interp!(; context=ctx) ∘ Impute.locf!() ∘ Impute.nocb!() result = mapreduce(f1, vcat, gdf1) + # Check that the result isn't the same as the source dataframe @test df != result - @test size(result) == (24 * 12 * 10, 3) + # Check that the size is still the same since we didn't drop any rows + @test size(result) == size(df) + # Check that there are no remaining missing values @test all(!ismissing, Tables.matrix(result)) + # Double check that our source dataframe still contains missings + @test any(ismissing, Tables.matrix(df)) # Test that we can also mutate the dataframe directly map(f2, gdf2) - @test result == sort(df, (:hod, :obj)) + # Now we can check that we've replaced all the missing values in df + @test all(!ismissing, Tables.matrix(df)) end end @@ -706,7 +712,7 @@ end X = add_missings(data') svd_imputed = Impute.svd(X) - mean_imputed = impute(copy(X), :fill; limit=1.0) + mean_imputed = Impute.fill(copy(X)) # With sufficient correlation between the variables and enough observation we # expect the svd imputation to perform severl times better than mean imputation. @@ -720,7 +726,7 @@ end X = add_missings(data) svd_imputed = Impute.svd(X) - mean_imputed = impute(copy(X), :fill; limit=1.0) + mean_imputed = Impute.fill(copy(X)) # If we don't have enough variables then SVD imputation will probably perform # about as well as mean imputation. @@ -733,7 +739,7 @@ end X = add_missings(data) svd_imputed = Impute.svd(X) - mean_imputed = impute(copy(X), :fill; limit=1.0) + mean_imputed = Impute.fill(copy(X)) # If most of the variance in the original data can't be explained by a small # subset of the eigen values in the svd decomposition then our low rank approximations diff --git a/test/testutils.jl b/test/testutils.jl index bf5dcd1..f0d8350 100644 --- a/test/testutils.jl +++ b/test/testutils.jl @@ -231,19 +231,19 @@ end function test_groupby(tester::ImputorTester) @testset "GroupBy" begin - hod = repeat(1:24, 12 * 10) - obj = repeat(1:12, 24 * 10) - n = length(hod) - - df = DataFrame( - :hod => hod, - :obj => obj, - :val => allowmissing( - [sin(x) * cos(y) for (x, y) in zip(hod, obj)] - ), - ) + T = NamedTuple{(:hod, :obj, :val), Tuple{Int, Int, Union{Float64, Missing}}} + + rows = map(Iterators.product(1:24, 1:8, 0:19)) do t + hod, obj, x = t + # Deterministically return some `missing`s per hod/obj pair + return if x in (0, 5, 12, 19) + T((hod, obj, missing)) + else + T((hod, obj, sin(hod) * cos(x) + obj)) + end + end - df.val[rand(1:n, 20)] .= missing + df = DataFrame(rows) # Deleting variables in a groupby doesn't really make sense if tester.imp != DropVars @@ -251,12 +251,17 @@ function test_groupby(tester::ImputorTester) @test !isequal(df, result) if tester.imp == DropObs - @test size(result) == (24 * 12 * 10 - 20, 3) + # If we've dropped some observations then we should get back + # all, but the 4 missing observations per 24 hods and 8 objs. + @test size(result) == (24 * 8 * 16, 3) else - @test size(result) == (24 * 12 * 10, 3) + @test size(result) == size(df) end - @test count(ismissing, Tables.matrix(result)) < 20 + # Test that we successfully imputed something. + # We expect LOCF and NOCB to leave `missing`s at the start and end of each + # group respectively. + @test count(ismissing, Tables.matrix(result)) < count(ismissing, df.val) @test isequal( mapreduce(tester.f!, vcat, groupby(deepcopy(df), [:hod, :obj])), result