From a5e9c32e7cbe76b3bc3bf580a645ef987a0a1326 Mon Sep 17 00:00:00 2001 From: Josh Day Date: Wed, 12 Dec 2018 13:15:09 -0500 Subject: [PATCH] Tables Integration and transition to Union{T,Missing} - Fallback table constructor now uses Tables - Replaces DataValue with Union{T,Missing}, DataValueArray with Array{Union{T,Missing}} - dropna -> dropmissing - Added special selector Type, e.g. select(t, String) --- NEWS.md | 6 +++ README.md | 61 ++++++++++++++-------------- REQUIRE | 2 +- src/IndexedTables.jl | 19 +++++---- src/collect.jl | 7 +--- src/columns.jl | 36 ++++------------- src/{table.jl => indexedtable.jl} | 30 +++++--------- src/indexing.jl | 1 - src/join.jl | 48 +++++++++------------- src/ndsparse.jl | 4 +- src/reshape.jl | 25 ++++++------ src/selection.jl | 58 ++++++++++++++------------ src/tables.jl | 30 ++++++++++++++ src/tabletraits.jl | 16 ++------ src/utils.jl | 7 +--- test/runtests.jl | 19 ++++++--- test/test_collect.jl | 12 +++--- test/test_core.jl | 67 ++++++++++++++++++------------- test/test_join.jl | 31 ++++++++++++++ test/test_missing.jl | 13 ++++++ test/test_tables.jl | 14 +++++++ test/test_tabletraits.jl | 4 -- test/test_utils.jl | 2 - 23 files changed, 281 insertions(+), 231 deletions(-) rename src/{table.jl => indexedtable.jl} (96%) create mode 100644 src/tables.jl create mode 100644 test/test_join.jl create mode 100644 test/test_missing.jl create mode 100644 test/test_tables.jl diff --git a/NEWS.md b/NEWS.md index 6cde9f25..c254a252 100644 --- a/NEWS.md +++ b/NEWS.md @@ -12,3 +12,9 @@ - **(feature)** - `collect_columns` function to collect an iterator of tuples to `Columns` object. (#135) - **(bugfix)** use `collect_columns` to implement `map`, `groupreduce` and `groupjoin` (#150) to not depend on type inference. Works in many more cases. - **(feature)** - `view` works with logical indexes now (#134) + + +## v0.9.0 + +- **(breaking)** Switch from DataValues to Missing. Related: `dropna` has been changed to `dropmissing`. +- **(breaking)** Depend on OnlineStatsBase rather than OnlineStats. diff --git a/README.md b/README.md index 6392afa8..3b91aad1 100644 --- a/README.md +++ b/README.md @@ -10,8 +10,28 @@ be used on its own for efficient in-memory data processing and analytics. ## Data Structures -- **The two table types in IndexedTables differ in how data is accessed.** -- **There is no performance difference between table types for operations such as selecting, filtering, and map/reduce.** +IndexedTables offers two data structures: `IndexedTable` and `NDSparse`. + +- **Both types store data _in columns_**. +- **`IndexedTable` and `NDSparse` differ mainly in how data is accessed.** +- **Both types have equal performance for Table operations (`select`, `filter`, etc.).** + + +## Quickstart + +``` +using Pkg +Pkg.add("IndexedTables") +using IndexedTables + +t = table((x = 1:100, y = randn(100))) + +select(t, :x) + +filter(row -> row.y > 0, t) +``` + +## `IndexedTable` vs. `NDSparse` First let's create some data to work with. @@ -22,18 +42,18 @@ city = vcat(fill("New York", 3), fill("Boston", 3)) dates = repeat(Date(2016,7,6):Day(1):Date(2016,7,8), 2) -values = [91, 89, 91, 95, 83, 76] +vals = [91, 89, 91, 95, 83, 76] ``` -### Table +### IndexedTable -- Data is accessed as a Vector of NamedTuples. -- Sorted by primary key(s), `pkey`. +- (Optionally) Sorted by primary key(s), `pkey`. +- Data is accessed as a Vector of NamedTuples. ```julia using IndexedTables -julia> t1 = table((city = city, dates = dates, values = values); pkey = [:city, :dates]) +julia> t1 = table((city = city, dates = dates, values = vals); pkey = [:city, :dates]) Table with 6 rows, 3 columns: city dates values ────────────────────────────── @@ -46,18 +66,15 @@ city dates values julia> t1[1] (city = "Boston", dates = 2016-07-06, values = 95) - -julia> first(t1) -(city = "Boston", dates = 2016-07-06, values = 95) ``` ### NDSparse -- Data is accessed as an N-dimensional sparse array with arbitrary indexes. - Sorted by index variables (first argument). +- Data is accessed as an N-dimensional sparse array with arbitrary indexes. ```julia -julia> t2 = ndsparse(@NT(city=city, dates=dates), @NT(value=values)) +julia> t2 = ndsparse((city=city, dates=dates), (value=vals,)) 2-d NDSparse with 6 values (1 field named tuples): city dates │ value ───────────────────────┼────── @@ -70,26 +87,8 @@ city dates │ value julia> t2["Boston", Date(2016, 7, 6)] (value = 95) - -julia> first(t2) -(value = 95) -``` - -As with other multi-dimensional arrays, dimensions can be permuted to change the sort order: - -```julia -julia> permutedims(t2, [2,1]) -2-d NDSparse with 6 values (1 field named tuples): -dates city │ value -───────────────────────┼────── -2016-07-06 "Boston" │ 95 -2016-07-06 "New York" │ 91 -2016-07-07 "Boston" │ 83 -2016-07-07 "New York" │ 89 -2016-07-08 "Boston" │ 76 -2016-07-08 "New York" │ 91 ``` ## Get started -For more information, check out the [JuliaDB API Reference](http://juliadb.org/latest/api/datastructures.html). +For more information, check out the [JuliaDB Documentation](http://juliadb.org/latest/index.html). diff --git a/REQUIRE b/REQUIRE index f6ce24fd..8bf6de22 100644 --- a/REQUIRE +++ b/REQUIRE @@ -5,4 +5,4 @@ WeakRefStrings 0.4.4 TableTraits 0.3.0 TableTraitsUtils 0.2.0 IteratorInterfaceExtensions 0.1.0 -DataValues +Tables \ No newline at end of file diff --git a/src/IndexedTables.jl b/src/IndexedTables.jl index 3081e067..9cbee999 100644 --- a/src/IndexedTables.jl +++ b/src/IndexedTables.jl @@ -4,13 +4,15 @@ using PooledArrays, SparseArrays, Statistics, WeakRefStrings, TableTraits, TableTraitsUtils, IteratorInterfaceExtensions using OnlineStatsBase: OnlineStat, fit! -using DataValues: DataValues, DataValue, NA, isna, DataValueArray -import DataValues: dropna +import Tables import Base: show, eltype, length, getindex, setindex!, ndims, map, convert, keys, values, ==, broadcast, empty!, copy, similar, sum, merge, merge!, mapslices, - permutedims, sort, sort!, iterate, pairs + permutedims, sort, sort!, iterate, pairs, reduce, push!, size, permute!, issorted, + sortperm, summary, resize!, vcat, append!, copyto!, view, tail, + tuple_type_cons, tuple_type_head, tuple_type_tail, in, convert + #-----------------------------------------------------------------------# exports export @@ -20,20 +22,20 @@ export AbstractNDSparse, All, ApplyColwise, Between, ColDict, Columns, IndexedTable, Keys, NDSparse, NextTable, Not, # functions - aggregate, aggregate!, aggregate_vec, antijoin, asofjoin, collect_columns, colnames, - column, columns, convertdim, dimlabels, dropna, flatten, flush!, groupby, groupjoin, + aggregate!, antijoin, asofjoin, collect_columns, colnames, + column, columns, convertdim, dimlabels, flatten, flush!, groupby, groupjoin, groupreduce, innerjoin, insertafter!, insertbefore!, insertcol, insertcolafter, insertcolbefore, leftgroupjoin, leftjoin, map_rows, naturalgroupjoin, naturaljoin, ncols, ndsparse, outergroupjoin, outerjoin, pkeynames, pkeys, popcol, pushcol, reducedim_vec, reindex, renamecol, rows, select, selectkeys, selectvalues, setcol, - stack, summarize, table, unstack, update!, where + stack, summarize, table, unstack, update!, where, dropmissing, dropna const Tup = Union{Tuple,NamedTuple} const DimName = Union{Int,Symbol} include("utils.jl") include("columns.jl") -include("table.jl") +include("indexedtable.jl") include("ndsparse.jl") include("collect.jl") @@ -73,7 +75,8 @@ include("flatten.jl") include("join.jl") include("reshape.jl") -# TableTraits.jl integration +# TableTraits/Tables integration include("tabletraits.jl") +include("tables.jl") end # module diff --git a/src/collect.jl b/src/collect.jl index 13f0377a..47166287 100644 --- a/src/collect.jl +++ b/src/collect.jl @@ -1,8 +1,5 @@ _is_subtype(::Type{S}, ::Type{T}) where {S, T} = promote_type(S, T) == T -dataarrayof(::Type{<:DataValue{T}}, len) where {T} = DataValueArray{T,1}(len) -dataarrayof(::Type{T}, len) where {T} = Vector{T}(undef, len) - """ collect_columns(itr) @@ -166,7 +163,7 @@ function widencolumns(dest, i, el::S, ::Type{T}) where{S <: Tup, T<:Tup} idx = findall(collect(!(s <: t) for (s, t) in zip(sp, tp))) new = dest for l in idx - newcol = dataarrayof(promote_type(sp[l], tp[l]), length(dest)) + newcol = Vector{promote_type(sp[l], tp[l])}(undef, length(dest)) copyto!(newcol, 1, column(dest, l), 1, i-1) new = setcol(new, l, newcol) end @@ -175,7 +172,7 @@ function widencolumns(dest, i, el::S, ::Type{T}) where{S <: Tup, T<:Tup} end function widencolumns(dest, i, el::S, ::Type{T}) where{S, T} - new = dataarrayof(promote_type(S, T), length(dest)) + new = Vector{promote_type(S, T)}(undef, length(dest)) copyto!(new, 1, dest, 1, i-1) new end diff --git a/src/columns.jl b/src/columns.jl index ee5365f2..9b443671 100644 --- a/src/columns.jl +++ b/src/columns.jl @@ -1,7 +1,3 @@ -import Base: - push!, size, sort, sort!, permute!, issorted, sortperm, - summary, resize!, vcat, append!, copyto!, view - """ Wrapper around a (named) tuple of Vectors that acts like a Vector of (named) tuples. @@ -97,7 +93,6 @@ available selection options and syntax. """ function columns end -columns(c) = error("no columns defined for $(typeof(c))") columns(c::Columns) = c.columns # Array-like API @@ -110,17 +105,14 @@ length(c::Columns{<:Pair, <:Pair}) = length(c.columns.first) ndims(c::Columns) = 1 """ -`ncols(itr)` + ncols(itr) Returns the number of columns in `itr`. # Examples - ncols([1,2,3]) - ncols(rows(([1,2,3],[4,5,6]))) - ncols(table(([1,2,3],[4,5,6]))) - ncols(table(@NT(x=[1,2,3],y=[4,5,6]))) - ncols(ndsparse(d, [7,8,9])) + ncols([1,2,3]) == 1 + ncols(rows(([1,2,3],[4,5,6]))) == 2 """ function ncols end ncols(c::Columns) = fieldcount(typeof(c.columns)) @@ -184,21 +176,7 @@ resize!(I::Columns, n::Int) = (foreach(c->resize!(c,n), I.columns); I) _sizehint!(c::Columns, n::Integer) = (foreach(c->_sizehint!(c,n), c.columns); c) -function ==(x::Columns, y::Columns) - nc = length(x.columns) - length(y.columns) == nc || return false - fieldnames(eltype(x)) == fieldnames(eltype(y)) || return false - n = length(x) - length(y) == n || return false - for i in 1:nc - x.columns[i] == y.columns[i] || return false - end - return true -end - -==(x::Columns{<:Pair}, y::Columns) = false -==(x::Columns, y::Columns{<:Pair}) = false -==(x::Columns{<:Pair}, y::Columns{<:Pair}) = (x.columns.first == y.columns.first) && (x.columns.second == y.columns.second) +==(x::Columns, y::Columns) = x.columns == y.columns function _strip_pair(c::Columns{<:Pair}) f, s = map(columns, c.columns) @@ -368,7 +346,7 @@ end # map """ -`map_rows(f, c...)` + map_rows(f, c...) Transform collection `c` by applying `f` to each element. For multiple collection arguments, apply `f` elementwise. Collect output as `Columns` if `f` returns @@ -449,7 +427,7 @@ struct Between{T1 <: Union{Int, Symbol}, T2 <: Union{Int, Symbol}} last::T2 end -const SpecialSelector = Union{Not, All, Keys, Between, Function, Regex} +const SpecialSelector = Union{Not, All, Keys, Between, Function, Regex, Type} hascolumns(t, s) = true hascolumns(t, s::Symbol) = s in colnames(t) @@ -458,6 +436,7 @@ hascolumns(t, s::Tuple) = all(hascolumns(t, x) for x in s) hascolumns(t, s::Not) = hascolumns(t, s.cols) hascolumns(t, s::Between) = hascolumns(t, s.first) && hascolumns(t, s.last) hascolumns(t, s::All) = all(hascolumns(t, x) for x in s.cols) +hascolumns(t, s::Type) = any(x -> eltype(x) <: s, columns(t)) lowerselection(t, s) = s lowerselection(t, s::Union{Int, Symbol}) = colindex(t, s) @@ -467,6 +446,7 @@ lowerselection(t, s::Keys) = lowerselection(t, IndexedTables.pkeyn lowerselection(t, s::Between) = Tuple(colindex(t, s.first):colindex(t, s.last)) lowerselection(t, s::Function) = colindex(t, Tuple(filter(s, collect(colnames(t))))) lowerselection(t, s::Regex) = lowerselection(t, x -> occursin(s, string(x))) +lowerselection(t, s::Type) = Tuple(findall(x -> eltype(x) <: s, columns(t))) function lowerselection(t, s::All) s.cols == () && return lowerselection(t, valuenames(t)) diff --git a/src/table.jl b/src/indexedtable.jl similarity index 96% rename from src/table.jl rename to src/indexedtable.jl index 405e0197..dab871ce 100644 --- a/src/table.jl +++ b/src/indexedtable.jl @@ -1,5 +1,3 @@ -import Base: setindex!, reduce - """ A permutation @@ -16,7 +14,7 @@ end abstract type AbstractIndexedTable end """ -A tabular data structure that extends [`Columns`](@ref). Create a `IndexedTable` with the +A tabular data structure that extends [`Columns`](@ref). Create an `IndexedTable` with the [`table`](@ref) function. """ struct IndexedTable{C<:Columns} <: AbstractIndexedTable @@ -51,7 +49,9 @@ Construct a table from a vector of tuples. See [`rows`](@ref) and [`Columns`](@r Copy a Table or NDSparse to create a new table. The same primary keys as the input are used. - table(iter; kw...) + table(x; kw...) + +Create an `IndexedTable` from any object `x` that follows the `Tables.jl` interface. # Keyword Argument Options: @@ -353,7 +353,7 @@ function sort!(t::IndexedTable, by...; kwargs...) end """ - excludecols(itr, cols) + excludecols(itr, cols) -> Tuple of Int Names of all columns in `itr` except `cols`. `itr` can be any of `Table`, `NDSparse`, `Columns`, or `AbstractVector` @@ -369,22 +369,10 @@ Names of all columns in `itr` except `cols`. `itr` can be any of excludecols(t, pkeynames(t)) excludecols([1,2,3], (1,)) """ -function excludecols(t, cols) - if cols isa SpecialSelector - return excludecols(t, lowerselection(t, cols)) - end - if !isa(cols, Tuple) - return excludecols(t, (cols,)) - end - ns = colnames(t) - mask = ones(Bool, length(ns)) - for c in cols - i = colindex(t, c) - if i !== 0 - mask[i] = false - end - end - ((1:length(ns))[mask]...,) +excludecols(t, cols) = excludecols(t, (cols,)) +excludecols(t, cols::SpecialSelector) = excludecols(t, lowerselection(t, cols)) +function excludecols(t, cols::Tuple) + Tuple(setdiff(1:length(colnames(t)), map(x -> colindex(t, x), cols))) end """ diff --git a/src/indexing.jl b/src/indexing.jl index c570d7d7..4a808cb0 100644 --- a/src/indexing.jl +++ b/src/indexing.jl @@ -19,7 +19,6 @@ _in(x, v::AbstractString) = x == v _in(x, v::Symbol) = x === v _in(x, v::Number) = isequal(x, v) -import Base: tail # test whether row r is within product(idxs...) @inline row_in(cs, r::Integer, idxs) = _row_in(cs[1], r, idxs[1], tail(cs), tail(idxs)) @inline _row_in(c1, r, i1, rI, ri) = _in(c1[r],i1) & _row_in(rI[1], r, ri[1], tail(rI), tail(ri)) diff --git a/src/join.jl b/src/join.jl index e6f97b1f..1ac36a2b 100644 --- a/src/join.jl +++ b/src/join.jl @@ -102,7 +102,7 @@ function _join!(::Val{typ}, ::Val{grp}, ::Val{keepkeys}, f, I, data, ks, lout, r # optimized push! method for concat_tup _push!(Val{:both}(), f, data, lout, rout, ldata, rdata, - lperm[x], rperm[y], NA, NA) + lperm[x], rperm[y], missing, missing) end end else @@ -167,9 +167,9 @@ function _join!(::Val{typ}, ::Val{grp}, ::Val{keepkeys}, f, I, data, ks, lout, r lnull_idx, rnull_idx end -nullrow(::Type{T}) where {T <: Tuple} = Tuple(fieldtype(T, i)() for i = 1:fieldcount(T)) -nullrow(::Type{NamedTuple{names, T}}) where {names, T} = NamedTuple{names, T}(Tuple(fieldtype(T, i)() for i = 1:fieldcount(T))) -nullrow(t::Type{<:DataValue}) = t() +nullrow(t::Type{<:Tuple}) = Tuple(map(x->missing, fieldtypes(t))) +nullrow(t::Type{<:NamedTuple}) = t(Tuple(map(x->missing, fieldtypes(t)))) +nullrow(t) = missing function init_join_output(typ, grp, f, ldata, rdata, left, keepkeys, lkey, rkey, init_group, accumulate) lnull = nothing @@ -181,7 +181,7 @@ function init_join_output(typ, grp, f, ldata, rdata, left, keepkeys, lkey, rkey, left_type = eltype(ldata) if !isa(typ, Union{Val{:left}, Val{:inner}, Val{:anti}}) - null_left_type = map_params(x->DataValue{x}, eltype(ldata)) + null_left_type = map_params(x -> Union{Missing, x}, eltype(ldata)) lnull = nullrow(null_left_type) else null_left_type = left_type @@ -189,7 +189,7 @@ function init_join_output(typ, grp, f, ldata, rdata, left, keepkeys, lkey, rkey, right_type = eltype(rdata) if !isa(typ, Val{:inner}) - null_right_type = map_params(x->DataValue{x}, eltype(rdata)) + null_right_type = map_params(x->Union{Missing, x}, eltype(rdata)) rnull = nullrow(null_right_type) else null_right_type = right_type @@ -343,18 +343,14 @@ function Base.join(f, left::Dataset, right::Dataset; lnulls[lnull_idx] .= true lout = if lout isa Columns Columns(map(lout.columns) do col - if col isa DataValueArray - col.isna[lnull_idx] .= true - else - DataValueArray(col, lnulls) - end - end) + v = convert(Vector{Union{Missing, eltype(col)}}, col) + v[lnull_idx] .= missing + v + end) else - if lout isa DataValueArray - lout.isna[lnull_idx] .= true - else - DataValueArray(lout, lnulls) - end + v = convert(Vector{Union{Missing, eltype(lout)}}, lout) + v[lnull_idx] .= missing + v end data = concat_cols(lout, rout) end @@ -364,18 +360,14 @@ function Base.join(f, left::Dataset, right::Dataset; rnulls[rnull_idx] .= true rout = if rout isa Columns Columns(map(rout.columns) do col - if col isa DataValueArray - col.isna[rnull_idx] .= true - else - DataValueArray(col, rnulls) - end - end) + v = convert(Vector{Union{Missing, eltype(col)}}, col) + v[rnull_idx] .= missing + v + end) else - if rout isa DataValueArray - rout.isna[rnull_idx] .= true - else - DataValueArray(rout, rnulls) - end + v = convert(Vector{Union{Missing, eltype(rout)}}, rout) + v[rnull_idx] .= missing + v end data = concat_cols(lout, rout) end diff --git a/src/ndsparse.jl b/src/ndsparse.jl index 2e11bc85..6109e431 100644 --- a/src/ndsparse.jl +++ b/src/ndsparse.jl @@ -271,8 +271,6 @@ function permutedims(t::NDSparse, p::AbstractVector) end # showing - -import Base.show function show(io::IO, t::NDSparse{T,D}) where {T,D} flush!(t) if !(values(t) isa Columns) @@ -391,7 +389,7 @@ end # aggregation """ -`aggregate!(f::Function, arr::NDSparse)` + aggregate!(f::Function, arr::NDSparse) Combine adjacent rows with equal indices using the given 2-argument reduction function, in place. diff --git a/src/reshape.jl b/src/reshape.jl index 832ca907..aa540c10 100644 --- a/src/reshape.jl +++ b/src/reshape.jl @@ -29,18 +29,6 @@ function stack(t::D, by = pkeynames(t); select = isa(t, NDSparse) ? valuenames(t convert(collectiontype(D), Columns(bycols), Columns(labelcol, valuecol, names = [variable, value])) end -function unstack(::Type{D}, ::Type{T}, key, val, cols::AbstractVector{S}) where {D <:Dataset, T, S} - dest_val = Columns((DataValues.DataValueArray{T}(length(val)) for i in cols)...; names = cols) - for (i, el) in enumerate(val) - for j in el - k, v = j - isna(columns(dest_val, S(k))[i]) || error("Repeated values with same label are not allowed") - columns(dest_val, S(k))[i] = v - end - end - convert(collectiontype(D), key, dest_val) -end - """ unstack(t, by = pkeynames(t); variable = :variable, value = :value) @@ -61,5 +49,16 @@ function unstack(t::D, by = pkeynames(t); variable = :variable, value = :value) S = eltype(colnames(t)) cols = S.(union(columns(t, variable))) T = eltype(columns(t, value)) - unstack(D, T isa Type{<:DataValue} ? eltype(T) : T, pkeys(tgrp), columns(tgrp, value), cols) + unstack(D, Base.nonmissingtype(T), pkeys(tgrp), columns(tgrp, value), cols) +end + +function unstack(::Type{D}, ::Type{T}, key, val, cols::AbstractVector{S}) where {D <:Dataset, T, S} + dest_val = Columns((Array{Union{T, Missing}}(undef, length(val)) for i in cols)...; names = cols) + for (i, el) in enumerate(val) + for (k, v) in el + ismissing(columns(dest_val, S(k))[i]) || error("Repeated values with same label are not allowed") + columns(dest_val, S(k))[i] = v + end + end + convert(collectiontype(D), key, dest_val) end diff --git a/src/selection.jl b/src/selection.jl index 4022ed4d..51ccd369 100644 --- a/src/selection.jl +++ b/src/selection.jl @@ -41,10 +41,20 @@ end getfield(columns(t), which) end +""" + selectkeys(x::NDSparse, sel) + +Return an `NDSparse` with a subset of keys. +""" function selectkeys(x::NDSparse, which; kwargs...) ndsparse(rows(keys(x), which), values(x); kwargs...) end +""" + selectvalues(x::NDSparse, sel) + +Return an `NDSparse` with a subset of values +""" function selectvalues(x::NDSparse, which; presorted=true, copy=false, kwargs...) ndsparse(keys(x), rows(values(x), which); presorted=presorted, copy=copy, kwargs...) end @@ -122,51 +132,47 @@ function map(f, t::Dataset; select=nothing, copy=false, kwargs...) isa(x, Columns) ? table(x; copy=false, kwargs...) : x end -function _nonna(t::Union{Columns, IndexedTable}, by=(colnames(t)...,)) - indxs = [1:length(t);] - if !isa(by, Tuple) - by = (by,) - end +function _non_missing(t::Union{Columns, IndexedTable}, sel=(colnames(t)...,)) + indxs = collect(1:length(t)) + by = isa(sel, Tuple) ? sel : (sel,) bycols = columns(t, by) d = ColDict(t) for (key, c) in zip(by, bycols) x = rows(t, c) - #filt_by_col!(!ismissing, x, indxs) - #if Missing <: eltype(x) - # y = Array{nonmissing(eltype(x))}(undef, length(x)) - # y[indxs] = x[indxs] - filt_by_col!(!isna, x, indxs) - if isa(x, Array{<:DataValue}) - y = Array{eltype(eltype(x))}(undef, length(x)) - y[indxs] = map(get, x[indxs]) - x = y - elseif isa(x, DataValueArray) - x = x.values # unsafe unwrap + if Missing <: eltype(x) + filt_by_col!(!ismissing, x, indxs) + y = Vector{Base.nonmissingtype(eltype(x))}(undef, length(x)) + y[indxs] = x[indxs] + d[key] = y + else + d[key] = x end - d[key] = x end (d[], indxs) end """ - dropna(t) - dropna(t, select) + dropmissing(t) + dropmissing(t, select) -Drop rows of table `t` which contain NA (`DataValues.DataValue`) values, optionally only +Drop rows of table `t` which contain `missing` values, optionally only using the columns in `select`. -Column types will be converted to non-NA types. E.g. `Array{DataValue{Int}}` to `Array{Int}`. +Column types will be converted to non-`Missing` types. E.g. `Array{Union{Int, Missing}}` +to `Array{Int}`. # Example - t = table([0.1,0.5,NA,0.7], [2,NA,4,5], [NA,6,NA,7], names=[:t,:x,:y]) - dropna(t) - dropna(t, (:t, :x)) + t = table([0.1,0.5,missing,0.7], [2,missing,4,5], [missing,6,missing,7], names=[:t,:x,:y]) + dropmissing(t) + dropmissing(t, (:t, :x)) """ -function dropna(t::Dataset, by=(colnames(t)...,)) - subtable(_nonna(t, by)...,) +function dropmissing(t::Dataset, by=colnames(t)) + subtable(_non_missing(t, by)...) end +@deprecate dropna dropmissing + filt_by_col!(f, col, indxs) = filter!(i->f(col[i]), indxs) """ diff --git a/src/tables.jl b/src/tables.jl new file mode 100644 index 00000000..eedb264b --- /dev/null +++ b/src/tables.jl @@ -0,0 +1,30 @@ +#-----------------------------------------------------------------------# Columns +const TableColumns = Columns{T} where {T<:NamedTuple} + +Columns(x; kw...) = Columns(Tables.columntable(x); kw...) + +Tables.istable(::Type{<:TableColumns}) = true +Tables.materializer(c::TableColumns) = Columns + +Tables.rowaccess(c::TableColumns) = true +Tables.rows(c::TableColumns) = c +Tables.schema(c::TableColumns) = Tables.Schema(colnames(c), Tuple(map(eltype, c.columns))) + +Tables.columnaccess(c::TableColumns) = true +Tables.columns(c::TableColumns) = c.columns +# Tables.schema already defined for NamedTuple of Vectors (c.columns) + +#-----------------------------------------------------------------------# IndexedTable +Tables.istable(::Type{IndexedTable{C}}) where {C<:TableColumns} = true +Tables.materializer(t::IndexedTable) = table +for f in [:rowaccess, :rows, :columnaccess, :columns, :schema] + @eval Tables.$f(t::IndexedTable) = Tables.$f(Columns(columns(t))) +end + +#-----------------------------------------------------------------------# NDSparse +# Tables.istable(::Type{NDSparse{T,D,C,V}}) where {T,D,C<:TableColumns,V<:TableColumns} = true +# Tables.materializer(t::NDSparse) = ndpsarse + + + + diff --git a/src/tabletraits.jl b/src/tabletraits.jl index b167a386..b043f0e1 100644 --- a/src/tabletraits.jl +++ b/src/tabletraits.jl @@ -4,16 +4,6 @@ function IteratorInterfaceExtensions.getiterator(source::NDSparse) return rows(source) end -function _array_factory(t,rows) - if isa(t, TypeVar) - return Array{Any}(undef, rows) - elseif t <: DataValue - return DataValueArray{eltype(t)}(rows) - else - return Array{t}(undef, rows) - end -end - function ndsparse(x; idxcols=nothing, datacols=nothing, copy=false, kwargs...) if isiterable(x) source_data = collect_columns(getiterator(x)) @@ -52,10 +42,10 @@ function table(rows::AbstractArray{T}; copy=false, kwargs...) where {T<:Union{Tu table(collect_columns(rows); copy=false, kwargs...) end -function table(iter; copy=false, kwargs...) +function table(iter; copy=false, kw...) if TableTraits.isiterable(iter) - table(collect_columns(getiterator(iter)); copy=false, kwargs...) + table(collect_columns(getiterator(iter)); copy=copy, kw...) else - throw(ArgumentError("iter cannot be turned into a IndexedTable.")) + table(Tables.columntable(iter); copy=copy, kw...) end end diff --git a/src/utils.jl b/src/utils.jl index 81fb0976..a2465847 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -1,6 +1,3 @@ -import Base: tuple_type_cons, tuple_type_head, tuple_type_tail, in, ==, isless, convert, - length, eltype, show - (T::Type{<:StringArray})(::typeof(undef), args...) = T(args...) fastmap(f, xs...) = map(f, xs...) @@ -161,7 +158,7 @@ function namedtuple(fields...) end """ -`arrayof(T)` + arrayof(T) Returns the type of `Columns` or `Vector` suitable to store values of type T. Nested tuples beget nested Columns. @@ -332,8 +329,6 @@ end compact_mem(x) = x compact_mem(x::StringArray{String}) = convert(StringArray{WeakRefString{UInt8}}, x) -#nonmissing(::Type{Union{Missing, T}}) where T = T - function getsubfields(n::NamedTuple, fields) fns = fieldnames(typeof(n)) NamedTuple{(fns[fields]...,)}(n) diff --git a/test/runtests.jl b/test/runtests.jl index 6bb401e2..d9c76734 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,11 +1,18 @@ -using Test, IndexedTables, OnlineStats, DataValues, WeakRefStrings -import DataValues: NA +using Test, IndexedTables, OnlineStats, WeakRefStrings, Tables, Random, Dates, + PooledArrays, SparseArrays, WeakRefStrings, LinearAlgebra, Statistics, + TableTraits, IteratorInterfaceExtensions, Serialization -@testset "IndexedTables" begin +using IndexedTables: excludecols, sortpermby, primaryperm, best_perm_estimate, hascolumns, + collect_columns_flattened +if VERSION < v"1.0-" + select = IndexedTables.select +end + +include("test_tables.jl") +include("test_missing.jl") +include("test_join.jl") include("test_core.jl") include("test_utils.jl") include("test_tabletraits.jl") -include("test_collect.jl") - -end +include("test_collect.jl") \ No newline at end of file diff --git a/test/test_collect.jl b/test/test_collect.jl index b40b8512..610f54c3 100644 --- a/test/test_collect.jl +++ b/test/test_collect.jl @@ -1,5 +1,3 @@ -using IndexedTables: collect_columns_flattened - @testset "collectnamedtuples" begin v = [(a = 1, b = 2), (a = 1, b = 3)] @test collect_columns(v) == Columns((a = Int[1, 1], b = Int[2, 3])) @@ -95,9 +93,9 @@ end tuple_itr = (exp(i) for i in itr) @test collect_columns(tuple_itr) == Float64[] - t = collect_columns((a = i,) for i in (1, DataValue{Int}(), 3)) - @test columns(t, 1) isa DataValueArray - @test isequal(columns(t, 1), DataValueArray([1, DataValue{Int}(), 3])) + t = collect_columns((a = i,) for i in (1, missing, 3)) + @test columns(t, 1) isa Vector{Union{Missing, Int}} + @test isequal(columns(t, 1), [1, missing, 3]) end @testset "collectpairs" begin @@ -126,8 +124,8 @@ end @test collect_columns(v) == Columns(Columns((a = Int[],))=>Columns((b = String[],))) @test eltype(collect_columns(v)) == Pair{NamedTuple{(:a,), Tuple{Int}}, NamedTuple{(:b,), Tuple{String}}} - t = table(collect_columns((b = 1,) => (a = i,) for i in (2, DataValue{Int}(), 3))) - @test t == table((b = [1,1,1], a = [2, DataValue{Int}(), 3]), pkey = :b) + t = table(collect_columns((b = 1,) => (a = i,) for i in (2, missing, 3))) + @test isequal(t, table((b = [1,1,1], a = [2, missing, 3]), pkey = :b)) end @testset "issubtype" begin diff --git a/test/test_core.jl b/test/test_core.jl index ce867b18..e4c17936 100644 --- a/test/test_core.jl +++ b/test/test_core.jl @@ -1,5 +1,4 @@ -using Test, Random, Dates, IndexedTables, PooledArrays, SparseArrays, WeakRefStrings, LinearAlgebra, Statistics -import IndexedTables: update!, pkeynames, pkeys, excludecols, sortpermby, primaryperm, best_perm_estimate, hascolumns, select + c = Columns([1,1,1,2,2], [1,2,4,3,5]) d = Columns([1,1,2,2,2], [1,3,1,4,5]) @@ -147,7 +146,7 @@ end @test collect(Base.pairs(a)) == [(1,1)=>10, (2,2)=>9, (2,3)=>8, (2,4)=>7] @test first(Base.pairs(a[:, 3])) == ((2,)=>8) - update!(x->x+10, a, 2, :) + IndexedTables.update!(x->x+10, a, 2, :) @test a == NDSparse([1,2,2,2], [1,2,3,4], [10,19,18,17]) a[2,2:3] = 77 @@ -638,11 +637,18 @@ end @test map(t -> (1,2), table(Int[])) == table(Int[], Int[]) end +@testset "Original Join Testset" begin l = table([1, 1, 2, 2], [1, 2, 1, 2], [1, 2, 3, 4], names=[:a, :b, :c], pkey=(:a, :b)) r = table([0, 1, 1, 3], [1, 1, 2, 2], [1, 2, 3, 4], names=[:a, :b, :d], pkey=(:a, :b)) @test join(l, r) == table([1, 1], [1, 2], [1, 2], [2, 3], names=Symbol[:a, :b, :c, :d]) - @test isequal(join(l, r, how=:left), table([1, 1, 2, 2], [1, 2, 1, 2], [1, 2, 3, 4], DataValueArray([2, 3, NA, NA]), names=Symbol[:a, :b, :c, :d])) - @test isequal(join(l, r, how=:outer), table([0, 1, 1, 2, 2, 3], [1, 1, 2, 1, 2, 2], DataValueArray([NA, 1, 2, 3, 4, NA]), DataValueArray([1, 2, 3, NA, NA, 4]), names=Symbol[:a, :b, :c, :d])) + @test isequal( + join(l, r, how=:left), + table((a=[1, 1, 2, 2], b=[1, 2, 1, 2], c=[1, 2, 3, 4], d=[2, 3, missing, missing])) + ) + @test isequal( + join(l, r, how=:outer), + table((a=[0, 1, 1, 2, 2, 3], b=[1, 1, 2, 1, 2, 2], c=[missing, 1, 2, 3, 4, missing], d=[1, 2, 3, missing, missing, 4])) + ) a = table([1],[2], names=[:x,:y]) b = table([1],[3], names=[:a,:b]) @test join(a, b, lkey=:x,rkey=:a) == table([1],[2],[3], names=[:x,:y,:b]) # issue JuliaDB.jl#105 @@ -651,11 +657,11 @@ end l1 = table([1, 2, 2, 3], [1, 2, 3, 4], names=[:x, :y]) r1 = table([2, 2, 3, 3], [5, 6, 7, 8], names=[:x, :z]) @test join(l1, r1, lkey=:x, rkey=:x) == table([2, 2, 2, 2, 3, 3], [2, 2, 3, 3, 4, 4], [5, 6, 5, 6, 7, 8], names=Symbol[:x, :y, :z]) - @test isequal(join(l, r, lkey=:a, rkey=:a, lselect=:b, rselect=:d, how=:outer), table([0, 1, 1, 1, 1, 2, 2, 3], DataValueArray([NA, 1, 1, 2, 2, 1, 2, NA]), DataValueArray([1, 2, 3, 2, 3, NA, NA, 4]), names=Symbol[:a, :b, :d])) + @test isequal(join(l, r, lkey=:a, rkey=:a, lselect=:b, rselect=:d, how=:outer), table([0, 1, 1, 1, 1, 2, 2, 3], [missing, 1, 1, 2, 2, 1, 2, missing], [1, 2, 3, 2, 3, missing, missing, 4], names=Symbol[:a, :b, :d])) t = table(["a","b","c","a"], [1,2,3,4]); t1 = table(["a","b"], [1,2]) - @test isequal(leftjoin(t,t1,lkey=1,rkey=1), table(["a","a","b","c"], [1,4,2,3], [1,1,2,NA])) + @test isequal(leftjoin(t,t1,lkey=1,rkey=1), table(["a","a","b","c"], [1,4,2,3], [1,1,2,missing])) t1 = table([1,2,3,4], [5,6,7,8], pkey=[1]) t2 = table([0,3,4,5], [5,6,7,8], pkey=[1]) @@ -680,35 +686,39 @@ end @test naturaljoin(c, a) == NDSparse([12,32], [52,34], Columns([0,1], [2,3], [11,150])) @test isequal( - leftjoin(t1, t2, lselect=2, rselect=2), - table([1,2,3,4], [5,6,7,8], [NA, NA, 6, 7])) + leftjoin(t1, t2, lselect=2, rselect=2), + table([1,2,3,4], [5,6,7,8], [missing, missing, 6, 7]) + ) # null instead of missing row - @test isequal(leftjoin(+, t1, t2, lselect=2, rselect=2), table([1,2,3,4], [NA, NA, 13, 15])) + @test isequal( + leftjoin(+, t1, t2, lselect=2, rselect=2), + table([1,2,3,4], [missing, missing, 13, 15]) + ) - @test isequal(leftjoin(t1, t2), table([1,2,3,4], [5,6,7,8], [NA, NA, 6,7])) - @test isequal(leftjoin(+, t1, t3, lselect=2, rselect=2), table([1,2,3,4,4],[NA,NA,13,15,16])) - @test isequal(leftjoin(+, t3, t4, lselect=2, rselect=2), table([0,3,4,4,4,4], [NA, 12, 14,15,15,16])) + @test isequal(leftjoin(t1, t2), table([1,2,3,4], [5,6,7,8], [missing, missing, 6,7])) + @test isequal(leftjoin(+, t1, t3, lselect=2, rselect=2), table([1,2,3,4,4],[missing,missing,13,15,16])) + @test isequal(leftjoin(+, t3, t4, lselect=2, rselect=2), table([0,3,4,4,4,4], [missing, 12, 14,15,15,16])) @test isequal(leftjoin(NDSparse([1,1,1,2], [2,3,4,4], [5,6,7,8]), NDSparse([1,1,3], [2,4,4], [9,10,12])), - NDSparse([1,1,1,2], [2,3,4,4], Columns([5, 6, 7, 8], [9, NA, 10, NA]))) + NDSparse([1,1,1,2], [2,3,4,4], Columns([5, 6, 7, 8], [9, missing, 10, missing]))) @test isequal( leftjoin(NDSparse([1,1,1,2], [2,3,4,4], [5,6,7,8]), NDSparse([1,1,2], [2,4,4], [9,10,12])), - NDSparse([1,1,1,2], [2,3,4,4], Columns([5, 6, 7, 8], [9, NA, 10, 12]))) + NDSparse([1,1,1,2], [2,3,4,4], Columns([5, 6, 7, 8], [9, missing, 10, 12]))) - @test isequal(outerjoin(t1, t2, lselect=2, rselect=2), table([0,1,2,3,4,5], [NA, 5,6,7,8,NA], [5,NA,NA,6,7,8])) + @test isequal(outerjoin(t1, t2, lselect=2, rselect=2), table([0,1,2,3,4,5], [missing, 5,6,7,8,missing], [5,missing,missing,6,7,8])) #showl instead of missing row - @test isequal(outerjoin(+, t1, t2, lselect=2, rselect=2), table([0,1,2,3,4,5], [NA, NA, NA, 13, 15, NA])) - - @test isequal(outerjoin(t1, t2), table([0,1,2,3,4,5], [NA, 5,6,7,8,NA], [5,NA,NA,6,7,8])) - @test isequal(outerjoin(+, t1, t3, lselect=2, rselect=2), table([0,1,2,3,4,4],[NA,NA,NA,13,15,16])) - @test isequal(outerjoin(+, t3, t4, lselect=2, rselect=2), table([0,1,3,4,4,4,4], [NA, NA, 12,14,15,15,16])) + @test isequal(outerjoin(+, t1, t2, lselect=2, rselect=2), table([0,1,2,3,4,5], [missing, missing, missing, 13, 15, missing])) + @test isequal(outerjoin(t1, t2), table([0,1,2,3,4,5], [missing, 5,6,7,8,missing], [5,missing,missing,6,7,8])) + @test isequal(outerjoin(+, t1, t3, lselect=2, rselect=2), table([0,1,2,3,4,4],[missing,missing,missing,13,15,16])) + @test isequal(outerjoin(+, t3, t4, lselect=2, rselect=2), table([0,1,3,4,4,4,4], [missing, missing, 12,14,15,15,16])) +end @testset "groupjoin" begin l = table([1, 1, 1, 2], [1, 2, 2, 1], [1, 2, 3, 4], names=[:a, :b, :c], pkey=(:a, :b)) r = table([0, 1, 1, 2], [1, 2, 2, 1], [1, 2, 3, 4], names=[:a, :b, :d], pkey=(:a, :b)) @@ -759,6 +769,8 @@ end @test select(t, Between(:x, :z)) == select(t, (:x, :y, :z)) @test select(t, i -> i == :y) == select(t, (:y,)) @test select(t, r"x|z") == select(t, (:x, :z)) + @test select(t, Int) == select(t, (:x, :z)) + @test select(t, String) == select(t, (:y,)) @test rows(t, Keys()) == rows(t, (:x,)) @test rows(t, (Keys(), :y)) == rows(t, ((:x,), :y)) @@ -796,13 +808,12 @@ end @test hascolumns(t, i -> i == :y) @test hascolumns(t, r"x|z") end - - t = table([0.1, 0.5, NA, 0.7], [2, NA, 4, 5], [NA, 6, NA, 7], names=[:t, :x, :y]) - @test dropna(t) == table([0.7], [5], [7], names=Symbol[:t, :x, :y]) - @test isequal(dropna(t, :y), table([0.5, 0.7], [NA, 5], [6, 7], names=Symbol[:t, :x, :y])) - t1 = dropna(t, (:t, :x)) - @test typeof(column(dropna(t, :x), :x)) == Array{Int,1} - +@testset "dropmissing" begin + t = table([0.1, 0.5, missing, 0.7], [2, missing, 4, 5], [missing, 6, missing, 7], names=[:t, :x, :y]) + @test dropmissing(t) == table([0.7], [5], [7], names=Symbol[:t, :x, :y]) + @test isequal(dropmissing(t, :y), table([0.5, 0.7], [missing, 5], [6, 7], names=Symbol[:t, :x, :y])) + @test typeof(column(dropmissing(t, :x), :x)) == Array{Int,1} +end @testset "filter" begin t = table(["a", "b", "c"], [0.01, 0.05, 0.07], [2, 1, 0], names=[:n, :t, :x]) @test filter((p->p.x / p.t < 100), t) == table(["b", "c"], [0.05, 0.07], [1, 0], names=Symbol[:n, :t, :x]) diff --git a/test/test_join.jl b/test/test_join.jl new file mode 100644 index 00000000..b544b40e --- /dev/null +++ b/test/test_join.jl @@ -0,0 +1,31 @@ +@testset "Test Joins" begin + y = rand(10) + z = rand(10) + + t = table((x=1:10, y=y), pkey=:x) + t2 = table((x=1:2:20, z=z), pkey=:x) + + @testset "how = :inner" begin + t_inner = table((x = 1:2:9, y = y[1:2:9], z = z[1:5]), pkey = :x) + @test isequal(join(t, t2; how=:inner), t_inner) + end + @testset "how = :left" begin + z_left = Union{Float64,Missing}[missing for i in 1:10] + z_left[1:2:9] = z[1:5] + t_left = table((x = 1:10, y = y, z = z_left)) + @test isequal(join(t, t2; how=:left), t_left) + end + @testset "how = :outer" begin + x_outer = union(1:10, 1:2:20) + y_outer = vcat(y, fill(missing, 5)) + z_left = Union{Float64,Missing}[missing for i in 1:10] + z_left[1:2:9] = z[1:5] + z_outer = vcat(z_left, z[6:10]) + t_outer = table((x=x_outer, y=y_outer, z=z_outer); pkey=:x) + @test isequal(join(t, t2; how=:outer), t_outer) + end + @testset "how = :anti" begin + t_anti = table((x=2:2:10, y=y[2:2:10]), pkey=:x) + @test isequal(join(t, t2; how=:anti), t_anti) + end +end \ No newline at end of file diff --git a/test/test_missing.jl b/test/test_missing.jl new file mode 100644 index 00000000..aa5ac355 --- /dev/null +++ b/test/test_missing.jl @@ -0,0 +1,13 @@ +@testset "Missing" begin + @testset "Table equality with missing" begin + @test ismissing(table([1, 2, missing]) == table([1, 2, missing])) + @test isequal(table([1,2,missing]), table([1,2,missing])) + @test ismissing(ndsparse([1], [missing]) == ndsparse([1], [missing])) + @test isequal(ndsparse([1], [missing]), ndsparse([1], [missing])) + @test !isequal(ndsparse([2], [missing]), ndsparse([1], [missing])) + end + @testset "stack/unstack" begin + t = table(1:4, [1, missing, 9, 16], [1, 8, 27, missing], names = [:x, :x2, :x3], pkey = :x) + @test isequal(t, unstack(stack(t))) + end +end \ No newline at end of file diff --git a/test/test_tables.jl b/test/test_tables.jl new file mode 100644 index 00000000..c0a91b2a --- /dev/null +++ b/test/test_tables.jl @@ -0,0 +1,14 @@ + + + +@testset "Tables Interface" begin + n = 1000 + x, y, z = 1:n, rand(Bool, n), randn(n) + + t = table((x=x, y=y, z=z), pkey=[:x, :y]) + + @test Tables.istable(t) + # @test t == table(Tables.rowtable((x=x,y=y,z=z))) + @test Tables.istable(columns(t)) + @test Tables.istable(Columns(columns(t))) +end \ No newline at end of file diff --git a/test/test_tabletraits.jl b/test/test_tabletraits.jl index a4c55be0..d21e16d0 100644 --- a/test/test_tabletraits.jl +++ b/test/test_tabletraits.jl @@ -1,7 +1,3 @@ -using IndexedTables -using TableTraits, IteratorInterfaceExtensions -using Test - @testset "TableTraits" begin source_nds = NDSparse(Columns(a=[1,2,3]), Columns(b=[1.,2.,3.], c=["A","B","C"])) diff --git a/test/test_utils.jl b/test/test_utils.jl index ff5fbdbd..264758aa 100644 --- a/test/test_utils.jl +++ b/test/test_utils.jl @@ -1,5 +1,3 @@ -using Test, IndexedTables, Serialization - let a = NDSparse([12,21,32], [52,41,34], [11,53,150]), b = NDSparse([12,23,32], [52,43,34], [56,13,10]) p = vec(collect(IndexedTables.product(a, b))) @test p == [(11,56), (11,13), (11,10), (53,56), (53,13), (53,10), (150,56), (150,13), (150,10)]