Skip to content

Commit 19e13b9

Browse files
committed
define pushfirst! for data sets
1 parent 803f632 commit 19e13b9

File tree

2 files changed

+330
-0
lines changed

2 files changed

+330
-0
lines changed

src/dataset/del_and_append.jl

Lines changed: 328 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -639,3 +639,331 @@ function Base.push!(ds::Dataset, row::Any; promote::Bool=false)
639639
_reset_grouping_info!(ds)
640640
ds
641641
end
642+
643+
644+
# Prepend one row to `ds`, taking the values from `row` and matching them to the
# columns of `ds` by name.
#
# `cols` selects the matching rule and must be one of `:orderequal`, `:setequal`,
# `:intersect`, `:subset` or `:union`; any other value throws an `ArgumentError`.
# When `promote` is true, a column whose element type cannot hold the new value is
# replaced by a freshly allocated column of the promoted element type.
# Returns `ds`.
#
# If adding a value fails, every value that was already prepended to another
# column is removed again from the front, so the original rows stay intact, the
# `modified` timestamp is restored and the error is rethrown.
function Base.pushfirst!(ds::Dataset, row::Union{AbstractDict, NamedTuple};
                         cols::Symbol=:setequal,
                         promote::Bool=(cols in [:union, :subset]))
    # push keep formats
    possible_cols = (:orderequal, :setequal, :intersect, :subset, :union)
    if !(cols in possible_cols)
        throw(ArgumentError("`cols` keyword argument must be any of :" *
                            join(possible_cols, ", :")))
    end

    nrows, ncols = size(ds)
    targetrows = nrows + 1
    # here the formats should be kept, setproperty! modifies time
    if ncols == 0 && row isa NamedTuple
        # Special case: a data set without columns takes its columns from `row`.
        for (n, v) in pairs(row)
            format_of_cur_col = getformat(ds, n)
            setproperty!(ds, n, fill!(allocatecol(typeof(v), 1), v))
            setformat!(ds, n => format_of_cur_col)
        end
        _reset_grouping_info!(ds)
        return ds
    end

    # Remember the caller's type: `row` may be rebound to a NamedTuple below, but
    # the `:orderequal` check must still reject an unordered `Dict`.
    old_row_type = typeof(row)
    if row isa AbstractDict && keytype(row) !== Symbol &&
        (keytype(row) <: AbstractString || all(x -> x isa AbstractString, keys(row)))
        row = (;(Symbol.(keys(row)) .=> values(row))...)
    end

    # in the code below we use a direct access to _columns because
    # we resize the columns so temporarily the `Dataset` is internally
    # inconsistent and normal data set indexing would error.
    if cols == :union
        current_modified = _attributes(ds).meta.modified[]
        if row isa AbstractDict && keytype(row) !== Symbol && !all(x -> x isa Symbol, keys(row))
            throw(ArgumentError("when `cols == :union` all keys of row must be Symbol"))
        end
        for (i, colname) in enumerate(_names(ds))
            format_of_cur_col = getformat(ds, colname)
            col = _columns(ds)[i]
            if haskey(row, colname)
                val = row[colname]
            else
                val = missing
            end
            S = typeof(val)
            T = eltype(col)
            if S <: T || promote_type(S, T) <: T
                pushfirst!(col, val)
            elseif !promote
                try
                    pushfirst!(col, val)
                catch err
                    setformat!(ds, colname => format_of_cur_col)
                    # Roll back: the new value sits at index 1 of every column
                    # that was already extended, so drop it from the FRONT.
                    # Truncating the tail would discard original data instead.
                    for col2 in _columns(ds)
                        while length(col2) > nrows
                            popfirst!(col2)
                        end
                    end
                    _attributes(ds).meta.modified[] = current_modified
                    @error "Error adding value to column :$colname."
                    rethrow(err)
                end
            else
                # Promotion: allocate a wider column, put the new value first and
                # the existing values behind it.
                newcol = similar(col, promote_type(S, T), targetrows)
                copyto!(newcol, 2, col, 1, nrows)
                newcol[1] = val
                firstindex(newcol) != 1 && _onebased_check_error()
                _columns(ds)[i] = newcol
                setformat!(ds, colname => format_of_cur_col)
            end
        end
        for (colname, col) in zip(_names(ds), _columns(ds))
            if length(col) != targetrows
                # Same front-side rollback as above before reporting the problem.
                for col2 in _columns(ds)
                    while length(col2) > nrows
                        popfirst!(col2)
                    end
                end
                _attributes(ds).meta.modified[] = current_modified
                throw(AssertionError("Error adding value to column :$colname"))
            end
        end
        # Columns present only in `row` are added to `ds`; existing rows get
        # `missing` and the new value goes into the first row.
        for colname in setdiff(keys(row), _names(ds))
            val = row[colname]
            S = typeof(val)
            if nrows == 0
                newcol = [val]
            else
                newcol = allocatecol(Union{Missing, S}, targetrows)
                fill!(newcol, missing)
                newcol[1] = val
            end
            ds[!, colname] = newcol
        end
        _modified(_attributes(ds))
        _reset_grouping_info!(ds)
        return ds
    end

    if cols == :orderequal
        if old_row_type <: Dict
            throw(ArgumentError("passing `Dict` as `row` when `cols == :orderequal` " *
                                "is not allowed as it is unordered"))
        elseif length(row) != ncol(ds) || any(x -> x[1] != x[2], zip(keys(row), _names(ds)))
            throw(ArgumentError("when `cols == :orderequal` pushed row must " *
                                "have the same column names and in the " *
                                "same order as the target data set"))
        end
    elseif cols === :setequal
        # Only check for equal lengths if :setequal is selected,
        # as an error will be thrown below if some names don't match
        if length(row) != ncols
            # an explicit error is thrown as this was allowed in the past
            throw(ArgumentError("`pushfirst!` with `cols` equal to `:setequal` " *
                                "requires `row` to have the same number of elements " *
                                "as the number of columns in `ds`."))
        end
    end
    current_col = 0
    current_modified = _attributes(ds).meta.modified[]
    try
        for (col, nm) in zip(_columns(ds), _names(ds))
            # Advance the counter first so that the error message in the `catch`
            # block names the column being processed, also when `getformat` fails.
            current_col += 1
            format_of_cur_col = getformat(ds, nm)
            if cols === :subset
                val = get(row, nm, missing)
            else
                val = row[nm]
            end
            S = typeof(val)
            T = eltype(col)
            if S <: T || !promote || promote_type(S, T) <: T
                pushfirst!(col, val)
            else
                newcol = similar(col, promote_type(S, T), targetrows)
                copyto!(newcol, 2, col, 1, nrows)
                newcol[1] = val
                firstindex(newcol) != 1 && _onebased_check_error()
                _columns(ds)[columnindex(ds, nm)] = newcol
                setformat!(ds, nm => format_of_cur_col)
            end
        end
        current_col = 0
        for col in _columns(ds)
            current_col += 1
            @assert length(col) == targetrows
        end
    catch err
        # Roll back the partial row by removing the prepended value from the
        # front of every column that was already extended.
        for col in _columns(ds)
            while length(col) > nrows
                popfirst!(col)
            end
        end
        _attributes(ds).meta.modified[] = current_modified
        @error "Error adding value to column :$(_names(ds)[current_col])."
        rethrow(err)
    end
    _modified(_attributes(ds))
    _reset_grouping_info!(ds)
    return ds
end
800+
801+
"""
802+
pushfirst!(ds::Dataset, row::Union{Tuple, AbstractArray}; promote::Bool=false)
803+
pushfirst!(ds::Dataset, row::Union{DatasetRow, NamedTuple, AbstractDict};
804+
cols::Symbol=:setequal, promote::Bool=(cols in [:union, :subset]))
805+
806+
Add in-place one row at the beginning of `ds` taking the values from `row`.
807+
808+
Column types of `ds` are preserved, and new values are converted if necessary.
809+
An error is thrown if conversion fails.
810+
811+
If `row` is neither a `DatasetRow`, `NamedTuple` nor `AbstractDict` then
812+
it must be a `Tuple` or an `AbstractArray`
813+
and columns are matched by order of appearance. In this case `row` must contain
814+
the same number of elements as the number of columns in `ds`.
815+
816+
If `row` is a `DatasetRow`, `NamedTuple` or `AbstractDict` then
817+
values in `row` are matched to columns in `ds` based on names. The exact behavior
818+
depends on the `cols` argument value in the following way:
819+
* If `cols == :setequal` (this is the default)
820+
then `row` must contain exactly the same columns as `ds` (but possibly in a
821+
different order).
822+
* If `cols == :orderequal` then `row` must contain the same columns in the same
823+
order (for `AbstractDict` this option requires that `keys(row)` matches
824+
`propertynames(ds)` to allow for support of ordered dicts; however, if `row`
825+
is a `Dict` an error is thrown as it is an unordered collection).
826+
* If `cols == :intersect` then `row` may contain more columns than `ds`,
827+
but all column names that are present in `ds` must be present in `row` and only
828+
they are used to populate a new row in `ds`.
829+
* If `cols == :subset` then `pushfirst!` behaves like for `:intersect` but if some
830+
column is missing in `row` then a `missing` value is pushed to `ds`.
831+
* If `cols == :union` then columns missing in `ds` that are present in `row` are
832+
added to `ds` (using `missing` for existing rows) and a `missing` value is
833+
pushed to columns missing in `row` that are present in `ds`.
834+
835+
If `promote=true` and element type of a column present in `ds` does not allow
836+
the type of a pushed argument then a new column with a promoted element type
837+
allowing it is freshly allocated and stored in `ds`. If `promote=false` an error
838+
is thrown.
839+
840+
As a special case, if `ds` has no columns and `row` is a `NamedTuple` or
841+
`DatasetRow`, columns are created for all values in `row`, using their names
842+
and order.
843+
844+
Please note that `pushfirst!` must not be used on a `Dataset` that contains columns
845+
that are aliases (equal when compared with `===`).
846+
847+
# Examples
848+
```jldoctest
849+
julia> ds = Dataset(A=1:3, B=1:3);
850+
851+
julia> pushfirst!(ds, (true, false))
852+
4×2 Dataset
853+
Row │ A B
854+
│ identity identity
855+
│ Int64? Int64?
856+
─────┼────────────────────
857+
1 │ 1 0
858+
2 │ 1 1
859+
3 │ 2 2
860+
4 │ 3 3
861+
862+
julia> pushfirst!(ds, ds[1, :])
863+
5×2 Dataset
864+
Row │ A B
865+
│ identity identity
866+
│ Int64? Int64?
867+
─────┼────────────────────
868+
1 │ 1 0
869+
2 │ 1 0
870+
3 │ 1 1
871+
4 │ 2 2
872+
5 │ 3 3
873+
874+
julia> pushfirst!(ds, (C="something", A=true, B=false), cols=:intersect)
875+
6×2 Dataset
876+
Row │ A B
877+
│ identity identity
878+
│ Int64? Int64?
879+
─────┼────────────────────
880+
1 │ 1 0
881+
2 │ 1 0
882+
3 │ 1 0
883+
4 │ 1 1
884+
5 │ 2 2
885+
6 │ 3 3
886+
887+
julia> pushfirst!(ds, Dict(:A=>1.0, :C=>1.0), cols=:union)
888+
7×3 Dataset
889+
Row │ A B C
890+
│ identity identity identity
891+
│ Float64? Int64? Float64?
892+
─────┼───────────────────────────────
893+
1 │ 1.0 missing 1.0
894+
2 │ 1.0 0 missing
895+
3 │ 1.0 0 missing
896+
4 │ 1.0 0 missing
897+
5 │ 1.0 1 missing
898+
6 │ 2.0 2 missing
899+
7 │ 3.0 3 missing
900+
901+
julia> pushfirst!(ds, NamedTuple(), cols=:subset)
902+
8×3 Dataset
903+
Row │ A B C
904+
│ identity identity identity
905+
│ Float64? Int64? Float64?
906+
─────┼────────────────────────────────
907+
1 │ missing missing missing
908+
2 │ 1.0 missing 1.0
909+
3 │ 1.0 0 missing
910+
4 │ 1.0 0 missing
911+
5 │ 1.0 0 missing
912+
6 │ 1.0 1 missing
913+
7 │ 2.0 2 missing
914+
8 │ 3.0 3 missing
915+
```
916+
"""
917+
function Base.pushfirst!(ds::Dataset, row::Any; promote::Bool=false)
    # Positional variant: the values of `row` are matched to the columns of `ds`
    # by order of appearance, so `row` must have exactly one value per column.

    # Modify Dataset
    if !(row isa Union{Tuple, AbstractArray})
        # an explicit error is thrown as this was allowed in the past
        throw(ArgumentError("`pushfirst!` does not allow passing collections of type " *
                            "$(typeof(row)) to be pushed into a Dataset. Only " *
                            "`Tuple`, `AbstractArray`, `AbstractDict`, `DatasetRow` " *
                            "and `NamedTuple` are allowed."))
    end
    nrows, ncols = size(ds)
    targetrows = nrows + 1
    if length(row) != ncols
        msg = "Length of `row` does not match `Dataset` column count."
        throw(DimensionMismatch(msg))
    end
    current_col = 0
    # Saved so that a failed insert can restore the `modified` timestamp.
    current_modified = _attributes(ds).meta.modified[]
    try
        for (i, (col, val)) in enumerate(zip(_columns(ds), row))
            current_col += 1
            format_of_cur_col = getformat(ds, current_col)
            S = typeof(val)
            T = eltype(col)
            if S <: T || !promote || promote_type(S, T) <: T
                pushfirst!(col, val)
            else
                # Promotion: allocate a wider column, put the new value first and
                # the existing values behind it, then restore the column format.
                newcol = allocatecol(promote_type(S, T), targetrows)
                copyto!(newcol, 2, col, 1, nrows)
                newcol[1] = val
                firstindex(newcol) != 1 && _onebased_check_error()
                _columns(ds)[i] = newcol
                setformat!(ds, i => format_of_cur_col)
            end
        end
        current_col = 0
        for col in _columns(ds)
            current_col += 1
            @assert length(col) == targetrows
        end
    catch err
        # Clean up the partial row. The new value sits at index 1 of every column
        # that was already extended, so it has to be removed from the FRONT;
        # truncating the tail would discard the last original row instead.
        for col in _columns(ds)
            while length(col) > nrows
                popfirst!(col)
            end
        end
        _attributes(ds).meta.modified[] = current_modified
        @error "Error adding value to column :$(_names(ds)[current_col])."
        rethrow(err)
    end
    _modified(_attributes(ds))
    _reset_grouping_info!(ds)
    return ds
end

src/datasetrow/datasetrow.jl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -504,6 +504,8 @@ end
504504

505505
# Append the contents of `dsr` to `ds` by converting it to a `NamedTuple` and
# delegating to the name-matching `push!` method; `cols` and `promote` are
# forwarded as keyword arguments.
function Base.push!(ds::Dataset, dsr::DatasetRow, cols::Symbol=:setequal,
                    promote::Bool=(cols in [:union, :subset]))
    row = NamedTuple(dsr)
    return push!(ds, row; cols=cols, promote=promote)
end
507+
# Prepend the contents of `dsr` to `ds` by converting it to a `NamedTuple` and
# delegating to the name-matching `pushfirst!` method.
#
# `cols` and `promote` are accepted as keyword arguments, as the `pushfirst!`
# docstring describes for `DatasetRow`. The positional forms
# `pushfirst!(ds, dsr, cols)` and `pushfirst!(ds, dsr, cols, promote)` are kept
# so that existing callers continue to work.
Base.pushfirst!(ds::Dataset, dsr::DatasetRow; cols::Symbol=:setequal,
                promote::Bool=(cols in [:union, :subset])) =
    pushfirst!(ds, NamedTuple(dsr); cols = cols, promote = promote)
Base.pushfirst!(ds::Dataset, dsr::DatasetRow, cols::Symbol,
                promote::Bool=(cols in [:union, :subset])) =
    pushfirst!(ds, NamedTuple(dsr); cols = cols, promote = promote)
507509
# @noinline pushhelper!(x, r) = push!(x, x[r])
508510
#
509511
# function Base.push!(ds::Dataset, dsr::DatasetRow; cols::Symbol=:setequal,

0 commit comments

Comments
 (0)