Skip to content

Commit a09e1c6

Browse files
committed
support keep/leave = :random in nonunique/unique/unique!
1 parent 3e5a1ee commit a09e1c6

File tree

2 files changed

+51
-14
lines changed

2 files changed

+51
-14
lines changed

src/abstractdataset/abstractdataset.jl

Lines changed: 31 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -989,9 +989,10 @@ A row is a duplicate if there exists a prior row with all columns containing
989989
equal values (according to `isequal`).
990990
991991
If `mapformats = true` the values are checked based on their formatted values.
992-
`leave = :first` means that everey occurance after the first one be marked as non-unique value, and
993-
`leave = :last` means that every occurance before the last one be marked as non-unique value.
994-
`leave = :none` means that no duplicated rows are marked as non-unique value.
992+
`leave = :first` means that every occurrence after the first one is marked as a non-unique value,
993+
`leave = :last` means that every occurrence before the last one is marked as a non-unique value,
994+
`leave = :none` means that no duplicated rows are marked as non-unique value,
995+
`leave = :random` means that, among duplicated rows, every occurrence except one randomly chosen occurrence is marked as a non-unique value.
995996
996997
See also [`unique`](@ref) and [`unique!`](@ref).
997998
@@ -1052,13 +1053,16 @@ julia> nonunique(ds, 2)
10521053
"""
10531054
function nonunique(ds::AbstractDataset, cols::MultiColumnIndex = :; mapformats = false, leave = :first)
10541055
# :xor, :nor, :and, :or are undocumented
1055-
!(leave in (:first, :last, :none, :xor, :nand, :nor, :and, :or)) && throw(ArgumentError("`leave` must be either `:first`, `:last`, or `:none`"))
1056+
!(leave in (:first, :last, :none, :random, :xor, :nand, :nor, :and, :or)) && throw(ArgumentError("`leave` must be either `:first`, `:last`, `:none`, or `random`"))
10561057
if ncol(ds) == 0
10571058
throw(ArgumentError("finding duplicate rows in data set with no " *
10581059
"columns is not allowed"))
10591060
end
10601061

10611062
groups, gslots, ngroups = _gather_groups(ds, cols, nrow(ds) < typemax(Int32) ? Val(Int32) : Val(Int64), mapformats = mapformats, stable = false)
1063+
if leave === :random
1064+
return _nonunique_random_leave(groups, ngroups, nrow(ds))
1065+
end
10621066
res = trues(nrow(ds))
10631067
seen_groups = falses(ngroups)
10641068
if leave === :first
@@ -1105,6 +1109,29 @@ function _nonunique_barrier!(res, groups, seen_groups; first = true)
11051109
nothing
11061110
end
11071111

1112+
function _nonunique_random_leave_barrier!(counts, groups)
    # First pass: tally how many rows carry each group id.
    # `counts` is indexed by group id and is incremented in place, so the
    # caller is expected to pass it zero-filled.
    for gid in groups
        counts[gid] += 1
    end
    # Second pass: overwrite every tally `n` with a draw from `1:n`, i.e. the
    # ordinal of the occurrence singled out for that group.
    for slot in eachindex(counts)
        counts[slot] = rand(1:counts[slot])
    end
    return counts
end
1118+
1119+
function _fill_nonunique_randomleave!(res, counts, groups)
    # `counts[g]` acts as a countdown per group: the row at which it reads 1 is
    # flagged `true` in `res`; every other row of that group is flagged `false`.
    # Both `res` and `counts` are mutated in place.
    for row in eachindex(res)
        gid = groups[row]
        remaining = counts[gid]
        res[row] = remaining == 1
        counts[gid] = remaining - 1
    end
    return nothing
end
1125+
1126+
function _nonunique_random_leave(groups, ngroups, nrows)
    # Counter width follows the row count: Int32 unless `nrows` reaches
    # typemax(Int32), in which case Int64 is used.
    T = nrows < typemax(Int32) ? Int32 : Int64
    picks = zeros(T, ngroups)
    # After this call `picks[g]` holds the ordinal of the occurrence singled
    # out for group `g`.
    _nonunique_random_leave_barrier!(picks, groups)
    kept = falses(nrows)
    _fill_nonunique_randomleave!(kept, picks, groups)
    # Rows that were not singled out are the ones reported as `true`.
    return .!kept
end
1134+
11081135
"""
11091136
vcat(dss::AbstractDataset...;
11101137
cols::Union{Symbol, AbstractVector{Symbol},

src/dataset/other.jl

Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -622,35 +622,41 @@ function _unique_none_case(ds::Dataset, cols; mapformats = false)
622622
end
623623

624624
# Modify Dataset
625-
function Base.unique!(ds::Dataset; mapformats = false, keep = :first)
626-
!(keep in (:first, :last, :none, :only)) && throw(ArgumentError( "The `keep` keyword argument must be one of :first, :last, :only, or :none"))
625+
function Base.unique!(ds::AbstractDataset; mapformats = false, keep = :first)
    keep in (:first, :last, :none, :only, :random) || throw(ArgumentError( "The `keep` keyword argument must be one of :first, :last, :only, :none, or :random"))
    # Translate `keep` into the `leave` rule passed to `nonunique`:
    # `:none` maps to `:or`, `:only` maps to `:none`, and `:first`, `:last`
    # and `:random` are forwarded unchanged.
    leave = keep == :none ? :or : (keep == :only ? :none : keep)
    # The rows flagged by `nonunique` are removed from `ds` in place.
    deleteat!(ds, nonunique(ds, mapformats = mapformats, leave = leave))
end
636638
# this is for fixing ambiguity
637-
function Base.unique!(ds::Dataset, cols::AbstractVector; mapformats = false, keep = :first)
638-
!(keep in (:first, :last, :none, :only)) && throw(ArgumentError( "The `keep` keyword argument must be one of :first, :last, :only, or :none"))
639+
function Base.unique!(ds::AbstractDataset, cols::AbstractVector; mapformats = false, keep = :first)
    keep in (:first, :last, :none, :only, :random) || throw(ArgumentError( "The `keep` keyword argument must be one of :first, :last, :only, :none, or :random"))
    # Translate `keep` into the `leave` rule passed to `nonunique`:
    # `:none` maps to `:or`, `:only` maps to `:none`, and `:first`, `:last`
    # and `:random` are forwarded unchanged.
    leave = keep == :none ? :or : (keep == :only ? :none : keep)
    # The rows flagged by `nonunique` (judged on `cols`) are removed from `ds` in place.
    deleteat!(ds, nonunique(ds, cols, mapformats = mapformats, leave = leave))
end
648-
function Base.unique!(ds::Dataset, cols; mapformats = false, keep = :first)
649-
!(keep in (:first, :last, :none, :only)) && throw(ArgumentError( "The `keep` keyword argument must be one of :first, :last, :only, or :none"))
652+
function Base.unique!(ds::AbstractDataset, cols; mapformats = false, keep = :first)
653+
!(keep in (:first, :last, :none, :only, :random)) && throw(ArgumentError( "The `keep` keyword argument must be one of :first, :last, :only, :none, or :random"))
650654
if keep == :none
651655
rowidx = nonunique(ds, cols, mapformats = mapformats, leave = :or)
652656
elseif keep == :only
653657
rowidx = nonunique(ds, cols, mapformats = mapformats, leave = :none)
658+
elseif keep == :random
659+
rowidx = nonunique(ds, cols, mapformats = mapformats, leave = :random)
654660
else
655661
rowidx = nonunique(ds, cols, mapformats = mapformats, leave = keep)
656662
end
@@ -660,23 +666,27 @@ end
660666

661667
# Unique rows of an Dataset.
662668
@inline function Base.unique(ds::AbstractDataset; view::Bool=false, mapformats = false, keep = :first)
    keep in (:first, :last, :none, :only, :random) || throw(ArgumentError( "The `keep` keyword argument must be one of :first, :last, :only, :none, or :random"))
    # `rowidxs` is the mask of rows to retain. For `:none` and `:only` the
    # output of `nonunique` is used directly; for `:first`, `:last` and
    # `:random` it is negated.
    rowidxs = if keep == :none
        nonunique(ds, mapformats = mapformats, leave = :none)
    elseif keep == :only
        nonunique(ds, mapformats = mapformats, leave = :or)
    else
        .!nonunique(ds, mapformats = mapformats, leave = keep)
    end
    # Either wrap the selected rows in a view or materialise them by indexing.
    view && return Base.view(ds, rowidxs, :)
    return ds[rowidxs, :]
end
673681

674682
@inline function Base.unique(ds::AbstractDataset, cols; view::Bool=false, mapformats = false, keep = :first)
675-
!(keep in (:first, :last, :none, :only)) && throw(ArgumentError( "The `keep` keyword argument must be one of :first, :last, :only, or :none"))
683+
!(keep in (:first, :last, :none, :only, :random)) && throw(ArgumentError( "The `keep` keyword argument must be one of :first, :last, :only, :none, or :random"))
676684
if keep == :none
677685
rowidxs = nonunique(ds, cols, mapformats = mapformats, leave = :none)
678686
elseif keep == :only
679687
rowidxs = nonunique(ds, cols, mapformats = mapformats, leave = :or)
688+
elseif keep == :random
689+
rowidxs = .!nonunique(ds, cols, mapformats = mapformats, leave = :random)
680690
else
681691
rowidxs = (!).(nonunique(ds, cols, mapformats = mapformats, leave = keep))
682692
end
@@ -688,10 +698,10 @@ end
688698
unique(ds::AbstractDataset, cols = : ; [mapformats = false, keep = :first, view::Bool=false])
689699
unique!(ds::AbstractDataset, cols = : ; [mapformats = false, keep = :first])
690700
691-
Return a data set containing only the unique occurrence of unique rows in `ds` where `keep` can be one of the following value: `:first`, `:last`, `:none`, or `:only`. The
701+
Return a data set containing only the unique occurrence of unique rows in `ds` where `keep` can be one of the following values: `:first`, `:last`, `:none`, `:only`, or `:random`. The
692702
`keep` keyword argument determines which occurrence of the unique value should be kept, i.e. when `keep = :first` the
693703
first occurrence of the unique value will be kept and when `keep = :last` the last occurrence will be kept. When `keep` is set to `:none`
694-
all duplicates will be dropped from the result, and when `keep` is set to `:only` only duplicated rows are kept.
704+
all duplicates will be dropped from the result, when `keep` is set to `:only` only duplicated rows are kept, and when `keep` is set to `:random` a random occurrence of each duplicated row will be kept.
695705
When `cols` is specified, the unique occurrence is determined by the given combination of values
696706
in selected columns. `cols` can be any column selector.
697707

0 commit comments

Comments
 (0)