improve performance

sl-solution · sl-solution · commit 820bcbe2b130 · 2022-03-20T10:17:47.000+13:00
diff --git a/src/byrow/byrow.jl b/src/byrow/byrow.jl
@@ -140,8 +140,15 @@ byrow(ds::AbstractDataset, ::typeof(stdze), cols::MultiColumnIndex = names(ds, U
 
 byrow(ds::AbstractDataset, ::typeof(stdze!), cols::MultiColumnIndex = names(ds, Union{Missing, Number}); threads = true) = row_stdze!(ds, cols, threads = threads)
 
-byrow(ds::AbstractDataset, ::typeof(hash), cols::MultiColumnIndex = :; by = identity, threads = nrow(ds) > __NCORES*10) = row_hash(ds, by, cols, threads = threads)
-byrow(ds::AbstractDataset, ::typeof(hash), col::ColumnIndex; by = identity, threads = nrow(ds) > __NCORES*10) = byrow(ds, hash, [col]; by = by, threads = threads)
+# Row-wise hash of the selected columns of `ds`.
+# `by` is forwarded to `row_hash`; when `mapformats = true` it is first combined, for each
+# selected column, with that column's format through `expand_Base_Fix`.
+function byrow(ds::AbstractDataset, ::typeof(hash), cols::MultiColumnIndex = :; by = identity, mapformats = false, threads = nrow(ds) > __NCORES*10)
+	mapformats || return row_hash(ds, by, cols, threads = threads)
+	colsidx = multiple_getindex(index(ds), cols)
+	# bind the per-column functions to a new name: reassigning `by` while the closure
+	# below captures it would force Julia to box the variable
+	by_fmt = map(y -> expand_Base_Fix(by, getformat(ds, y)), colsidx)
+	return row_hash(ds, by_fmt, cols, threads = threads)
+end
+
+# Single-column convenience method: wraps `col` in a vector and forwards to the multi-column method.
+byrow(ds::AbstractDataset, ::typeof(hash), col::ColumnIndex; by = identity, mapformats = false, threads = nrow(ds) > __NCORES*10) = byrow(ds, hash, [col]; by = by, mapformats = mapformats, threads = threads)
 
 byrow(ds::AbstractDataset, ::typeof(join), col::MultiColumnIndex; threads = nrow(ds) > __NCORES*10, delim = "", last = "") = row_join(ds, col, threads = threads, delim = delim, last = last)
 
diff --git a/src/byrow/row_functions.jl b/src/byrow/row_functions.jl
@@ -1096,19 +1096,33 @@ Base.@propagate_inbounds function _op_for_hash!(x, y, f, lo, hi)
     x
 end
 
-function row_hash(ds::AbstractDataset, f::Function, cols = :; threads = true)
+# Hash the rows of `ds` over the selected `cols`; returns a `Vector{UInt64}` with one entry per row.
+# `f` is either a single function used for every selected column, or a vector holding one
+# function per selected column (same order as the selection); it is handed to `_op_for_hash!`.
+# Throws `ArgumentError` when a vector `f` does not have one entry per selected column.
+function row_hash(ds::AbstractDataset, f::Union{AbstractVector{<:Function}, Function}, cols = :; threads = true)
     colsidx = multiple_getindex(index(ds), cols)
     init0 = zeros(UInt64, nrow(ds))
 
+    multi_f = false
+    if f isa AbstractVector
+        # explicit check instead of `@assert`: this validates caller input and must never be compiled out
+        length(f) == length(colsidx) || throw(ArgumentError("number of provided functions must match the number of selected columns"))
+        multi_f = true
+    end
+
     if threads
         cz = div(length(init0), __NCORES)
         Threads.@threads for i in 1:__NCORES
             lo = (i-1)*cz+1
             i == __NCORES ? hi = length(init0) : hi = i*cz
-            mapreduce(identity, (x,y) -> _op_for_hash!(x, y, f, lo, hi), view(_columns(ds),colsidx), init = init0)
+            # iteration `i` works on rows lo:hi of the shared `init0`; the last one also takes the remainder
+            if multi_f
+                mapreduce_index(f, (x, y, func) -> _op_for_hash!(x, y, func, lo, hi), view(_columns(ds),colsidx), init0)
+            else
+                mapreduce(identity, (x,y) -> _op_for_hash!(x, y, f, lo, hi), view(_columns(ds),colsidx), init = init0)
+            end
         end
     else
-        mapreduce(identity, (x,y) -> _op_for_hash!(x, y, f, 1, length(x)), view(_columns(ds),colsidx), init = init0)
+        if multi_f
+            mapreduce_index(f, (x, y, func) -> _op_for_hash!(x, y, func, 1, length(x)), view(_columns(ds),colsidx), init0)
+        else
+            mapreduce(identity, (x,y) -> _op_for_hash!(x, y, f, 1, length(x)), view(_columns(ds),colsidx), init = init0)
+        end
     end
     init0
 end
diff --git a/src/other/utils.jl b/src/other/utils.jl
@@ -409,6 +409,15 @@ end
 function _gather_groups(ds, cols, ::Val{T}; mapformats = false, stable = true, threads = true) where T
     colidx = index(ds)[cols]
     _max_level = nrow(ds)
+
+
+	if nrow(ds) > 2^23 && !stable && 5<length(colidx)<16 # the result is stable anyway
+		if !mapformats || all(==(identity), getformat.(Ref(ds), colidx))
+			return _gather_groups_hugeds_multicols(ds, cols, Val(T); threads = threads)
+		end
+	end
+
+
     prev_max_group = UInt(1)
     prev_groups = ones(T, nrow(ds))
     groups = T[]
@@ -510,6 +519,64 @@ function _find_groups_with_more_than_one_observation_barrier!(res, groups, seen_
     nothing
 end
 
+### Special path for huge ds and multiple cols - trade off between compilation and performance
+# table columns are passed as a tuple of vectors to ensure type specialization - From DataFrames.jl
+# Compare row `r1` with row `r2` across every column of the tuple `cols` using `isequal`.
+# The single-column methods end the recursion; the Vararg methods peel off the first
+# column and recurse on `Base.tail`.
+function isequal_row(cols::Tuple{AbstractVector}, r1::Int, r2::Int)
+    col = cols[1]
+    return isequal(col[r1], col[r2])
+end
+function isequal_row(cols::Tuple{Vararg{AbstractVector}}, r1::Int, r2::Int)
+    col = cols[1]
+    isequal(col[r1], col[r2]) || return false
+    return isequal_row(Base.tail(cols), r1, r2)
+end
+
+# Same comparison, but row `r1` is read from `cols1` and row `r2` from `cols2`.
+function isequal_row(cols1::Tuple{AbstractVector}, r1::Int, cols2::Tuple{AbstractVector}, r2::Int)
+    left, right = cols1[1], cols2[1]
+    return isequal(left[r1], right[r2])
+end
+function isequal_row(cols1::Tuple{Vararg{AbstractVector}}, r1::Int,
+                     cols2::Tuple{Vararg{AbstractVector}}, r2::Int)
+    left, right = cols1[1], cols2[1]
+    isequal(left[r1], right[r2]) || return false
+    return isequal_row(Base.tail(cols1), r1, Base.tail(cols2), r2)
+end
+
+
+# Return `DataAPI.refarray(x)` when `x` exposes a reference pool, otherwise `x` itself.
+# `=== nothing` is used because `==` may be overloaded by the pool type; identity is the intended test.
+_grabrefs(x) = DataAPI.refpool(x) === nothing ? x : DataAPI.refarray(x)
+# Grouping path for huge datasets with several columns: hash every row over `cols` with
+# `byrow(ds, hash, ...)`, collect the selected columns (through `_grabrefs`) into a tuple,
+# and pass both to `create_dict_hugeds_multicols`.
+function _gather_groups_hugeds_multicols(ds, cols, ::Val{T}; threads = true) where T
+	selected = index(ds)[cols]
+	row_hashes = byrow(ds, hash, cols, threads = threads)
+	keycols = ntuple(length(selected)) do k
+		_grabrefs(_columns(ds)[selected[k]])
+	end
+	return create_dict_hugeds_multicols(keycols, row_hashes, Val(T))
+end
+
+# Assign a group id to every row. Two rows share an id when their precomputed hashes in
+# `rhashes` are equal and `isequal_row` reports them equal across all columns in `colvals`.
+# Returns `(groups, gslots, ngroups)`:
+#   groups[i] - id of row `i`; ids are handed out as 1, 2, ... in order of first appearance
+#   gslots    - slot table of element type `T`; each used slot stores the row index that
+#               opened the group, 0 marks an empty slot
+#   ngroups   - number of distinct groups found
+function create_dict_hugeds_multicols(colvals, rhashes, ::Val{T}) where T
+	# table size: more than 1.25x the number of rows (minimum 16) ...
+	sz = max(1 + ((5 * length(rhashes)) >> 2), 16)
+    # ... rounded up to a power of two so `& szm1` can be used to wrap slot indices
+    sz = 1 << (8 * sizeof(sz) - leading_zeros(sz - 1))
+    @assert 4 * sz >= 5 * length(rhashes)
+    szm1 = sz-1
+    gslots = zeros(T, sz)
+	groups = Vector{T}(undef, length(rhashes))
+    ngroups = 0
+    @inbounds for i in eachindex(rhashes)
+        # find the slot and group index for a row
+        slotix = rhashes[i] & szm1 + 1
+        gix = -1
+        probe = 0
+        while true
+            g_row = gslots[slotix]
+            if g_row == 0 # unoccupied slot, current row starts a new group
+                gslots[slotix] = i
+                gix = ngroups += 1
+                break
+            elseif rhashes[i] == rhashes[g_row] # occupied slot, check if miss or hit
+                # equal hashes are not proof of equal rows: compare the actual values
+                if isequal_row(colvals, i, Int(g_row)) # hit
+                    gix = groups[g_row]
+                    break
+                end
+            end
+            # slot indices live in 1:sz, so this steps to the following slot and wraps sz -> 1
+            slotix = slotix & szm1 + 1 # check the next slot
+            probe += 1
+            # the table always has free slots, so a full sweep would indicate a logic error
+            @assert probe < sz
+        end
+        groups[i] = gix
+    end
+    return groups, gslots, ngroups
+end
+
+
 function _gather_groups_old_version(ds, cols, ::Val{T}; mapformats = false) where T
     colidx = index(ds)[cols]
     _max_level = nrow(ds)
diff --git a/src/precompile/warmup.jl b/src/precompile/warmup.jl
@@ -34,6 +34,7 @@ function warmup()
     combine(gatherby(ds,1), Ref([1,2,3,7,8]) .=> [median, sort])
     combine(gatherby(ds,1), Ref([1,2,3,7,8]) .=> [sum, mean, length, maximum, minimum, var, std])
     combine(gatherby(ds,1), r"x1$" .=> [sum, mean, length, maximum, minimum, var, std])
+    IMD._gather_groups_hugeds_multicols(ds, 1:6, Val(Int32), threads = true)
 
     ds2 = ds[1:2, [1,3,7]]
     innerjoin(ds, ds2, on = [:x1, :x3, :x7])
diff --git a/src/sort/gatherby.jl b/src/sort/gatherby.jl
@@ -104,13 +104,27 @@ function gatherby(ds::AbstractDataset, cols::MultiColumnIndex; mapformats::Bool
 			return GatherBy(ds, colsidx, 1:nrow(ds), nrow(ds), mapformats, b[1], 1:nrow(ds))
 		else
 			a = _gather_groups(ds, colsidx, Val(T), mapformats = mapformats, stable = stable, threads = threads)
-    		return GatherBy(ds, colsidx, a[1], a[3], mapformats, nothing, nothing)
+			return GatherBy(ds, colsidx, a[1], a[3], mapformats, nothing, nothing)
 		end
 	end
 end
 gatherby(ds::AbstractDataset, col::ColumnIndex; mapformats = true, stable = true, isgathered = false, eachrow = false, threads = true) = gatherby(ds, [col], mapformats = mapformats, stable = stable, isgathered = isgathered, eachrow = eachrow, threads = threads)
 
 
+# Format that keeps only the low 10 bits of `x`; `hm_gatherby` installs it on the copied hash column.
+function __SPFRMT(x)
+	return x & 1023
+end
+# Missing values pass through unchanged (not needed).
+function __SPFRMT(::Missing)
+	return missing
+end
+
+# currently not used in gatherby
+# use sort and format trick for fast gatherby - hm stands for high memory footprint
+# Gather the rows of `ds` by `cols` through a row hash plus `groupby` on that hash
+# (hm = high memory footprint). Two temporary columns are added to `ds` while the groups
+# are computed; they are removed again before returning, also when the grouping step throws.
+function hm_gatherby(ds::AbstractDataset, cols::MultiColumnIndex; mapformats = false, threads = true)
+	modify!(ds, cols=>byrow(hash; threads = threads, mapformats = mapformats)=>:___tmp___cols8934, :___tmp___cols8934=>identity=>:___tmp___cols8934_2)
+	# `try` opens its own scope, so declare the results at function level first
+	local gds, grpcols, ranges, last_valid_index
+	try
+		setformat!(ds, :___tmp___cols8934_2=>__SPFRMT)
+		gds = groupby(ds, [:___tmp___cols8934_2, :___tmp___cols8934], stable = false, threads = threads)
+		grpcols, ranges, last_valid_index = _find_starts_of_groups(view(ds, gds.perm, cols), cols, nrow(ds) < typemax(Int32) ? Val(Int32) : Val(Int64); mapformats = mapformats, threads = threads)
+	finally
+		# always drop the temporary columns so a failure does not leave them in the caller's dataset
+		select!(ds, Not([:___tmp___cols8934, :___tmp___cols8934_2]))
+	end
+	GatherBy(ds, grpcols, nothing, last_valid_index, mapformats, gds.perm, ranges)
+end
+
 function _fill_mapreduce_col!(x, f, op, y, loc)
     @inbounds for i in 1:length(y)
         x[loc[i]] = op(x[loc[i]], f(y[i]))