@@ -155,6 +155,42 @@ function _fill_ranges_for_dict_join!(ranges, dict, maxprob, _fl, _fr, x_l, x_r,
155
155
end
156
156
157
157
158
# Build, for every row of `dsl`, the range of matching right-hand positions using a
# hash-based grouping of the key columns instead of sorting.
#
# - `onleft` / `onright`: key column indices in `dsl` / `dsr` (paired by position).
# - `mapformats`: two flags; when a flag is true the column format returned by
#   `getformat` is applied to that side's key columns, otherwise `identity` is used.
# - `makeunique`: when false, an `ArgumentError` is thrown if a non-key column of `dsr`
#   shares its name with a column of `dsl`.
# - `Val{T}`: integer type used for the range and offset vectors.
#
# Returns `(ranges, a, gslots, minval, reps, sz, right_cols)` where `a` is the raw
# result of `_gather_groups` and `right_cols` are the non-key column indices of `dsr`.
function _find_ranges_for_join_using_hash(dsl, dsr, onleft, onright, mapformats, makeunique, ::Val{T}) where T
    right_cols = setdiff(1:length(index(dsr)), onright)
    if !makeunique
        clashes = intersect(_names(dsl), _names(dsr)[right_cols])
        isempty(clashes) || throw(ArgumentError(" duplicate column names, pass `makeunique = true` to make them unique using a suffix automatically."))
    end

    # One combined key column per `on` pair (left values and right values together).
    keycols = Any[]
    for j in 1:length(onleft)
        fl = mapformats[1] ? getformat(dsl, onleft[j]) : identity
        fr = mapformats[2] ? getformat(dsr, onright[j]) : identity
        push!(keycols, Cat2Vec(_columns(dsl)[onleft[j]], _columns(dsr)[onright[j]], fl, fr))
    end
    stacked = Dataset(keycols, :auto, copycols = false)
    idtype = nrow(stacked) < typemax(Int32) ? Val(Int32) : Val(Int64)
    a = _gather_groups(stacked, :, idtype, stable = false, mapformats = false)

    # NOTE(review): `a[1]` is indexed as one entry per stacked row (left rows first,
    # then right rows) and `a[3]` is used as the range length of those entries —
    # presumably group ids and the number of groups; confirm against `_gather_groups`.
    nleft = nrow(dsl)
    groups = a[1]
    right_part = view(groups, nleft+1:length(groups))
    reps = _find_counts_for_join(right_part, a[3])
    gslots, minval, sz = _create_dictionary_for_join_int(identity, right_part, reps, 1, a[3], Val(T))

    # Exclusive prefix sums of `reps`: offsets[k]+1 : offsets[k+1] is the slice of
    # `gslots` that belongs to key `k`.
    offsets = Vector{T}(undef, length(reps) + 1)
    offsets[1] = 0
    cumsum!(view(offsets, 2:length(offsets)), reps)

    ranges = Vector{UnitRange{T}}(undef, nleft)
    _find_range_for_join!(ranges, view(groups, 1:nleft), gslots, reps, offsets, 1, sz)
    return ranges, a, gslots, minval, reps, sz, right_cols
end
193
+
158
194
function _join_left_dict (dsl, dsr, ranges, onleft, onright, right_cols, :: Val{T} ; makeunique = makeunique, mapformats = mapformats, check = check ) where T
159
195
_fl = _date_value∘ identity
160
196
_fr = _date_value∘ identity
@@ -231,7 +267,9 @@ function _join_left!_dict(dsl, dsr, ranges, onleft, onright, right_cols, ::Val{T
231
267
end
232
268
233
269
_fill_ranges_for_dict_join! (ranges, dict, maxprob, _fl, _fr, _columns (dsl)[onleft[1 ]], _columns (dsr)[onright[1 ]], sz, type)
234
-
270
+ if ! all (x-> length (x) <= 1 , ranges)
271
+ throw (ArgumentError (" `leftjoin!` can only be used when each observation in left data set matches at most one observation from right data set" ))
272
+ end
235
273
new_ends = map (x -> max (1 , length (x)), ranges)
236
274
cumsum! (new_ends, new_ends)
237
275
total_length = new_ends[end ]
@@ -241,10 +279,10 @@ function _join_left!_dict(dsl, dsr, ranges, onleft, onright, right_cols, ::Val{T
241
279
end
242
280
243
281
for j in 1 : length (right_cols)
244
- _res = allocatecol (_columns (dsr)[right_cols[j]], total_length, addmissing = false )
282
+ _res = allocatecol (_columns (dsr)[right_cols[j]], total_length)
245
283
if DataAPI. refpool (_res) != = nothing
246
- # fill_val = DataAPI.invrefpool(_res)[missing]
247
- _fill_right_cols_table_left! (_res. refs, DataAPI. refarray (_columns (dsr)[right_cols[j]]), ranges, new_ends, total_length, missing )
284
+ fill_val = DataAPI. invrefpool (_res)[missing ]
285
+ _fill_right_cols_table_left! (_res. refs, DataAPI. refarray (_columns (dsr)[right_cols[j]]), ranges, new_ends, total_length, fill_val )
248
286
else
249
287
_fill_right_cols_table_left! (_res, _columns (dsr)[right_cols[j]], ranges, new_ends, total_length, missing )
250
288
end
@@ -376,16 +414,6 @@ function _join_outer_dict(dsl, dsr, ranges, onleft, onright, oncols_left, oncols
376
414
377
415
end
378
416
379
- function _in_use_Set (ldata, rdata, _fl, _fr)
380
- ss = Set (Base. Generator (_fr, rdata));
381
- res = Vector {Bool} (undef, length (ldata))
382
- Threads. @threads for i in 1 : length (res)
383
- res[i] = _fl (ldata[i]) in ss
384
- end
385
- res
386
- end
387
-
388
-
389
417
function _update!_dict (dsl, dsr, ranges, onleft, onright, right_cols, :: Val{T} ; allowmissing = true , mode = :all , mapformats = [true , true ], stable = false , alg = HeapSort) where T
390
418
_fl = _date_value∘ identity
391
419
_fr = _date_value∘ identity
@@ -424,3 +452,128 @@ function _update!_dict(dsl, dsr, ranges, onleft, onright, right_cols, ::Val{T};
424
452
_modified (_attributes (dsl))
425
453
true , dsl
426
454
end
455
+
456
+
457
# Membership test for joins without sorting: returns a `Bool` vector with one entry per
# row of `dsl`, as computed by `_in_use_Set_int` on the grouped key columns.
# An empty `dsl` short-circuits to `Bool[]`.
#
# - `onleft` / `onright`: key column indices in `dsl` / `dsr` (paired by position).
# - `mapformats`: two flags; when a flag is true the column format returned by
#   `getformat` is applied to that side's key columns, otherwise `identity` is used.
function _in_hash(dsl::AbstractDataset, dsr::AbstractDataset, ::Val{T}; onleft, onright, mapformats = [true, true]) where T
    isempty(dsl) && return Bool[]

    # One combined key column per `on` pair (left values and right values together).
    keycols = Any[]
    for j in 1:length(onleft)
        fl = mapformats[1] ? getformat(dsl, onleft[j]) : identity
        fr = mapformats[2] ? getformat(dsr, onright[j]) : identity
        push!(keycols, Cat2Vec(_columns(dsl)[onleft[j]], _columns(dsr)[onright[j]], fl, fr))
    end
    stacked = Dataset(keycols, :auto, copycols = false)
    idtype = nrow(stacked) < typemax(Int32) ? Val(Int32) : Val(Int64)
    a = _gather_groups(stacked, :, idtype, stable = false, mapformats = false)

    # NOTE(review): `a[1]` is split into a left part (first `nrow(dsl)` entries) and a
    # right part (the rest); presumably these are group ids in 1:a[3] — confirm against
    # `_gather_groups`.
    nleft = nrow(dsl)
    groups = a[1]
    return _in_use_Set_int(view(groups, 1:nleft), view(groups, nleft+1:length(groups)), 1, a[3])
end
482
+
483
# Build a bit-set over the integer keys in `v`.
#
# `f` is applied to every element of `v`. A non-missing result `k` marks slot
# `k - minval + 1`; a `missing` result marks the extra last slot. The set therefore has
# `rangelen + 1` slots.
#
# Returns `(gslots, minval, sz)` with `sz == rangelen + 1`.
# Slot indices are not bounds-checked (`@inbounds`), so every non-missing `f(v[i])`
# must lie in `minval : minval + rangelen - 1`.
function _create_Set_for_join_int(f, v, minval, rangelen)
    sz = rangelen + 1
    gslots = falses(sz)
    shift = 1 - minval
    @inbounds for x in v
        key = f(x)
        gslots[ismissing(key) ? sz : key + shift] = true
    end
    return gslots, minval, sz
end
502
+
503
# Look up a single key in the bit-set produced by `_create_Set_for_join_int`.
#
# - `f`: unused; kept so the signature stays unchanged for existing callers.
# - `fv`: the key to look up (an integer or `missing`).
# - `gslots`: the slot vector; slots `1:sz-1` hold ordinary keys and slot `sz` holds
#   the flag for `missing`.
# - `minval`: the key value mapped to slot 1.
# - `sz`: number of slots.
#
# Returns the stored flag for the key, or `false` when the key falls outside the
# ordinary slots. Every path now returns the flag type (`Bool` for a `BitVector`);
# the out-of-range path previously returned the integer `0`.
function _query_Set_for_join_int(f, fv, gslots, minval, sz)
    if ismissing(fv)
        # `missing` is stored in the last slot.
        return sz >= 1 ? gslots[sz] : false
    end
    slotix = fv + (1 - minval)
    # A non-missing key must never land on the slot reserved for `missing`.
    return slotix in 1:sz-1 ? gslots[slotix] : false
end
518
+
519
# Function barrier for the threaded lookup loop: fills `res[k]` with the result of
# querying `ldata[k]` in the bit-set `gslots`. Each iteration writes only its own slot
# of `res`. Returns `nothing`.
function _in_use_Set_int_barrier!(res, ldata, gslots, minval, sz)
    Threads.@threads for k in eachindex(res)
        res[k] = _query_Set_for_join_int(identity, ldata[k], gslots, minval, sz)
    end
    return nothing
end
524
+
525
# For every element of `ldata`, report whether it occurs in `rdata`.
#
# A bit-set is first built from `rdata` (keys start at `minval` and span `rangelen`
# values), then each element of `ldata` is looked up in it.
# Returns a `Vector{Bool}` with `length(ldata)` entries.
function _in_use_Set_int(ldata, rdata, minval, rangelen)
    slots, lo, nslots = _create_Set_for_join_int(identity, rdata, minval, rangelen)
    found = Vector{Bool}(undef, length(ldata))
    _in_use_Set_int_barrier!(found, ldata, slots, lo, nslots)
    return found
end
531
+
532
# Bucket the positions of `v` by key (a counting-sort style layout).
#
# - `f` is applied to each element of `v`.
# - `v` holds integers whose minimum is `minval` and whose range length is `rangelen`;
#   it must not contain `missing`.
# - `reps[k]` is the number of times the k-th key appears in `v`.
# - `Val{T}` selects the integer type of the output.
#
# Returns `(gslots, minval, rangelen)`. `gslots` lists the positions `1:length(v)`
# grouped by key: the positions of key `k` occupy `gslots[c[k]+1 : c[k+1]]`, in order of
# appearance, where `c` is the exclusive prefix sum of `reps`.
# Slot indices are not bounds-checked (`@inbounds`).
function _create_dictionary_for_join_int(f, v, reps, minval, rangelen, ::Val{T}) where T
    shift = 1 - minval
    # cursor[k] starts as the number of entries that precede key k and is advanced
    # as positions for key k are written.
    cursor = Vector{T}(undef, length(reps) + 1)
    cursor[1] = 0
    cumsum!(view(cursor, 2:length(cursor)), reps)
    gslots = zeros(T, cursor[end])
    @inbounds for (i, x) in enumerate(v)
        slot = f(x) + shift
        cursor[slot] += 1
        gslots[cursor[slot]] = i
    end
    return gslots, minval, rangelen
end
552
# Return the range of `gslots` positions that belong to key `v`, or the empty range
# `1:0` when `v` is outside `1:sz` (after shifting by `minval`) or has no entries.
#
# `v` must not be `missing`. `f` and `gslots` are not used by the lookup itself.
# `where` is the exclusive prefix sum of `reps` and can be defined as:
#   where = Vector{T}(undef, length(reps)+1)
#   cumsum!(view(where, 2:length(where)), reps)
#   where[1] = 0
function _query_dictionary_for_join_int(f, v, gslots, reps, where, minval, sz)
    slotix = v + (1 - minval)
    if (slotix in 1:sz) && reps[slotix] != 0
        return where[slotix]+1:where[slotix+1]
    end
    return 1:0
end
567
+
568
+
569
+
570
+
571
# For every element of `ldata`, report whether `_fl(element)` occurs among the values
# `_fr(x)` for `x` in `rdata`.
#
# The right-hand values are collected into a `Set` once; the lookups run in a threaded
# loop where each iteration writes only its own slot of the result.
# Returns a `Vector{Bool}` with `length(ldata)` entries.
function _in_use_Set(ldata, rdata, _fl, _fr)
    lookup = Set(_fr(x) for x in rdata)
    hits = Vector{Bool}(undef, length(ldata))
    Threads.@threads for k in eachindex(hits)
        hits[k] = _fl(ldata[k]) in lookup
    end
    return hits
end
0 commit comments