sl-solution
diff --git a/‎docs/src/man/gallery.md‎
Lines changed: 10 additions & 2 deletions b/‎docs/src/man/gallery.md‎
Lines changed: 10 additions & 2 deletions
diff --git a/‎docs/src/man/joins.md‎
Lines changed: 67 additions & 0 deletions b/‎docs/src/man/joins.md‎
Lines changed: 67 additions & 0 deletions
diff --git a/‎src/InMemoryDatasets.jl‎
Lines changed: 1 addition & 0 deletions b/‎src/InMemoryDatasets.jl‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/dataset/other.jl‎
Lines changed: 0 additions & 101 deletions b/‎src/dataset/other.jl‎
Lines changed: 0 additions & 101 deletions
diff --git a/‎src/join/closejoin.jl‎
Lines changed: 11 additions & 2 deletions b/‎src/join/closejoin.jl‎
Lines changed: 11 additions & 2 deletions
diff --git a/‎src/join/compare.jl‎
Lines changed: 64 additions & 0 deletions b/‎src/join/compare.jl‎
Lines changed: 64 additions & 0 deletions
@@ -7,7 +7,15 @@ This gallery contains some random questions about data manipulation that we foun
 * [Tally across columns with variable condition](https://stackoverflow.com/questions/70501316/tally-across-columns-with-variable-condition-in-r) : I am trying to tally across columns of a data frame with values that exceed a corresponding limit variable.
 
 ```julia
-julia> ds
+julia> ds = Dataset([[1.66077, -1.05298, -0.499206, 2.47123, 2.45914, 1.14014],
+                     [0.75, 0.75, 0.75, 0.75, 0.75, 0.75],
+                     [0.709184, -2.53609, 0.0130659, -0.587867, 0.55786, 1.60398],
+                     [0.333, 0.333, 0.333, 0.333, 0.333, 0.333],
+                     [1.47438, 2.01485, 2.49006, 1.80345, 0.569928, 1.58403],
+                     [1, 1, 1, 1, 1, 1],
+                     [2.02678, 1.51587, 1.70535, 2.51628, 1.909, 0.794765],
+                     [1.25, 1.25, 1.25, 1.25, 1.25, 1.25]],
+                     ["a", "a_lim", "b", "b_lim", "c", "c_lim", "d", "d_lim"])
 6×8 Dataset
  Row │ a          a_lim     b           b_lim     c         c_lim     d         d_lim    
      │ identity   identity  identity    identity  identity  identity  identity  identity
@@ -22,7 +30,7 @@ julia> ds
 
 julia> using Chain
 julia> @chain ds begin
-         compare(_[!, r"lim"], _[!, Not(r"lim")], on = 1:4 .=> 1:4), eq = isless)
+         compare(_[!, r"lim"], _[!, Not(r"lim")], cols = 1:4 .=> 1:4, eq = isless)
          byrow(count)
        end
 6-element Vector{Int32}:
 
@@ -531,3 +531,70 @@ julia> update(main, transaction, on = [:group, :id],
    6 │ G2               1        2.1   missing
    7 │ G2               2        0.0         2
 ```
+
+## `compare`
+
+The `compare` function compares two data sets. When the columns that need to be compared are specified via the `cols` keyword argument, `compare` compares the corresponding values in each row by calling `eq` on the actual or formatted values. By default, `compare` compares two values via the `isequal` function; however, users may pass any function (that returns `true`/`false`) via the `eq` keyword argument. When the two data sets have different numbers of rows, `compare` fills the unmatched rows of the output data set with `missing`. Users can pass key columns to compare matched pairs of observations. The key columns can be passed via the `on` keyword argument. The `compare` function uses `outerjoin` to find the corresponding matches, which also means that `compare` accepts the keyword arguments of `outerjoin`.
+
+> To pass the `mapformats` keyword argument to `outerjoin` in `compare`, use the `on_mapformats` keyword argument, since the `mapformats` keyword argument in `compare` controls whether observations are compared based on their actual values or their formatted values.
+
+By default, the output data set contains the observation ids when users pass the `on` keyword argument. When an observation exists in only one of the passed data sets, its observation id for the other data set is `missing`.
+
+### Examples
+
+```jldoctest
+julia> old = Dataset(Insurance_Id=[1,2,3,5],Business_Id=[10,20,30,50],
+                     Amount=[100,200,300,missing],
+                     Account_Id=["x1","x10","x5","x5"])
+4×4 Dataset
+ Row │ Insurance_Id  Business_Id  Amount    Account_Id
+     │ identity      identity     identity  identity   
+     │ Int64?        Int64?       Int64?    String?    
+─────┼─────────────────────────────────────────────────
+   1 │            1           10       100  x1
+   2 │            2           20       200  x10
+   3 │            3           30       300  x5
+   4 │            5           50   missing  x5
+
+julia> new = Dataset(Ins_Id=[1,3,2,4,3,2],
+                     B_Id=[10,40,30,40,30,20],
+                     AMT=[100,200,missing,-500,350,700],
+                     Ac_Id=["x1","x1","x10","x10","x7","x5"])
+6×4 Dataset
+ Row │ Ins_Id    B_Id      AMT       Ac_Id    
+     │ identity  identity  identity  identity
+     │ Int64?    Int64?    Int64?    String?  
+─────┼────────────────────────────────────────
+   1 │        1        10       100  x1
+   2 │        3        40       200  x1
+   3 │        2        30   missing  x10
+   4 │        4        40      -500  x10
+   5 │        3        30       350  x7
+   6 │        2        20       700  x5
+
+julia> eq_fun(x::Number, y::Number) = abs(x - y) <= 50
+eq_fun (generic function with 1 method)
+
+julia> eq_fun(x::AbstractString, y::AbstractString) = isequal(x,y)
+eq_fun (generic function with 2 methods)
+
+julia> eq_fun(x,y) = missing
+eq_fun (generic function with 3 methods)
+
+julia> compare(old, new,
+                  on = [1=>1,2=>2],
+                  cols = [:Amount=>:AMT, :Account_Id=>:Ac_Id],
+                  eq = eq_fun)
+7×6 Dataset
+ Row │ Insurance_Id  Business_Id  obs_id_left  obs_id_right  Amount=>AMT  Account_Id=>Ac_Id
+     │ identity      identity     identity     identity      identity     identity          
+     │ Int64?        Int64?       Int32?       Int32?        Bool?        Bool?             
+─────┼──────────────────────────────────────────────────────────────────────────────────────
+   1 │            1           10            1             1         true               true
+   2 │            2           20            2             6        false              false
+   3 │            3           30            3             5         true              false
+   4 │            5           50            4       missing      missing            missing
+   5 │            2           30      missing             3      missing            missing
+   6 │            3           40      missing             2      missing            missing
+   7 │            4           40      missing             4      missing            missing
+```
@@ -174,6 +174,7 @@ include("join/join.jl")
 include("join/join_dict.jl")
 include("join/closejoin.jl")
 include("join/update.jl")
+include("join/compare.jl")
 include("join/main.jl")
 
 include("abstractdataset/iteration.jl")
 
@@ -889,107 +889,6 @@ function dropmissing!(ds::Dataset,
     ds
 end
 
-function _compare_barrier_function_threaded!(_res, xl, xr, fl, fr, eq_fun)
-    Threads.@threads for i in 1:length(xl)
-        _res[i] = eq_fun(fl(xl[i]), fr(xr[i]))
-    end
-    _res
-end
-function _compare_barrier_function!(_res, xl, xr, fl, fr, eq_fun)
-    for i in 1:length(xl)
-        _res[i] = eq_fun(fl(xl[i]), fr(xr[i]))
-    end
-    _res
-end
-
-
-"""
-    compare(ds1::AbstractDataset, ds2::AbstractDataset; [on = nothing, eq = isequal, mapformats = false, threads = true])
-
-Compare values of two data sets column by column. It returns a boolean data set which is the result of calling  `eq` on each value of
-corresponding columns. The `on` keyword can be used to specifiy the pair of columns which is needed to be compared. The `mapformats` keyword
-controls whether the actual values or the formatted values should be compared.
-
-```julia
-julia> ds1 = Dataset(x = 1:9, y = 9:-1:1);
-julia> ds2 = Dataset(x = 1:9, y2 = 9:-1:1, y3 = 1:9);
-julia> compare(ds1, ds2, on = [:x=>:x, :y=>:y2])
-9×2 Dataset
- Row │ x=>x      y=>y2
-     │ identity  identity
-     │ Bool?     Bool?
-─────┼────────────────────
-   1 │     true      true
-   2 │     true      true
-   3 │     true      true
-   4 │     true      true
-   5 │     true      true
-   6 │     true      true
-   7 │     true      true
-   8 │     true      true
-   9 │     true      true
-
-julia> compare(ds1, ds2, on = [:x=>:x, :y=>:y3])
-9×2 Dataset
- Row │ x=>x      y=>y3
-     │ identity  identity
-     │ Bool?     Bool?
-─────┼────────────────────
-   1 │     true     false
-   2 │     true     false
-   3 │     true     false
-   4 │     true     false
-   5 │     true      true
-   6 │     true     false
-   7 │     true     false
-   8 │     true     false
-   9 │     true     false
-
-```
-"""
-function compare(ds1::AbstractDataset, ds2::AbstractDataset; on = nothing, eq = isequal, mapformats = false, threads = true)
-    if !(mapformats isa AbstractVector)
-        mapformats = repeat([mapformats], 2)
-    else
-        length(mapformats) !== 2 && throw(ArgumentError("`mapformats` must be a Bool or a vector of Bool with size two"))
-    end
-    if on === nothing
-        left_col_idx = 1:ncol(ds1)
-        right_col_idx = index(ds2)[names(ds1)]
-    elseif typeof(on) <: AbstractVector{<:Union{AbstractString, Symbol}}
-        left_col_idx = index(ds1)[on]
-        right_col_idx = index(ds2)[names(ds1)[left_col_idx]]
-    elseif (typeof(on) <: AbstractVector{<:Pair{<:ColumnIndex, <:ColumnIndex}}) || (typeof(on) <: AbstractVector{<:Pair{<:AbstractString, <:AbstractString}})
-        left_col_idx = index(ds1)[map(x->x.first, on)]
-        right_col_idx = index(ds2)[map(x->x.second, on)]
-    else
-        throw(ArgumentError("`on` keyword must be a vector of column names or a vector of pairs of column names"))
-    end
-
-    nrow(ds1) != nrow(ds2) && throw(ArgumentError("the number of rows for both data sets should be the same"))
-    res = Dataset()
-    for j in 1:length(left_col_idx)
-        _res = allocatecol(Union{Bool, Missing}, nrow(ds1))
-        fl = identity
-        if mapformats[1]
-            fl = getformat(ds1, left_col_idx[j])
-        end
-        fr = identity
-        if mapformats[2]
-            fr = getformat(ds2, right_col_idx[j])
-        end
-        if threads
-            _compare_barrier_function_threaded!(_res, _columns(ds1)[left_col_idx[j]], _columns(ds2)[right_col_idx[j]], fl, fr, eq)
-        else
-            _compare_barrier_function!(_res, _columns(ds1)[left_col_idx[j]], _columns(ds2)[right_col_idx[j]], fl, fr, eq)
-        end
-        push!(_columns(res), _res)
-        push!(index(res),  Symbol(names(ds1)[left_col_idx[j]]* "=>" * names(ds2)[right_col_idx[j]]))
-    end
-    res
-end
-
-
 
 """
     describe(ds::AbstractDataset; cols=:, threads = true)
 
@@ -308,7 +308,7 @@ end
 
 
 # border = :nearest | :missing | :none
-function _join_closejoin(dsl, dsr::AbstractDataset, ::Val{T}; onleft, onright, makeunique = false, border = :nearest, mapformats = [true, true], stable = false, alg = HeapSort, accelerate = false, direction = :backward, inplace = false, tol = nothing,  allow_exact_match = true, op = nothing, method = :sort, threads = true) where T
+function _join_closejoin(dsl, dsr::AbstractDataset, ::Val{T}; onleft, onright, makeunique = false, border = :nearest, mapformats = [true, true], stable = false, alg = HeapSort, accelerate = false, direction = :backward, inplace = false, tol = nothing,  allow_exact_match = true, op = nothing, method = :sort, threads = true, obs_id = false, obs_id_name = :obs_id) where T
     isempty(dsl) && return copy(dsl)
     if !allow_exact_match
         #aem is the function to check allow_exact_match
@@ -380,7 +380,16 @@ function _join_closejoin(dsl, dsr::AbstractDataset, ::Val{T}; onleft, onright, m
         push!(index(newds), new_var_name)
         setformat!(newds, index(newds)[new_var_name], getformat(dsr, _names(dsr)[right_cols[j]]))
     end
-
+    if obs_id
+        obs_id_name1 = Symbol(obs_id_name, "_left")
+        obs_id_name2 = Symbol(obs_id_name, "_right")
+        obs_id_left = allocatecol(T, total_length)
+        obs_id_right = allocatecol(T, total_length)
+        obs_id_left .= 1:nrow(dsl)
+        _fill_right_cols_table_close!(obs_id_right, idx, ranges, total_length, border, missing, direction; nn = direction == :nearest, rnn = view(_columns(dsr)[oncols_right[end]], idx), lnn = _columns(dsl)[oncols_left[end]], tol = tol, aem = aem, op = op, threads = threads)
+        insertcols!(newds, ncol(newds)+1, obs_id_name1 => obs_id_left, unsupported_copy_cols = false)
+        insertcols!(newds, ncol(newds)+1, obs_id_name2 => obs_id_right, unsupported_copy_cols = false)
+    end
     if inplace
         _modified(_attributes(newds))
     end
 
@@ -0,0 +1,64 @@
+# Store every index of `r` at that same position of `x` (i.e. `x[i] = i` for each `i` in `r`).
+# Entries of `x` whose positions are not in `r` are left untouched. Returns `nothing`.
+function _fill_index_compare!(x, r)
+    @simd for pos in r
+        x[pos] = pos
+    end
+    return nothing
+end
+# Compare columns `cols_left` of `dsl` with columns `cols_right` of `dsr`, pair by pair.
+#
+# Row matching:
+#   * `onleft === nothing`: rows are matched by position; the result has
+#     `max(nrow(dsl), nrow(dsr))` rows and the shorter side gets `missing` observation ids.
+#   * otherwise: rows are matched through `outerjoin` on `onleft .=> onright`; `check`,
+#     `on_mapformats` (passed as the join's `mapformats`), `stable`, `alg`, `accelerate`,
+#     `method`, `threads`, `multiple_match` and `multiple_match_name` are forwarded to it.
+#
+# `mapformats` (a Bool, or a two-element vector for left/right) selects whether `eq` is called
+# on the raw values or on the values passed through each column's format.
+# One result column named "<left>=><right>" is appended per compared pair. The observation id
+# columns (`obs_id_name` suffixed with "_left"/"_right") are removed when `drop_obs_id` is true.
+#
+# Throws `ArgumentError` when `mapformats` is a vector whose length is not two.
+function _compare(dsl, dsr, ::Val{T}; onleft, onright, cols_left, cols_right, check = true, mapformats = false, on_mapformats = [true, true], stable = false, alg = HeapSort, accelerate = false, method = :sort, threads = true, eq = isequal, obs_id_name = :obs_id, multiple_match = false, multiple_match_name = :multiple, drop_obs_id = true) where T
+    # Look the column names up once; they are only used to label the output columns.
+    left_names = names(dsl)
+    right_names = names(dsr)
+    if !(mapformats isa AbstractVector)
+        mapformats = repeat([mapformats], 2)
+    else
+        length(mapformats) !== 2 && throw(ArgumentError("`mapformats` must be a Bool or a vector of Bool with size two"))
+    end
+    left_id_name = Symbol(obs_id_name, "_left")
+    right_id_name = Symbol(obs_id_name, "_right")
+
+    if onleft === nothing
+        # No key columns: pair rows by position and pad the shorter side with `missing` ids.
+        n_dsl = nrow(dsl)
+        n_dsr = nrow(dsr)
+        total_length = max(n_dsl, n_dsr)
+        obs_id_left = _missings(T, total_length)
+        obs_id_right = _missings(T, total_length)
+        _fill_index_compare!(obs_id_left, 1:n_dsl)
+        _fill_index_compare!(obs_id_right, 1:n_dsr)
+        res = Dataset(x1=obs_id_left, x2=obs_id_right, copycols = false)
+        rename!(res, :x1=>left_id_name, :x2=>right_id_name)
+    else
+        # Key columns given: let `outerjoin` find the matches and report the row ids of both sides.
+        res = outerjoin(dsl[!, onleft], dsr[!, onright], on = onleft .=> onright, check = check, mapformats = on_mapformats, stable = stable, alg = alg, accelerate = accelerate, method = method, threads = threads, obs_id = true, obs_id_name = obs_id_name, multiple_match = multiple_match, multiple_match_name = multiple_match_name)
+        total_length = nrow(res)
+        obs_cols = index(res)[[left_id_name, right_id_name]]
+        obs_id_left = _columns(res)[obs_cols[1]]
+        obs_id_right = _columns(res)[obs_cols[2]]
+    end
+    for j in eachindex(cols_left)
+        fl = identity
+        if mapformats[1]
+            fl = getformat(dsl, cols_left[j])
+        end
+        fr = identity
+        if mapformats[2]
+            fr = getformat(dsr, cols_right[j])
+        end
+        _res = allocatecol(Bool, total_length)
+        _compare_barrier_function!(_res, _columns(dsl)[cols_left[j]], _columns(dsr)[cols_right[j]], fl, fr, eq, obs_id_left, obs_id_right, threads)
+
+        push!(_columns(res), _res)
+        push!(index(res),  Symbol(left_names[cols_left[j]] * "=>" * right_names[cols_right[j]]))
+    end
+    if drop_obs_id
+        select!(res, Not([left_id_name, right_id_name]))
+    end
+    return res
+end
+
+
+# Fill `_res[i]` with the comparison of the pair of observations recorded for output row `i`.
+# `obs_id_left[i]` / `obs_id_right[i]` are the row positions inside `xl` / `xr`; when either is
+# `missing` the pair is incomplete and `_res[i]` is set to `missing`. Otherwise the result is
+# `eq_fun(fl(left value), fr(right value))`. The loop is driven by `@_threadsfor threads`.
+# Returns `_res`.
+function _compare_barrier_function!(_res, xl, xr, fl, fr, eq_fun, obs_id_left, obs_id_right, threads)
+    @_threadsfor threads for k in 1:length(_res)
+        lid = obs_id_left[k]
+        rid = obs_id_right[k]
+        _res[k] = (ismissing(lid) || ismissing(rid)) ? missing : eq_fun(fl(xl[lid]), fr(xr[rid]))
+    end
+    return _res
+end