Skip to content

Commit b45e829

Browse files
committed
update docstring
1 parent 1d4d306 commit b45e829

File tree

2 files changed

+179
-0
lines changed

2 files changed

+179
-0
lines changed

src/dataset/combine.jl

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -522,6 +522,73 @@ function _combine_f_barrier_tuple(fromds, newds, msfirst, mssecond, mslast, newd
522522

523523
end
524524

525+
"""
526+
combine(ds::AbstractDataset, args...; dropgroupcols = false, threads = true)
527+
528+
Create a new data set by applying the `args` aggregations to the passed columns. The `args` argument must be in the form of `cols=>fun=>newname`, where `cols` refers to columns in the passed data set. `fun` assumes a single column as its input; thus, multiple columns will be broadcasted, i.e. `cols=>fun` will be translated as `col1=>fun`, `col2=>fun`, ..., and `col=>funs` will be translated as `col=>fun1`, `col=>fun2`, .... The `byrow` function can be passed as `fun`; however, its input must refer to columns on which an operation has already been done.
529+
530+
To use a multivariate function, the columns must be passed as a tuple of column names or column indices.
531+
532+
For a grouped data set, the operations are done on each group of observations.
533+
534+
# Examples
535+
536+
```jldoctest
537+
julia> ds = Dataset(g = [1,2,1,2,1,2], x = 1:6)
538+
6×2 Dataset
539+
Row │ g x
540+
│ identity identity
541+
│ Int64? Int64?
542+
─────┼────────────────────
543+
1 │ 1 1
544+
2 │ 2 2
545+
3 │ 1 3
546+
4 │ 2 4
547+
5 │ 1 5
548+
6 │ 2 6
549+
550+
julia> combine(groupby(ds, :g), :x=>[sum, mean])
551+
2×3 Dataset
552+
Row │ g sum_x mean_x
553+
│ identity identity identity
554+
│ Int64? Int64? Float64?
555+
─────┼──────────────────────────────
556+
1 │ 1 9 3.0
557+
2 │ 2 12 4.0
558+
559+
julia> combine(gatherby(ds, :g), :x => [maximum, minimum], 2:3 => byrow(-) => :range)
560+
2×4 Dataset
561+
Row │ g maximum_x minimum_x range
562+
│ identity identity identity identity
563+
│ Int64? Int64? Int64? Int64?
564+
─────┼──────────────────────────────────────────
565+
1 │ 1 5 1 4
566+
2 │ 2 6 2 4
567+
568+
julia> ds = Dataset(g = [1,2,1,2,1,2], x = 1:6, y = 6:-1:1)
569+
6×3 Dataset
570+
Row │ g x y
571+
│ identity identity identity
572+
│ Int64? Int64? Int64?
573+
─────┼──────────────────────────────
574+
1 │ 1 1 6
575+
2 │ 2 2 5
576+
3 │ 1 3 4
577+
4 │ 2 4 3
578+
5 │ 1 5 2
579+
6 │ 2 6 1
580+
581+
julia> combine(groupby(ds,1), (:x, :y)=>(x1,x2)->maximum(x1)-minimum(x2))
582+
2×2 Dataset
583+
Row │ g function_x_y
584+
│ identity identity
585+
│ Int64? Int64?
586+
─────┼────────────────────────
587+
1 │ 1 3
588+
2 │ 2 5
589+
590+
```
591+
"""
525592
function combine(ds::Dataset, @nospecialize(args...); dropgroupcols = false, threads = true)
526593
!isgrouped(ds) && return combine_ds(ds, args...)#throw(ArgumentError("`combine` is only for grouped data sets, use `modify` instead"))
527594
idx_cpy::Index = Index(Dict{Symbol, Int}(), Symbol[], Dict{Int, Function}())

src/dataset/modify.jl

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -281,9 +281,121 @@ function normalize_modify_multiple!(outidx::Index, idx, @nospecialize(args...))
281281
res
282282
end
283283

284+
"""
285+
modify(...)
286+
287+
A variant of `modify!` which modifies a copy of the passed data set.
288+
289+
See [`modify!`](@ref)
290+
"""
284291
function modify(origninal_ds::AbstractDataset, @nospecialize(args...); threads::Bool = true)
    # Operate on a copy so that the passed data set itself is not modified.
    ds_copy = copy(origninal_ds)
    return modify!(ds_copy, args...; threads = threads)
end
285292

286293
function modify!(ds::Dataset; threads::Bool = true)
    # No modification arguments were passed; `threads` is accepted but not used here.
    return parent(ds)
end
294+
295+
"""
296+
modify!(ds::AbstractDataset, args...; [threads = true])
297+
298+
Modify columns of a data set. The `args` arguments must be in the form of `cols => fun => newnames`. The `fun` function will be called on the passed `cols`, with the exception of two special functions: `byrow` and `splitter`. `fun` assumes a single column as its input; thus, passing multiple columns will be broadcasted, i.e. `cols => fun` will be translated to `col1=>fun`, `col2=>fun`, .... When `newnames` is not provided, `modify!` modifies the passed column.
299+
300+
When a grouped data set is passed to `modify!`, the operation is done on each group of observations.
301+
302+
Each of the `args` can be constructed based on columns in the original data set or on columns which have been created by an earlier argument.
303+
304+
# Special functions
305+
306+
`byrow` and `splitter` are two special functions which can be passed as `fun`.
307+
308+
`byrow` can accept multiple columns as input and does a given operation on each row of the data set. When a single column is passed to `byrow`, `modify!` modifies the passed column, however, when multiple columns are passed, `byrow` applies the row-wise operation on them and creates a new column.
309+
310+
`splitter` splits a column of tuples into multiple columns. When `splitter` is set as `fun`, the `newnames` must be given.
311+
312+
# Using multivariate functions
313+
314+
To pass multiple columns to a `fun` function which operates on multiple inputs, the columns must be passed as a tuple of column names or column indices.
315+
316+
See [`modify`](@ref)
317+
318+
# Examples
319+
320+
```jldoctest
321+
julia> ds = Dataset(x1 = 1:5, x2 = [-2, -1, missing, 1, 2],
322+
x3 = [0.0, 0.1, 0.2, missing, 0.4])
323+
5×3 Dataset
324+
Row │ x1 x2 x3
325+
│ identity identity identity
326+
│ Int64? Int64? Float64?
327+
─────┼──────────────────────────────
328+
1 │ 1 -2 0.0
329+
2 │ 2 -1 0.1
330+
3 │ 3 missing 0.2
331+
4 │ 4 1 missing
332+
5 │ 5 2 0.4
333+
334+
julia> modify!(ds, 2:3 => sum)
335+
5×3 Dataset
336+
Row │ x1 x2 x3
337+
│ identity identity identity
338+
│ Int64? Int64? Float64?
339+
─────┼──────────────────────────────
340+
1 │ 1 0 0.7
341+
2 │ 2 0 0.7
342+
3 │ 3 0 0.7
343+
4 │ 4 0 0.7
344+
5 │ 5 0 0.7
345+
346+
julia> modify!(ds, :x1 => x -> x .- mean(x))
347+
5×3 Dataset
348+
Row │ x1 x2 x3
349+
│ identity identity identity
350+
│ Float64? Int64? Float64?
351+
─────┼──────────────────────────────
352+
1 │ -2.0 0 0.7
353+
2 │ -1.0 0 0.7
354+
3 │ 0.0 0 0.7
355+
4 │ 1.0 0 0.7
356+
5 │ 2.0 0 0.7
357+
358+
julia> body = Dataset(weight = [78.5, 59, 80], height = [160, 171, 183])
359+
3×2 Dataset
360+
Row │ weight height
361+
│ identity identity
362+
│ Float64? Int64?
363+
─────┼────────────────────
364+
1 │ 78.5 160
365+
2 │ 59.0 171
366+
3 │ 80.0 183
367+
368+
julia> modify!(body, :height => byrow(x -> (x/100)^2) => :BMI, [1, 3] => byrow(/) => :BMI)
369+
3×3 Dataset
370+
Row │ weight height BMI
371+
│ identity identity identity
372+
│ Float64? Int64? Float64?
373+
─────┼──────────────────────────────
374+
1 │ 78.5 160 30.6641
375+
2 │ 59.0 171 20.1771
376+
3 │ 80.0 183 23.8884
377+
378+
julia> body = Dataset(weight = [78.5, 59, 80], height = [160, 171, 183])
379+
3×2 Dataset
380+
Row │ weight height
381+
│ identity identity
382+
│ Float64? Int64?
383+
─────┼────────────────────
384+
1 │ 78.5 160
385+
2 │ 59.0 171
386+
3 │ 80.0 183
387+
388+
julia> modify!(body, (:weight, :height)=> cor)
389+
3×3 Dataset
390+
Row │ weight height cor_weight_height
391+
│ identity identity identity
392+
│ Float64? Int64? Float64?
393+
─────┼───────────────────────────────────────
394+
1 │ 78.5 160 0.0890411
395+
2 │ 59.0 171 0.0890411
396+
3 │ 80.0 183 0.0890411
397+
```
398+
"""
287399
function modify!(ds::AbstractDataset, @nospecialize(args...); threads::Bool = true)
288400
if ds isa SubDataset
289401
idx_cpy = copy(index(parent(ds)))

0 commit comments

Comments
 (0)