Skip to content

Commit f986554

Browse files
committed
handling corner case - when a column is of type Missing
1 parent 917981e commit f986554

File tree

6 files changed

+16
-7
lines changed

6 files changed

+16
-7
lines changed

src/dataset/combine.jl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -314,14 +314,14 @@ function _check_mutliple_rows_for_each_group(ds, ms)
314314
# byrow are not checked since they are not going to modify the number of rows
315315
if ms[i].first isa Tuple
316316
T = return_type(ms[i].second.first, ntuple(j-> ds[!, ms[i].first[j]].val, length(ms[i].first)))
317-
if T <: AbstractVector
317+
if T <: AbstractVector && T !== Union{}
318318
return i
319319
end
320320
elseif !(ms[i].second.first isa Expr) &&
321321
haskey(index(ds), ms[i].first) #&&
322322
#!(ms[i].first ∈ map(x->x.second.second, view(ms, 1:(i-1)))) #TODO monitor this for any unseen problem
323323
T = return_type(ms[i].second.first, ds[!, ms[i].first].val)
324-
if T <: AbstractVector
324+
if T <: AbstractVector && T !== Union{}
325325
return i
326326
end
327327
end
@@ -409,7 +409,7 @@ function _check_the_output_type(x, mssecond)
409409
# * AbstractVector{T} where T
410410
# * Vector{T}
411411
# * not a Vector
412-
CT == Union{} && throw(ArgumentError("compiler cannot assess the return type of calling `$(mssecond)` on input, you may want to try using `byrow`"))
412+
CT == Union{} && throw(ArgumentError("compiler cannot assess the return type of calling `$(mssecond)` on input, you may want to try using `byrow`."))
413413
if CT <: AbstractVector
414414
if hasproperty(CT, :var)
415415
T = Union{Missing, CT.var.ub}

src/dataset/transpose.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# a helper function that checks if there is enough memory for the output data frame
22
# If the type is not a Number, something is probably wrong with how the variables were set, so it is better to be conservative. The 10^7 threshold here is arbitrary.
3-
_check_allocation_limit(T, rows, cols) = T <: Number ? sizeof(T)*rows*cols / Base.Sys.total_memory() : rows*cols/10^7
3+
# Estimate how demanding the output allocation is. For a concrete numeric `T` the result is
# `sizeof(T)*rows*cols` as a fraction of total system memory; otherwise it is `rows*cols/10^7`.
# `isconcretetype` keeps abstract numeric types (e.g. `Real`), for which `sizeof` throws,
# on the conservative branch together with `Union{}`.
_check_allocation_limit(T, rows, cols) = T !== Union{} && T <: Number && isconcretetype(T) ? sizeof(T)*rows*cols / Base.Sys.total_memory() : rows*cols/10^7
44

55
# Default column-name builder: the prefix "_c" followed by the printed form of `x`; `y` is ignored.
_default_renamecolid_function_withoutid(x, y) = string("_c", x)
66
# Default column-name builder: the string form of `values(x)`; `y` is ignored.
_default_renamecolid_function_withid(x, y) = string(values(x))

src/other/utils.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ const FLOATS = Union{Float16, Float32, Float64}
33

44
# modified return_type to suit for our purpose
55
function return_type(f::Function, x)
6+
eltype(x) == Missing && return Missing
67
CT = nonmissingtype(eltype(x))
78
if CT <: AbstractVector
89
return return_type_tuple(f, x)

src/sort/groupby.jl

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,6 @@ function combine(gds::Union{GroupBy, GatherBy}, @nospecialize(args...); dropgrou
174174
# the transformation returning multiple rows must not be based on the previous columns in combine
175175
# result (which seems reasonable ??)
176176
_first_vector_res = _check_mutliple_rows_for_each_group(gds.parent, ms)
177-
178177
_is_groupingcols_modifed(gds, ms) && throw(ArgumentError("`combine` cannot modify the grouping or sorting columns, use a different name for the computed column"))
179178

180179
groupcols = gds.groupcols

test/grouping.jl

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -474,11 +474,13 @@ end
474474
@test combine(groupby(sds2, 1), :y=>sum) == Dataset(x=[1,2,3, missing], sum_y=[missing, -7.0, 11.2,missing])
475475
@test combine(gatherby(sds2, 1, isgathered = true), :y=>sum) == Dataset(x = [1,3,2,missing,3,2], sum_y=[missing, 2.2, -1,missing, 9,-6])
476476

477-
ds = Dataset(x = [1,2,1,2,3], y1 = Union{Int8, Missing}[1,2,missing,4,missing], y2 = Union{Int32, Missing}[1,2,3,4,missing], y3=Union{Int16, Missing}[100,20,3000,4,missing], y4=Float16.(rand(5)), y5=rand(BigFloat, 5))
478-
sds = view(ds, [1,2,3,4,5], [1,2,3,4,5,6])
477+
ds = Dataset(x = [1,2,1,2,3], y1 = Union{Int8, Missing}[1,2,missing,4,missing], y2 = Union{Int32, Missing}[1,2,3,4,missing], y3=Union{Int16, Missing}[100,20,3000,4,missing], y4=Float16.(rand(5)), y5=rand(BigFloat, 5), y6=[missing, missing, missing, missing, missing])
478+
sds = view(ds, [1,2,3,4,5], [1,2,3,4,5,6,7])
479479

480480
@test combine(gatherby(sds, 1), 2:4 .=>Ref([sum, mean, maximum, minimum, IMD.n, IMD.nmissing])) == Dataset([Union{Missing, Int64}[1, 2, 3], Union{Missing, Int64}[1, 6, missing], Union{Missing, Float64}[1.0, 3.0, missing], Union{Missing, Int8}[1, 4, missing], Union{Missing, Int8}[1, 2, missing], Union{Missing, Int64}[1, 2, 0], Union{Missing, Int64}[1, 0, 1], Union{Missing, Int64}[4, 6, missing], Union{Missing, Float64}[2.0, 3.0, missing], Union{Missing, Int32}[3, 4, missing], Union{Missing, Int32}[1, 2, missing], Union{Missing, Int64}[2, 2, 0], Union{Missing, Int64}[0, 0, 1], Union{Missing, Int64}[3100, 24, missing], Union{Missing, Float64}[1550.0, 12.0, missing], Union{Missing, Int16}[3000, 20, missing], Union{Missing, Int16}[100, 4, missing], Union{Missing, Int64}[2, 2, 0], Union{Missing, Int64}[0, 0, 1]], ["x", "sum_y1", "mean_y1", "maximum_y1", "minimum_y1", "n_y1", "nmissing_y1", "sum_y2", "mean_y2", "maximum_y2", "minimum_y2", "n_y2", "nmissing_y2", "sum_y3", "mean_y3", "maximum_y3", "minimum_y3", "n_y3", "nmissing_y3"])
481481

482+
@test combine(gatherby(sds,1), :y6=>sum=>:mm) == combine(gatherby(sds,1), :y6=>minimum=>:mm) == combine(gatherby(sds,1), :y6=>var=>:mm) == combine(gatherby(sds,1), :y6=>(x->sum(x))=>:mm) == combine(gatherby(sds,1), :y6=>(x->var(x))=>:mm) == combine(gatherby(sds,1), :y6=>(x->minimum(x))=>:mm) == Dataset(x = [1,2,3] , mm = [missing, missing, missing])
483+
482484
var1(x) = var(x)
483485
std1(x) = std(x)
484486
median1(x) = median(x)

test/transpose.jl

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -323,6 +323,13 @@ const ≅ = isequal
323323
dst = transpose(ds, :x, variable_name = nothing)
324324
dst_t = compare(ds, dst, on =[:g1=>:g1, :g2=>:g2, :x=>:_c1], mapformats = true)
325325
@test all(byrow(dst_t, all, :))
326+
327+
ds = Dataset(x = [1,2,1,2,3], y = [missing, missing, missing, missing, missing])
328+
@test transpose(groupby(ds,1), :y) == Dataset(x = [1,2,3], _variables_=["y","y","y"], _c1=[missing, missing, missing], _c2=[missing, missing, missing])
329+
@test transpose(groupby(ds,1), :y, default = 0) == Dataset(x = [1,2,3], _variables_=["y","y","y"], _c1=[missing, missing, missing], _c2=[missing, missing, 0])
330+
@test transpose(gatherby(ds,1), :y, default = 0) == Dataset(x = [1,2,3], _variables_=["y","y","y"], _c1=[missing, missing, missing], _c2=[missing, missing, 0])
331+
332+
326333
end
327334

328335

0 commit comments

Comments
 (0)