Skip to content

Commit 1a34561

Browse files
committed
bug fix - byrow(nunique)
1 parent 0ace5f3 commit 1a34561

File tree

3 files changed

+36
-20
lines changed

3 files changed

+36
-20
lines changed

src/byrow/byrow.jl

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,14 @@ byrow(ds::AbstractDataset, ::typeof(var), col::ColumnIndex; by = identity, dof =
167167
# `byrow(ds, std, cols)` for a collection of columns: hands the selected columns
# and every keyword straight to `row_std`. When `cols` is omitted, the columns
# returned by `names(ds, Union{Missing, Number})` are used.
function byrow(
    ds::AbstractDataset,
    ::typeof(std),
    cols::MultiColumnIndex = names(ds, Union{Missing, Number});
    by = identity,
    dof = true,
    threads = nrow(ds) > Threads.nthreads()*10
)
    return row_std(ds, by, cols; dof = dof, threads = threads)
end
168168
# Single-column convenience method for `byrow(ds, std, col)`: wraps `col` in a
# one-element vector and re-dispatches to the multi-column method, forwarding
# `by`, `dof` and `threads` unchanged.
function byrow(
    ds::AbstractDataset,
    ::typeof(std),
    col::ColumnIndex;
    by = identity,
    dof = true,
    threads = nrow(ds) > Threads.nthreads()*10
)
    return byrow(ds, std, [col]; by = by, dof = dof, threads = threads)
end
169169

170-
# Superseded definition (marked as removed by the diff): forwarded `by`, `cols`
# and `count_missing` to `row_nunique`. The note above the commented-out copy of
# `row_nunique` states that it only compares hash values, so distinct values
# with equal hashes were counted once.
byrow(ds::AbstractDataset, ::typeof(nunique), cols::MultiColumnIndex = names(ds, Union{Missing, Number}); by = identity, count_missing = true) = row_nunique(ds, by, cols; count_missing = count_missing)
170+
# Count the distinct values found in each row of `ds` across `cols`.
#
# Keywords:
# - `by`: function applied to every value before values are compared.
# - `count_missing`: when `true`, `missing` counts as one distinct value; when
#   `false`, `missing` is left out of the count.
# - `threads`: forwarded to the generic `byrow(ds, f, cols)` method.
#
# Each row's transformed values are collected into a `Set`, so equality of the
# values themselves decides distinctness.
#
# When `count_missing` is `false`, `missing` is filtered out of the values
# *after* `by` has been applied. Testing the raw columns instead gives a wrong
# count whenever `by` maps a value to `missing` or maps `missing` to something
# else; filtering in the closure also avoids a second pass over the data.
function byrow(ds::AbstractDataset, ::typeof(nunique), cols::MultiColumnIndex = names(ds, Union{Missing, Number}); by = identity, count_missing = true, threads = nrow(ds) > 1000)
    if count_missing
        return byrow(ds, x -> length(Set(Base.Generator(by, x))), cols, threads = threads)
    else
        return byrow(ds, x -> length(Set(Iterators.filter(!ismissing, Base.Generator(by, x)))), cols, threads = threads)
    end
end
171178
# Single-column convenience method for `byrow(ds, nunique, col)`: wraps `col` in
# a one-element vector and re-dispatches to the multi-column method.
#
# `threads` is accepted and forwarded so that this method takes the same
# keywords as the multi-column method; its default mirrors that method's
# default (`nrow(ds) > 1000`).
function byrow(
    ds::AbstractDataset,
    ::typeof(nunique),
    col::ColumnIndex;
    by = identity,
    count_missing = true,
    threads = nrow(ds) > 1000
)
    return byrow(ds, nunique, [col]; by = by, count_missing = count_missing, threads = threads)
end
172179

173180

src/byrow/row_functions.jl

Lines changed: 20 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1077,25 +1077,26 @@ function _fill_dict_and_add!(init0, dict, prehashed, n, p)
10771077
end
10781078
end
10791079

1080-
# Superseded implementation (marked as removed by the diff) of the per-row
# distinct-value count.
#
# Arguments:
# - `ds`: the data set whose rows are examined.
# - `f`: function handed to `_fill_prehashed!` for every selected column.
# - `cols`: columns to include; defaults to `names(ds, Union{Missing, Number})`.
# - `count_missing`: when `false`, the result of `row_any(ds, ismissing, cols)`
#   is subtracted from the counts before returning.
#
# Returns a vector with one `Int32`-based entry per row of `ds`.
#
# NOTE(review): the comment above the commented-out copy of this function says
# only hash values are compared, so values with equal hashes (its example:
# 2.1 and 4611911198408756429) are counted as one.
function row_nunique(ds::AbstractDataset, f::Function, cols = names(ds, Union{Missing, Number}); count_missing = true)
1081-
colsidx = multiple_getindex(index(ds), cols)
1082-
# One `_Prehashed` slot per (row, selected column) pair.
prehashed = Matrix{_Prehashed}(undef, size(ds,1), length(colsidx))
1083-
allcols = view(_columns(ds),colsidx)
1084-
1085-
# Presumably fills column `j` of `prehashed` from the j-th selected column — confirm in `_fill_prehashed!`.
for j in 1:size(prehashed,2)
1086-
_fill_prehashed!(prehashed, allcols[j], f, size(ds,1), j)
1087-
end
1088-
1089-
# Per-row result buffer, updated in place by `_fill_dict_and_add!`.
init0 = zeros(Int32, size(ds,1))
1090-
dict = Dict{_Prehashed, Nothing}()
1091-
_fill_dict_and_add!(init0, dict, prehashed, size(ds,1), length(colsidx))
1092-
if count_missing
1093-
return init0
1094-
else
1095-
# `row_any(ds, ismissing, cols)` is evaluated on the raw columns, not on `f`'s output.
return init0 .- row_any(ds, ismissing, cols)
1096-
end
1097-
end
1098-
# Superseded convenience method (marked as removed by the diff): calls the
# three-argument `row_nunique` with `identity` as the transformation.
row_nunique(ds::AbstractDataset, cols = names(ds, Union{Missing, Number}); count_missing = true) = row_nunique(ds, identity, cols; count_missing = count_missing)
1080+
# This does not work because we only compare the hash values, and in many cases (e.g. 2.1 and 4611911198408756429) the hash is the same
1081+
# function row_nunique(ds::AbstractDataset, f::Function, cols = names(ds, Union{Missing, Number}); count_missing = true)
1082+
# colsidx = multiple_getindex(index(ds), cols)
1083+
# prehashed = Matrix{_Prehashed}(undef, size(ds,1), length(colsidx))
1084+
# allcols = view(_columns(ds),colsidx)
1085+
1086+
# for j in 1:size(prehashed,2)
1087+
# _fill_prehashed!(prehashed, allcols[j], f, size(ds,1), j)
1088+
# end
1089+
1090+
# init0 = zeros(Int32, size(ds,1))
1091+
# dict = Dict{_Prehashed, Nothing}()
1092+
# _fill_dict_and_add!(init0, dict, prehashed, size(ds,1), length(colsidx))
1093+
# if count_missing
1094+
# return init0
1095+
# else
1096+
# return init0 .- row_any(ds, ismissing, cols)
1097+
# end
1098+
# end
1099+
# row_nunique(ds::AbstractDataset, cols = names(ds, Union{Missing, Number}); count_missing = true) = row_nunique(ds, identity, cols; count_missing = count_missing)
10991100

11001101
Base.@propagate_inbounds function _op_for_hash!(x, y, f, lo, hi)
11011102
@simd for i in lo:hi

test/byrow.jl

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -414,4 +414,12 @@ end
414414
@test byrow(ds, fun123, (1,2,3)) == [1,-1.0,-9,2.5]
415415
# Two-argument row predicate used by the test below: `true` only when the first
# value equals 1 and the second is negative, `false` otherwise.
function fun123_2(x, y)
    if x == 1 && y < 0
        return true
    else
        return false
    end
end
416416
@test byrow(ds, fun123_2, (:x1, :x2)) == [false, false, true, false]
417+
end
418+
419+
# Regression test for row-wise `nunique`. The single row mixes 2.1 with
# 4611911198408756429, a pair that the note in row_functions.jl reports as
# having the same hash, so a count based on hashes alone would merge them.
@testset "byrow - nunique" begin
420+
ds = Dataset(x=2.1, y=4611911198408756429, z=missing, k=-2.1)
421+
# All four entries are distinct when `missing` is counted.
@test byrow(ds, nunique, :)[1] == 4
422+
# Leaving `missing` out removes one distinct value.
@test byrow(ds, nunique, :, count_missing = false)[1] == 3
423+
# With `by = abs`, 2.1 and -2.1 collapse into one value.
@test byrow(ds, nunique, :, by = abs)[1] == 3
424+
# Both effects combined.
@test byrow(ds, nunique, :, by = abs, count_missing=false)[1] == 2
417425
end

0 commit comments

Comments
 (0)