Skip to content

Commit c6f4bb1

Browse files
committed
address #69
1 parent 43f7da7 commit c6f4bb1

File tree

2 files changed

+157
-19
lines changed

2 files changed

+157
-19
lines changed

src/byrow/byrow.jl

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ function expand_Base_Fix(f, f2)
3131
end
3232
end
3333

34-
function byrow(ds::AbstractDataset, ::typeof(any), cols::MultiColumnIndex = :; by = isequal(true), threads = nrow(ds) > Threads.nthreads()*10, mapformats = false)
34+
function byrow(ds::AbstractDataset, ::typeof(any), cols::MultiColumnIndex = :; missings = missing, by = isequal(true), threads = nrow(ds) > Threads.nthreads()*10, mapformats = false)
3535
colsidx = multiple_getindex(index(ds), cols)
3636
if by isa AbstractVector
3737
if mapformats
@@ -42,13 +42,20 @@ function byrow(ds::AbstractDataset, ::typeof(any), cols::MultiColumnIndex = :; b
4242
by = map(y->expand_Base_Fix(by, getformat(ds, y)), colsidx)
4343
end
4444
end
45+
if !ismissing(missings)
46+
if by isa AbstractVector
47+
by = map(y -> x -> ismissing(x) ? missings : y(x), by)
48+
else
49+
by = first(map(y -> x -> ismissing(x) ? missings : y(x), [by]))
50+
end
51+
end
4552
row_any(ds, by, colsidx, threads = threads)
4653

4754
end
4855

49-
byrow(ds::AbstractDataset, ::typeof(any), col::ColumnIndex; by = isequal(true), threads = nrow(ds) > Threads.nthreads()*10, mapformats = false) = byrow(ds, any, [col]; by = by, threads = threads, mapformats = mapformats)
56+
# Single-column convenience method for `byrow(ds, any, ...)`: wraps `col` in a
# one-element vector and forwards every keyword to the multi-column method.
function byrow(ds::AbstractDataset, ::typeof(any), col::ColumnIndex; missings = missing, by = isequal(true), threads = nrow(ds) > Threads.nthreads()*10, mapformats = false)
    return byrow(ds, any, [col]; missings = missings, by = by, threads = threads, mapformats = mapformats)
end
5057

51-
function byrow(ds::AbstractDataset, ::typeof(all), cols::MultiColumnIndex = :; by = isequal(true), threads = nrow(ds) > Threads.nthreads()*10, mapformats = false)
58+
function byrow(ds::AbstractDataset, ::typeof(all), cols::MultiColumnIndex = :; missings = missing, by = isequal(true), threads = nrow(ds) > Threads.nthreads()*10, mapformats = false)
5259
colsidx = multiple_getindex(index(ds), cols)
5360
if by isa AbstractVector
5461
if mapformats
@@ -59,9 +66,16 @@ function byrow(ds::AbstractDataset, ::typeof(all), cols::MultiColumnIndex = :; b
5966
by = map(y->expand_Base_Fix(by, getformat(ds, y)), colsidx)
6067
end
6168
end
69+
if !ismissing(missings)
70+
if by isa AbstractVector
71+
by = map(y -> x -> ismissing(x) ? missings : y(x), by)
72+
else
73+
by = first(map(y -> x -> ismissing(x) ? missings : y(x), [by]))
74+
end
75+
end
6276
row_all(ds, by, colsidx, threads = threads)
6377
end
64-
byrow(ds::AbstractDataset, ::typeof(all), col::ColumnIndex; by = isequal(true), threads = nrow(ds) > Threads.nthreads()*10, mapformats = false) = byrow(ds, all, [col]; by = by, threads = threads, mapformats = mapformats)
78+
# Single-column convenience method for `byrow(ds, all, ...)`: wraps `col` in a
# one-element vector and forwards every keyword to the multi-column method.
function byrow(ds::AbstractDataset, ::typeof(all), col::ColumnIndex; missings = missing, by = isequal(true), threads = nrow(ds) > Threads.nthreads()*10, mapformats = false)
    return byrow(ds, all, [col]; missings = missings, by = by, threads = threads, mapformats = mapformats)
end
6579

6680
# `byrow(ds, isequal, cols)` for several columns: delegates to `row_isequal`,
# handing the `with` keyword over as `row_isequal`'s `by` keyword.
function byrow(ds::AbstractDataset, ::typeof(isequal), cols::MultiColumnIndex; with = nothing, threads = nrow(ds) > Threads.nthreads()*10)
    return row_isequal(ds, cols; by = with, threads = threads)
end
6781
# `byrow(ds, isequal, cols)` for a single column index: delegates to
# `row_isequal`, handing the `with` keyword over as `row_isequal`'s `by` keyword.
function byrow(ds::AbstractDataset, ::typeof(isequal), cols::ColumnIndex; with = nothing, threads = nrow(ds) > Threads.nthreads()*10)
    return row_isequal(ds, cols; by = with, threads = threads)
end

src/dataset/other.jl

Lines changed: 139 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -968,12 +968,14 @@ nmissing(x) = count(ismissing, x)
968968
n(x) = count(!ismissing, x)
969969

970970
"""
971-
filter(ds::AbstractDataset, cols; [type = all,...])
971+
filter(ds::AbstractDataset, cols; [missings = missing, type = all,...])
972972
973973
A convenient shortcut for `ds[byrow(ds, type, cols; ...), :]`.
974974
975975
`type` can be any function supported by `byrow` which returns a Vector{Bool} or BitVector.
976976
977+
The `missings` keyword argument controls how missing values are treated in `filter`: `missings = false` treats missing values as `false`, and `missings = true` treats them as `true`. With the default, `missings = missing`, missing values are not replaced.
978+
977979
See [`byrow`](@ref), [`filter!`](@ref), [`delete!`](@ref), [`delete`](@ref)
978980
979981
# Examples
@@ -1036,42 +1038,106 @@ julia> filter(ds, 2:3, type = isless, with = :x)
10361038
1 │ 3 -1.0 true
10371039
2 │ 4 0.0 false
10381040
3 │ 5 2.0 true
1041+
1042+
julia> ds = Dataset(x = [1,2,missing,4,5], y = [missing,missing,-1,0,2.0], z = [true,missing,true,false,true])
1043+
5×3 Dataset
1044+
Row │ x y z
1045+
│ identity identity identity
1046+
│ Int64? Float64? Bool?
1047+
─────┼───────────────────────────────
1048+
1 │ 1 missing true
1049+
2 │ 2 missing missing
1050+
3 │ missing -1.0 true
1051+
4 │ 4 0.0 false
1052+
5 │ 5 2.0 true
1053+
1054+
julia> filter(ds, :z, missings = true) # treat missing values as true
1055+
4×3 Dataset
1056+
Row │ x y z
1057+
│ identity identity identity
1058+
│ Int64? Float64? Bool?
1059+
─────┼───────────────────────────────
1060+
1 │ 1 missing true
1061+
2 │ 2 missing missing
1062+
3 │ missing -1.0 true
1063+
4 │ 5 2.0 true
1064+
1065+
julia> filter(ds, :z, missings = false) # treat missing values as false
1066+
3×3 Dataset
1067+
Row │ x y z
1068+
│ identity identity identity
1069+
│ Int64? Float64? Bool?
1070+
─────┼───────────────────────────────
1071+
1 │ 1 missing true
1072+
2 │ missing -1.0 true
1073+
3 │ 5 2.0 true
1074+
10391075
```
10401076
"""
1041-
function Base.filter(ds::AbstractDataset, cols::Union{NTuple{N, ColumnIndex}, Vector{T}, ColumnIndex, MultiColumnIndex}; view = false, type= all, kwargs...) where N where T <: Union{<:Integer, Symbol, AbstractString}
1077+
function Base.filter(ds::AbstractDataset, cols::Union{NTuple{N, ColumnIndex}, Vector{T}, ColumnIndex, MultiColumnIndex}; missings::Union{Missing, Bool} = missing, view = false, type= all, kwargs...) where N where T <: Union{<:Integer, Symbol, AbstractString}
    # Build a per-row mask with `byrow`, turn it into row positions with
    # `_findall`, then return either a view or a copy of the selected rows.
    if type in (all, any)
        # For `all`/`any` the `missings` keyword is handed to `byrow` itself.
        mask = byrow(ds, type, cols; missings = missings, kwargs...)
    else
        # Other `type`s are called without `missings`; when a replacement value
        # was requested, substitute it into the result before locating rows.
        mask = byrow(ds, type, cols; kwargs...)
        if !ismissing(missings)
            replace!(mask, missing => missings)
            mask = disallowmissing(mask)
        end
    end
    rows = _findall(mask)
    # The keyword `view` shadows the function, hence the qualified `Base.view`.
    return view ? Base.view(ds, rows, :) : ds[rows, :]
end
10481096
"""
1049-
filter!(ds::AbstractDataset, cols; [type = all, ...])
1097+
filter!(ds::AbstractDataset, cols; [missings = missing, type = all, ...])
10501098
10511099
Variant of `filter` which replaces the passed data set with the filtered one.
10521100
10531101
It is a convenient shortcut for `deleteat!(ds, .!byrow(ds, type, cols; ...))`.
10541102
10551103
`type` can be any function supported by `byrow` which returns a Vector{Bool} or BitVector.
10561104
1105+
The `missings` keyword argument controls how missing values are treated in `filter!`: `missings = false` treats missing values as `false`, and `missings = true` treats them as `true`. With the default, `missings = missing`, missing values are not replaced.
1106+
10571107
Refer to [`filter`](@ref) for examples.
10581108
10591109
See [`byrow`](@ref), [`filter`](@ref), [`delete!`](@ref), [`delete`](@ref)
10601110
"""
1061-
_filter!(ds::Dataset, cols::Union{NTuple{N, ColumnIndex}, AbstractVector{T}, ColumnIndex, MultiColumnIndex}; type = all, kwargs...) where N where T <: Union{<:Integer, Symbol, AbstractString} = deleteat!(ds, .!byrow(ds, type, cols; kwargs...))
1062-
Base.filter!(ds::Dataset, cols::AbstractVector; type = all, kwargs...) = _filter!(ds, cols; type = type, kwargs...)
1063-
Base.filter!(ds::Dataset, cols::Union{ColumnIndex, MultiColumnIndex}; type = all, kwargs...) = _filter!(ds, cols; type = type, kwargs...)
1064-
Base.filter!(ds::Dataset, cols::NTuple{N, ColumnIndex}; kwargs...) where N = _filter!(ds, cols; kwargs...)
1111+
function _filter!(ds::Dataset, cols::Union{NTuple{N, ColumnIndex}, AbstractVector{T}, ColumnIndex, MultiColumnIndex}; missings = missing, type = all, kwargs...) where N where T <: Union{<:Integer, Symbol, AbstractString}
    # Shared implementation behind the `Base.filter!` methods: compute the
    # per-row "keep" mask with `byrow`, then delete every row whose mask entry
    # is not `true`.
    if type in (all, any)
        # For `all`/`any` the `missings` keyword is handed to `byrow` itself.
        keep = byrow(ds, type, cols; missings = missings, kwargs...)
    else
        # Other `type`s are called without `missings`; when a replacement value
        # was requested, substitute it into the result before negating.
        keep = byrow(ds, type, cols; kwargs...)
        if !ismissing(missings)
            replace!(keep, missing => missings)
            keep = disallowmissing(keep)
        end
    end
    doomed = _findall(.!keep)
    return deleteat!(ds, doomed)
end
1126+
# `filter!` entry point for a vector of column selectors; forwards to `_filter!`.
function Base.filter!(ds::Dataset, cols::AbstractVector; missings::Union{Missing, Bool} = missing, type = all, kwargs...)
    return _filter!(ds, cols; missings = missings, type = type, kwargs...)
end
1127+
# `filter!` entry point for a single column index or a multi-column selector;
# forwards to `_filter!`.
function Base.filter!(ds::Dataset, cols::Union{ColumnIndex, MultiColumnIndex}; missings::Union{Missing, Bool} = missing, type = all, kwargs...)
    return _filter!(ds, cols; missings = missings, type = type, kwargs...)
end
1128+
# `filter!` entry point for a tuple of column indices; `type` (if given) travels
# inside `kwargs` and is picked up by `_filter!`.
function Base.filter!(ds::Dataset, cols::NTuple{N, ColumnIndex}; missings::Union{Missing, Bool} = missing, kwargs...) where N
    return _filter!(ds, cols; missings = missings, kwargs...)
end
10651129

10661130

10671131
# filter out `true`s
10681132
"""
1069-
delete(ds::AbstractDataset, cols; [type = all,...])
1133+
delete(ds::AbstractDataset, cols; [missings = missing, type = all,...])
10701134
10711135
A convenient shortcut for `ds[.!byrow(ds, type, cols; ...), :]`.
10721136
10731137
`type` can be any function supported by `byrow` which returns a Vector{Bool} or BitVector.
10741138
1139+
The `missings` keyword argument controls how missing values are treated in `delete`: `missings = false` treats missing values as `false`, and `missings = true` treats them as `true`. With the default, `missings = missing`, missing values are not replaced.
1140+
10751141
Compare to [`deleteat!`](@ref)
10761142
10771143
See [`delete!`](@ref), [`byrow`](@ref), [`filter!`](@ref), [`filter`](@ref)
@@ -1139,31 +1205,89 @@ julia> delete(ds, 2:3, type = isless, with = :x)
11391205
─────┼──────────────────────────────
11401206
1 │ 1 1.5 true
11411207
2 │ 2 2.3 false
1208+
1209+
julia> ds = Dataset(x = [1,2,missing,4,5], y = [missing,missing,-1,0,2.0], z = [true,missing,true,false,true])
1210+
5×3 Dataset
1211+
Row │ x y z
1212+
│ identity identity identity
1213+
│ Int64? Float64? Bool?
1214+
─────┼───────────────────────────────
1215+
1 │ 1 missing true
1216+
2 │ 2 missing missing
1217+
3 │ missing -1.0 true
1218+
4 │ 4 0.0 false
1219+
5 │ 5 2.0 true
1220+
1221+
julia> delete(ds, :z, missings = true) # treat missing values as true
1222+
1×3 Dataset
1223+
Row │ x y z
1224+
│ identity identity identity
1225+
│ Int64? Float64? Bool?
1226+
─────┼──────────────────────────────
1227+
1 │ 4 0.0 false
1228+
1229+
julia> delete(ds, :z, missings = false) # treat missing values as false
1230+
2×3 Dataset
1231+
Row │ x y z
1232+
│ identity identity identity
1233+
│ Int64? Float64? Bool?
1234+
─────┼───────────────────────────────
1235+
1 │ 2 missing missing
1236+
2 │ 4 0.0 false
1237+
11421238
```
11431239
"""
1144-
function delete(ds::AbstractDataset, cols::Union{NTuple{N, ColumnIndex}, ColumnIndex, MultiColumnIndex}; view = false, type= all, kwargs...) where N
1240+
function delete(ds::AbstractDataset, cols::Union{NTuple{N, ColumnIndex}, ColumnIndex, MultiColumnIndex}; missings::Union{Missing, Bool} = missing, view = false, type= all, kwargs...) where N
    # Build a per-row "drop" mask with `byrow`, negate it, turn the result into
    # row positions with `_findall`, then return a view or a copy of those rows.
    if type in (all, any)
        # For `all`/`any` the `missings` keyword is handed to `byrow` itself.
        drop = byrow(ds, type, cols; missings = missings, kwargs...)
    else
        # Other `type`s are called without `missings`; when a replacement value
        # was requested, substitute it into the result before negating.
        drop = byrow(ds, type, cols; kwargs...)
        if !ismissing(missings)
            replace!(drop, missing => missings)
            drop = disallowmissing(drop)
        end
    end
    kept_rows = _findall(.!drop)
    # The keyword `view` shadows the function, hence the qualified `Base.view`.
    return view ? Base.view(ds, kept_rows, :) : ds[kept_rows, :]
end
11511259
"""
1152-
delete!(ds::AbstractDataset, cols; [type = all, ...])
1260+
delete!(ds::AbstractDataset, cols; [missings = missing, type = all, ...])
11531261
11541262
Variant of `delete` which replaces the passed data set with the filtered one.
11551263
11561264
It is a convenient shortcut for `deleteat!(ds, byrow(ds, type, cols; ...))`.
11571265
11581266
`type` can be any function supported by `byrow` which returns a Vector{Bool} or BitVector.
11591267
1268+
The `missings` keyword argument controls how missing values are treated in `delete!`: `missings = false` treats missing values as `false`, and `missings = true` treats them as `true`. With the default, `missings = missing`, missing values are not replaced.
1269+
11601270
Compare to [`deleteat!`](@ref)
11611271
11621272
Refer to [`delete`](@ref) for examples.
11631273
11641274
See [`delete`](@ref), [`byrow`](@ref), [`filter`](@ref), [`filter!`](@ref)
11651275
"""
1166-
Base.delete!(ds::Dataset, cols::Union{NTuple{N, ColumnIndex}, ColumnIndex, MultiColumnIndex}; type = all, kwargs...) where N = deleteat!(ds, byrow(ds, type, cols; kwargs...))
1276+
function Base.delete!(ds::Dataset, cols::Union{NTuple{N, ColumnIndex}, ColumnIndex, MultiColumnIndex}; missings::Union{Missing, Bool} = missing, type = all, kwargs...) where N
    # Compute the per-row "drop" mask with `byrow`, convert it to row positions
    # with `_findall`, and remove those rows from `ds` via `deleteat!`.
    if type in (all, any)
        # For `all`/`any` the `missings` keyword is handed to `byrow` itself.
        drop = byrow(ds, type, cols; missings = missings, kwargs...)
    else
        # Other `type`s are called without `missings`; when a replacement value
        # was requested, substitute it into the result before locating rows.
        drop = byrow(ds, type, cols; kwargs...)
        if !ismissing(missings)
            replace!(drop, missing => missings)
            drop = disallowmissing(drop)
        end
    end
    doomed = _findall(drop)
    return deleteat!(ds, doomed)
end
11671291

11681292
"""
11691293
mapcols(ds::AbstractDataset, f, cols)

0 commit comments

Comments
 (0)