address #69

sl-solution · sl-solution · commit c6f4bb15da40 · 2022-08-10T20:06:08.000+12:00
diff --git a/src/byrow/byrow.jl b/src/byrow/byrow.jl
@@ -31,7 +31,7 @@ function expand_Base_Fix(f, f2)
 	end
 end
 
-function byrow(ds::AbstractDataset, ::typeof(any), cols::MultiColumnIndex = :; by = isequal(true), threads = nrow(ds) > Threads.nthreads()*10, mapformats = false)
+function byrow(ds::AbstractDataset, ::typeof(any), cols::MultiColumnIndex = :; missings = missing, by = isequal(true), threads = nrow(ds) > Threads.nthreads()*10, mapformats = false)
 	colsidx = multiple_getindex(index(ds), cols)
 	if by isa AbstractVector
 		if mapformats
@@ -42,13 +42,20 @@ function byrow(ds::AbstractDataset, ::typeof(any), cols::MultiColumnIndex = :; b
 			by = map(y->expand_Base_Fix(by, getformat(ds, y)), colsidx)
 		end
 	end
+	if !ismissing(missings)
+		if by isa AbstractVector
+			by = map(y -> x -> ismissing(x) ? missings : y(x), by)
+		else
+			by = first(map(y -> x -> ismissing(x) ? missings : y(x), [by]))
+		end
+	end
 	row_any(ds, by, colsidx, threads = threads)
 
 end
 
-byrow(ds::AbstractDataset, ::typeof(any), col::ColumnIndex; by = isequal(true), threads = nrow(ds) > Threads.nthreads()*10, mapformats = false) = byrow(ds, any, [col]; by = by, threads = threads, mapformats = mapformats)
+byrow(ds::AbstractDataset, ::typeof(any), col::ColumnIndex; missings = missing, by = isequal(true), threads = nrow(ds) > Threads.nthreads()*10, mapformats = false) = byrow(ds, any, [col]; missings = missings, by = by, threads = threads, mapformats = mapformats)
 
-function byrow(ds::AbstractDataset, ::typeof(all), cols::MultiColumnIndex = :; by = isequal(true), threads = nrow(ds) > Threads.nthreads()*10, mapformats = false)
+function byrow(ds::AbstractDataset, ::typeof(all), cols::MultiColumnIndex = :; missings = missing, by = isequal(true), threads = nrow(ds) > Threads.nthreads()*10, mapformats = false)
 	colsidx =  multiple_getindex(index(ds), cols)
 	if by isa AbstractVector
 		if mapformats
@@ -59,9 +66,16 @@ function byrow(ds::AbstractDataset, ::typeof(all), cols::MultiColumnIndex = :; b
 			by = map(y->expand_Base_Fix(by, getformat(ds, y)), colsidx)
 		end
 	end
+	if !ismissing(missings)
+		if by isa AbstractVector
+			by = map(y -> x -> ismissing(x) ? missings : y(x), by)
+		else
+			by = first(map(y -> x -> ismissing(x) ? missings : y(x), [by]))
+		end
+	end
 	row_all(ds, by, colsidx, threads = threads)
 end
-byrow(ds::AbstractDataset, ::typeof(all), col::ColumnIndex; by = isequal(true), threads = nrow(ds) > Threads.nthreads()*10, mapformats = false) = byrow(ds, all, [col]; by = by, threads = threads, mapformats = mapformats)
+byrow(ds::AbstractDataset, ::typeof(all), col::ColumnIndex; missings = missing, by = isequal(true), threads = nrow(ds) > Threads.nthreads()*10, mapformats = false) = byrow(ds, all, [col]; missings = missings, by = by, threads = threads, mapformats = mapformats)
 
 byrow(ds::AbstractDataset, ::typeof(isequal), cols::MultiColumnIndex; with = nothing, threads = nrow(ds) > Threads.nthreads()*10) = row_isequal(ds, cols, by = with, threads = threads)
 byrow(ds::AbstractDataset, ::typeof(isequal), cols::ColumnIndex; with = nothing, threads = nrow(ds) > Threads.nthreads()*10) = row_isequal(ds, cols, by = with, threads = threads)
diff --git a/src/dataset/other.jl b/src/dataset/other.jl
@@ -968,12 +968,14 @@ nmissing(x) = count(ismissing, x)
 n(x) = count(!ismissing, x)
 
 """
-    filter(ds::AbstractDataset, cols; [type = all,...])
+    filter(ds::AbstractDataset, cols; [missings = missing, type = all,...])
 
 A convenient shortcut for `ds[byrow(ds, type, cols; ...), :]`.
 
 `type` can be any function supported by `byrow` which returns a Vector{Bool} or BitVector.
 
+The `missings` keyword argument controls how missing values are treated in `filter`. Setting `missings = false` treats missing values as `false`, and setting `missings = true` treats them as `true`. With the default, `missings = missing`, missing values receive no special treatment.
+
 See [`byrow`](@ref), [`filter!`](@ref), [`delete!`](@ref), [`delete`](@ref)
 
 # Examples
@@ -1036,42 +1038,106 @@ julia> filter(ds, 2:3, type = isless, with = :x)
    1 │        3      -1.0      true
    2 │        4       0.0     false
    3 │        5       2.0      true
+
+julia> ds = Dataset(x = [1,2,missing,4,5], y = [missing,missing,-1,0,2.0], z = [true,missing,true,false,true])
+5×3 Dataset
+ Row │ x         y          z        
+     │ identity  identity   identity 
+     │ Int64?    Float64?   Bool?    
+─────┼───────────────────────────────
+   1 │        1  missing        true
+   2 │        2  missing     missing 
+   3 │  missing       -1.0      true
+   4 │        4        0.0     false
+   5 │        5        2.0      true
+
+julia> filter(ds, :z, missings = true) # treat missing values as true
+4×3 Dataset
+ Row │ x         y          z        
+     │ identity  identity   identity 
+     │ Int64?    Float64?   Bool?    
+─────┼───────────────────────────────
+   1 │        1  missing        true
+   2 │        2  missing     missing 
+   3 │  missing       -1.0      true
+   4 │        5        2.0      true
+
+julia> filter(ds, :z, missings = false) # treat missing values as false
+3×3 Dataset
+ Row │ x         y          z        
+     │ identity  identity   identity 
+     │ Int64?    Float64?   Bool?    
+─────┼───────────────────────────────
+   1 │        1  missing        true
+   2 │  missing       -1.0      true
+   3 │        5        2.0      true
+
 ```
 """
-function Base.filter(ds::AbstractDataset, cols::Union{NTuple{N, ColumnIndex}, Vector{T}, ColumnIndex, MultiColumnIndex}; view = false, type= all, kwargs...) where N where T <: Union{<:Integer, Symbol, AbstractString}
+function Base.filter(ds::AbstractDataset, cols::Union{NTuple{N, ColumnIndex}, Vector{T}, ColumnIndex, MultiColumnIndex}; missings::Union{Missing, Bool} = missing, view = false, type= all, kwargs...) where N where T <: Union{<:Integer, Symbol, AbstractString}
+    if type in (all, any)
+        idx = byrow(ds, type, cols; missings = missings, kwargs...)
+        idx_val = _findall(idx)
+    else
+        idx = byrow(ds, type, cols; kwargs...)
+        if !ismissing(missings)
+            replace!(idx, missing => missings)
+            idx_val = _findall(disallowmissing(idx))
+        else
+            idx_val = _findall(idx)
+        end
+    end
     if view
-        Base.view(ds, byrow(ds, type, cols; kwargs...), :)
+       Base.view(ds, idx_val, :)
     else
-        ds[byrow(ds, type, cols; kwargs...), :]
+        ds[idx_val, :]
     end
 end
 """
-    filter!(ds::AbstractDataset, cols; [type = all, ...])
+    filter!(ds::AbstractDataset, cols; [missings = missing, type = all, ...])
 
 Variant of `filter` which replaces the passed data set with the filtered one.
 
 It is a convenient shortcut for `deleteat![ds, .!byrow(ds, type, cols; ...)]`.
 
 `type` can be any function supported by `byrow` which returns a Vector{Bool} or BitVector.
 
+The `missings` keyword argument controls how missing values are treated in `filter!`. Setting `missings = false` treats missing values as `false`, and setting `missings = true` treats them as `true`. With the default, `missings = missing`, missing values receive no special treatment.
+
 Refer to [`filter`](@ref) for exmaples.
 
 See [`byrow`](@ref), [`filter`](@ref), [`delete!`](@ref), [`delete`](@ref)
 """
-_filter!(ds::Dataset, cols::Union{NTuple{N, ColumnIndex}, AbstractVector{T}, ColumnIndex, MultiColumnIndex}; type = all, kwargs...) where N where T <: Union{<:Integer, Symbol, AbstractString} = deleteat!(ds, .!byrow(ds, type, cols; kwargs...)) 
-Base.filter!(ds::Dataset, cols::AbstractVector; type = all, kwargs...) = _filter!(ds, cols; type = type, kwargs...)
-Base.filter!(ds::Dataset, cols::Union{ColumnIndex, MultiColumnIndex}; type = all, kwargs...) = _filter!(ds, cols; type = type, kwargs...)
-Base.filter!(ds::Dataset, cols::NTuple{N, ColumnIndex}; kwargs...) where N = _filter!(ds, cols; kwargs...)
+function _filter!(ds::Dataset, cols::Union{NTuple{N, ColumnIndex}, AbstractVector{T}, ColumnIndex, MultiColumnIndex}; missings = missing, type = all, kwargs...) where N where T <: Union{<:Integer, Symbol, AbstractString} 
+    if type in (all, any)
+        idx = byrow(ds, type, cols; missings = missings, kwargs...)
+        idx_val = _findall(.!idx)
+    else
+        idx = byrow(ds, type, cols; kwargs...)
+        if !ismissing(missings)
+            replace!(idx, missing => missings)
+            idx_val = _findall(.!disallowmissing(idx))
+        else
+            idx_val = _findall(.!idx)
+        end
+    end
+    deleteat!(ds, idx_val)
+end 
+Base.filter!(ds::Dataset, cols::AbstractVector; missings::Union{Missing, Bool} = missing, type = all, kwargs...) = _filter!(ds, cols; missings = missings, type = type, kwargs...)
+Base.filter!(ds::Dataset, cols::Union{ColumnIndex, MultiColumnIndex}; missings::Union{Missing, Bool} = missing, type = all, kwargs...) = _filter!(ds, cols; missings = missings, type = type, kwargs...)
+Base.filter!(ds::Dataset, cols::NTuple{N, ColumnIndex}; missings::Union{Missing, Bool} = missing, kwargs...) where N = _filter!(ds, cols; missings = missings, kwargs...)
 
 
 # filter out `true`s
 """
-    delete(ds::AbstractDataset, cols; [type = all,...])
+    delete(ds::AbstractDataset, cols; [missings = missing, type = all,...])
 
 A convenient shortcut for `ds[.!byrow(ds, type, cols; ...), :]`.
 
 `type` can be any function supported by `byrow` which returns a Vector{Bool} or BitVector.
 
+The `missings` keyword argument controls how missing values are treated in `delete`. Setting `missings = false` treats missing values as `false`, and setting `missings = true` treats them as `true`. With the default, `missings = missing`, missing values receive no special treatment.
+
 Compare to [`deleteat!`](@ref)
 
 See [`delete!`](@ref), [`byrow`](@ref), [`filter!`](@ref), [`filter`](@ref)
@@ -1139,31 +1205,89 @@ julia> delete(ds, 2:3, type = isless, with = :x)
 ─────┼──────────────────────────────
    1 │        1       1.5      true
    2 │        2       2.3     false
+
+julia> ds = Dataset(x = [1,2,missing,4,5], y = [missing,missing,-1,0,2.0], z = [true,missing,true,false,true])
+5×3 Dataset
+ Row │ x         y          z        
+     │ identity  identity   identity 
+     │ Int64?    Float64?   Bool?    
+─────┼───────────────────────────────
+   1 │        1  missing        true
+   2 │        2  missing     missing 
+   3 │  missing       -1.0      true
+   4 │        4        0.0     false
+   5 │        5        2.0      true
+
+julia> delete(ds, :z, missings = true) # treat missing values as true
+1×3 Dataset
+ Row │ x         y         z        
+     │ identity  identity  identity 
+     │ Int64?    Float64?  Bool?    
+─────┼──────────────────────────────
+   1 │        4       0.0     false
+
+julia> delete(ds, :z, missings = false) # treat missing values as false
+2×3 Dataset
+ Row │ x         y          z        
+     │ identity  identity   identity 
+     │ Int64?    Float64?   Bool?    
+─────┼───────────────────────────────
+   1 │        2  missing     missing 
+   2 │        4        0.0     false
+
 ```
 """
-function delete(ds::AbstractDataset, cols::Union{NTuple{N, ColumnIndex}, ColumnIndex, MultiColumnIndex}; view = false, type= all, kwargs...) where N
+function delete(ds::AbstractDataset, cols::Union{NTuple{N, ColumnIndex}, ColumnIndex, MultiColumnIndex}; missings::Union{Missing, Bool} = missing, view = false, type= all, kwargs...) where N
+    if type in (all, any)
+        idx = byrow(ds, type, cols; missings = missings, kwargs...)
+        idx_val = _findall(.!idx)
+    else
+        idx = byrow(ds, type, cols; kwargs...)
+        if !ismissing(missings)
+            replace!(idx, missing => missings)
+            idx_val = _findall(.!disallowmissing(idx))
+        else
+            idx_val = _findall(.!idx)
+        end
+    end
     if view
-        Base.view(ds, .!byrow(ds, type, cols; kwargs...), :)
+        Base.view(ds, idx_val, :)
     else
-        ds[.!byrow(ds, type, cols; kwargs...), :]
+        ds[idx_val, :]
     end
 end
 """
-    delete!(ds::AbstractDataset, cols; [type = all, ...])
+    delete!(ds::AbstractDataset, cols; [missings = missing, type = all, ...])
 
 Variant of `delete` which replaces the passed data set with the filtered one.
 
 It is a convenient shortcut for `deleteat![ds, byrow(ds, type, cols; ...)]`.
 
 `type` can be any function supported by `byrow` which returns a Vector{Bool} or BitVector.
 
+The `missings` keyword argument controls how missing values are treated in `delete!`. Setting `missings = false` treats missing values as `false`, and setting `missings = true` treats them as `true`. With the default, `missings = missing`, missing values receive no special treatment.
+
 Compare to [`deleteat!`](@ref)
 
 Refer to [`delete`](@ref) for exmaples.
 
 See [`delete`](@ref), [`byrow`](@ref), [`filter`](@ref), [`filter!`](@ref)
 """
-Base.delete!(ds::Dataset, cols::Union{NTuple{N, ColumnIndex}, ColumnIndex, MultiColumnIndex}; type = all, kwargs...) where N = deleteat!(ds, byrow(ds, type, cols; kwargs...))
+function Base.delete!(ds::Dataset, cols::Union{NTuple{N, ColumnIndex}, ColumnIndex, MultiColumnIndex}; missings::Union{Missing, Bool} = missing, type = all, kwargs...) where N
+    if type in (all, any)
+        idx = byrow(ds, type, cols; missings = missings, kwargs...)
+        idx_val = _findall(idx)
+    else
+        idx = byrow(ds, type, cols; kwargs...)
+        if !ismissing(missings)
+            replace!(idx, missing => missings)
+            idx_val = _findall(disallowmissing(idx))
+        else
+            idx_val = _findall(idx)
+        end
+    end
+    deleteat!(ds, idx_val)
+end
 
 """
     mapcols(ds::AbstractDataset, f, cols)