supporting byrow with tuple of column indices

sl-solution · sl-solution · commit 038587b11cae · 2022-07-07T18:04:18.000+12:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,12 @@
 
 ## New features
 
+* `byrow` now accepts a `Tuple` of column indices: `byrow(ds, fun, cols)` calls `fun.(ds[:, cols[1]], ds[:, cols[2]], ...)` when `cols` is an `NTuple` of column indices.
+
+# Version 0.7.6
+
+## New features
+
 * Two new functions: `delete` and `delete!`. They should be compared to `filter` and `filter!`, respectively - [issue #63](https://github.com/sl-solution/InMemoryDatasets.jl/issues/63)
 * Add `DLMReader` to `sysimage` in `IMD.create_sysimage`.
 
diff --git a/docs/src/man/byrow.md b/docs/src/man/byrow.md
@@ -237,7 +237,9 @@ One special function that can be used as `fun` in the `byrow` function is `mapre
 
 ## User defined operations
 
-For user defined functions which return a single value, `byrow` treats each row as a vector of values, thus the user defined function must accept a vector and returns a single value. For instance to calculate `1 * col1 + 2 * col2 + 3 * col3` for each row in `ds` we can define the following function:
+For user defined functions which return a single value, `byrow` treats each row as a vector of values, thus the user defined function must accept a vector and return a single value.
+However, when the user defines a multivariate function and passes a Tuple of column indices as the `cols` argument of `byrow`, the `byrow` function simply calls `fun.(ds[:, cols[1]], ds[:, cols[2]], ...)`.
+For instance, to calculate `1 * col1 + 2 * col2 + 3 * col3` for each row in `ds` with a function that accepts a vector, we can define the following function:
 
 ```jldoctest
 julia> avg(x) = 1 * x[1] + 2 * x[2] + 3 * x[3]
@@ -258,6 +260,31 @@ julia> byrow(ds, avg, 1:3)
 
 Note that `avg` is missing if any of the values in `x` is missing.
 
+Below is an example of using `byrow` with a user defined multivariate function:
+
+```jldoctest
+julia> ds = Dataset(x1 = [1,2,1,2], x2 = [1,-2,-3,10], x3 = 1:4)
+4×3 Dataset
+ Row │ x1        x2        x3       
+     │ identity  identity  identity 
+     │ Int64?    Int64?    Int64?   
+─────┼──────────────────────────────
+   1 │        1         1         1
+   2 │        2        -2         2
+   3 │        1        -3         3
+   4 │        2        10         4
+
+julia> fun(x,y,z)::Float64 = x == 1 ? y*z : y/z
+fun (generic function with 1 method)
+
+julia> byrow(ds, fun, (:x1, :x2, :x3))
+4-element Vector{Float64}:
+  1.0
+ -1.0
+ -9.0
+  2.5
+```
+
 ## Special operations
 
 `byrow` also supports a few optimised operations which return a vector of values for each row. The `fun` argument for these operations is one of the followings:
diff --git a/src/byrow/byrow.jl b/src/byrow/byrow.jl
@@ -200,6 +200,13 @@ function byrow(ds::AbstractDataset, f::Function, cols::MultiColumnIndex; threads
 	length(colsidx) == 1 && return byrow(ds, f, colsidx[1]; threads = threads)
 	threads ?  hp_row_generic(ds, f, cols) : row_generic(ds, f, cols)
 end
+
+# TODO do we need to make sure that the result is Union of Missing?
+function byrow(ds::AbstractDataset, f::Function, cols::NTuple{N, ColumnIndex}) where N
+	positions = [index(ds)[c] for c in cols] # resolve every entry of `cols` through the dataset index, in the order given
+	return f.(view(_columns(ds), positions)...) # broadcast `f` with one argument per selected column
+end
+
 function byrow(ds::AbstractDataset, f::Function, col::ColumnIndex; threads = nrow(ds)>1000, allowmissing::Bool = true)
 	if threads
 		T = Core.Compiler.return_type(f, Tuple{nonmissingtype(eltype(ds[!, col]))})
diff --git a/src/byrow/doc.jl b/src/byrow/doc.jl
@@ -1238,8 +1238,9 @@ Return the result of calling `fun` on each row of `ds` selected by `cols`. The `
 
 When user passes a type as `fun` and a single column as `cols`,  `byrow` convert the corresponding column to the type specified by `fun`.
 
-For generic functions there are two special cases:
+For generic functions there are the following special cases:
 
 * When `cols` is a single column, `byrow(ds, fun, cols)` acts like `fun.(ds[:, cols])`
 * When `cols` is referring to exactly two columns and it is possible to pass two vectors as arguments of `fun`, `byrow` returns `fun.(ds[:, col1], ds[:, col2])` when possible.
+* When `cols` is a `Tuple` of column indices, `byrow(ds, fun, cols)` returns `fun.(ds[:, cols[1]], ds[:, cols[2]], ...)`, i.e. `fun` is a multivariate function which will be applied on each row of `cols`.
 """
diff --git a/src/dataset/combine.jl b/src/dataset/combine.jl
@@ -27,6 +27,73 @@ function normalize_combine!(outidx::Index, idx,
     return ntuple(i -> _names(idx)[idx[src[i]]], N) => fun => Symbol(dst)
 end
 
+# this is added to support byrow for multivariate functions
+# (col1, col2) => byrow(fun) => dst, the job is to create (col1, col2) => byrow(fun) => :dst
+function normalize_combine!(outidx::Index, idx,
+    @nospecialize(sel::Pair{<:NTuple{N, ColumnIndex},
+                            <:Pair{<:Vector{Expr},
+                                <:Union{Symbol, AbstractString}}})
+                                ) where N
+    byrow_expr = sel.second.first[1] # only the first expression of the vector is used
+    # any expression whose head is not :BYROW is rejected
+    byrow_expr.head == :BYROW || throw(ArgumentError("only byrow is accepted when using expressions"))
+    dst = Symbol(sel.second.second)
+    _check_ind_and_add!(outidx, dst)
+    return ntuple(i -> outidx[sel.first[i]], length(sel.first)) => byrow_expr => dst
+end
+function normalize_combine!(outidx::Index, idx, # handles (col1, col2, ...) => byrow(fun) => dst where byrow(fun) is a single Expr
+    @nospecialize(sel::Pair{<:NTuple{N, ColumnIndex},
+                            <:Pair{<:Expr,
+                                <:Union{Symbol, AbstractString}}})
+                                ) where N
+    src = sel.first # tuple of input columns for the multivariate function
+    if sel.second.first.head == :BYROW # only expressions tagged :BYROW are accepted
+        _check_ind_and_add!(outidx, Symbol(sel.second.second)) # destination name is passed to `_check_ind_and_add!` as a Symbol
+        return ntuple(i->outidx[src[i]], length(src)) => sel.second.first => Symbol(sel.second.second) # forward the Expr itself: an Expr cannot be indexed with [1]
+    end
+    throw(ArgumentError("only byrow is accepted when using expressions"))
+end
+function normalize_combine!(outidx::Index, idx,
+    @nospecialize(sel::Pair{<:NTuple{N, ColumnIndex},
+                            <:Vector{Expr}})
+                                ) where N
+    # no destination name is given, so one is built from the function name and the first two columns
+    N >= 2 || throw(ArgumentError("For multivariate functions (Tuple of column names), the number of input columns must be greater than 1"))
+    tuplecols = sel.first
+    name1 = _names(outidx)[outidx[tuplecols[1]]]
+    name2 = _names(outidx)[outidx[tuplecols[2]]]
+    byrow_expr = sel.second[1] # only the first expression of the vector is used
+    if byrow_expr.head != :BYROW
+        throw(ArgumentError("only byrow is accepted when using expressions"))
+    end
+    # "_etc" is appended when more than two columns are passed
+    tail = N > 2 ? "_etc" : ""
+    newname = Symbol(funname(byrow_expr.args[1]), "_", name1, "_", name2, tail)
+    _check_ind_and_add!(outidx, newname)
+    return ntuple(i -> outidx[tuplecols[i]], length(tuplecols)) => byrow_expr => newname
+end
+function normalize_combine!(outidx::Index, idx,
+    @nospecialize(sel::Pair{<:NTuple{N, ColumnIndex},
+                            <:Expr})
+                                ) where N
+    # no destination name is given, so one is built from the function name and the first two columns
+    N >= 2 || throw(ArgumentError("For multivariate functions (Tuple of column names), the number of input columns must be greater than 1"))
+    tuplecols = sel.first
+    byrow_expr = sel.second
+    pos1, pos2 = outidx[tuplecols[1]], outidx[tuplecols[2]]
+    name1, name2 = _names(outidx)[pos1], _names(outidx)[pos2]
+    if byrow_expr.head != :BYROW
+        throw(ArgumentError("only byrow is accepted when using expressions"))
+    end
+    # "_etc" is appended when more than two columns are passed
+    parts = (funname(byrow_expr.args[1]), "_", name1, "_", name2)
+    newname = N > 2 ? Symbol(parts..., "_etc") : Symbol(parts...)
+    _check_ind_and_add!(outidx, newname)
+    return ntuple(i -> outidx[tuplecols[i]], length(tuplecols)) => byrow_expr => newname
+end
+
+
+
 # col => fun, the job is to create col => fun => :colname
 function normalize_combine!(outidx::Index, idx,
                             @nospecialize(sel::Pair{<:ColumnIndex,
@@ -244,8 +311,12 @@ function _is_byrow_valid(idx, ms)
     end
     for i in 1:length(ms)
         if (ms[i].second.first isa Expr) && ms[i].second.first.head == :BYROW
-
-            byrow_vars = idx[ms[i].first]
+            # if the input vars are supposed to be used in a multivariate function
+            if ms[i].first isa Tuple
+                byrow_vars = [idx[ms[i].first[j]] for j in 1:length(ms[i].first)]
+            else
+                byrow_vars = idx[ms[i].first]
+            end
             !all(byrow_vars .∈ Ref(righthands)) && return false
         end
         if haskey(idx, ms[i].second.second)
@@ -258,7 +329,7 @@ end
 function _check_mutliple_rows_for_each_group(ds, ms)
     for i in 1:length(ms)
         # byrow are not checked since they are not going to modify the number of rows
-        if ms[i].first isa Tuple
+        if ms[i].first isa Tuple && !(ms[i].second.first isa Expr)
             T = return_type(ms[i].second.first, ntuple(j-> ds[!, ms[i].first[j]].val, length(ms[i].first)))
             if T <: AbstractVector && T !== Union{}
                 return i
@@ -670,7 +741,7 @@ function combine(ds::Dataset, @nospecialize(args...); dropgroupcols = false, thr
                 _combine_f_barrier_special(special_res, ds[!, ms[i].first].val, newds, ms[i].first, ms[i].second.first, ms[i].second.second, newds_lookup, _first_vector_res,ngroups, new_lengths, total_lengths, threads)
             end
         else
-            if ms[i].first isa Tuple
+            if ms[i].first isa Tuple && !(ms[i].second.first isa Expr)
                 _combine_f_barrier_tuple(ntuple(j->_columns(ds)[index(ds)[ms[i].first[j]]], length(ms[i].first)), newds, ms[i].first, ms[i].second.first, ms[i].second.second, newds_lookup, starts, ngroups, new_lengths, total_lengths, threads)
             else
                 _combine_f_barrier(haskey(index(ds).lookup, ms[i].first) ? _columns(ds)[index(ds)[ms[i].first]] : _columns(ds)[1], newds, ms[i].first, ms[i].second.first, ms[i].second.second, newds_lookup, starts, ngroups, new_lengths, total_lengths, threads)
@@ -753,7 +824,7 @@ function combine_ds(ds::AbstractDataset, @nospecialize(args...); threads = true)
                 _combine_f_barrier_special(special_res, ds[!, ms[i].first].val, newds, ms[i].first, ms[i].second.first, ms[i].second.second, newds_lookup, _first_vector_res,ngroups, new_lengths, total_lengths, threads)
             end
         else
-            if ms[i].first isa Tuple
+            if ms[i].first isa Tuple && !(ms[i].second.first isa Expr)
                 _combine_f_barrier_tuple(ntuple(j->_columns(ds)[index(ds)[ms[i].first[j]]], length(ms[i].first)), newds, ms[i].first, ms[i].second.first, ms[i].second.second, newds_lookup, starts, ngroups, new_lengths, total_lengths, threads)
             else
                 _combine_f_barrier(haskey(index(ds), ms[i].first) ? _columns(ds)[index(ds)[ms[i].first]] : _columns(ds)[1], newds, ms[i].first, ms[i].second.first, ms[i].second.second, newds_lookup, starts, ngroups, new_lengths, total_lengths, threads)
diff --git a/src/dataset/modify.jl b/src/dataset/modify.jl
@@ -58,6 +58,71 @@ function normalize_modify!(outidx::Index, idx,
     return ntuple(i->outidx[src[i]], N) => fun => Symbol(dst)
 end
 
+# this is added to support byrow for multivariate functions
+# (col1, col2) => byrow(fun) => dst, the job is to create (col1, col2) => byrow(fun) => :dst
+function normalize_modify!(outidx::Index, idx,
+    @nospecialize(sel::Pair{<:NTuple{N, ColumnIndex},
+                            <:Pair{<:Vector{Expr},
+                                <:Union{Symbol, AbstractString}}})
+                                ) where N
+    byrow_expr = sel.second.first[1] # only the first expression of the vector is used
+    # any expression whose head is not :BYROW is rejected
+    byrow_expr.head == :BYROW || throw(ArgumentError("only byrow is accepted when using expressions"))
+    dst = Symbol(sel.second.second)
+    _check_ind_and_add!(outidx, dst)
+    return ntuple(i -> outidx[sel.first[i]], length(sel.first)) => byrow_expr => dst
+end
+function normalize_modify!(outidx::Index, idx, # handles (col1, col2, ...) => byrow(fun) => dst where byrow(fun) is a single Expr
+    @nospecialize(sel::Pair{<:NTuple{N, ColumnIndex},
+                            <:Pair{<:Expr,
+                                <:Union{Symbol, AbstractString}}})
+                                ) where N
+    src = sel.first # tuple of input columns for the multivariate function
+    if sel.second.first.head == :BYROW # only expressions tagged :BYROW are accepted
+        _check_ind_and_add!(outidx, Symbol(sel.second.second)) # destination name is passed to `_check_ind_and_add!` as a Symbol
+        return ntuple(i->outidx[src[i]], length(src)) => sel.second.first => Symbol(sel.second.second) # forward the Expr itself: an Expr cannot be indexed with [1]
+    end
+    throw(ArgumentError("only byrow is accepted when using expressions"))
+end
+function normalize_modify!(outidx::Index, idx,
+    @nospecialize(sel::Pair{<:NTuple{N, ColumnIndex},
+                            <:Vector{Expr}})
+                                ) where N
+    # no destination name is given, so one is built from the function name and the first two columns
+    N >= 2 || throw(ArgumentError("For multivariate functions (Tuple of column names), the number of input columns must be greater than 1"))
+    tuplecols = sel.first
+    name1 = _names(outidx)[outidx[tuplecols[1]]]
+    name2 = _names(outidx)[outidx[tuplecols[2]]]
+    byrow_expr = sel.second[1] # only the first expression of the vector is used
+    if byrow_expr.head != :BYROW
+        throw(ArgumentError("only byrow is accepted when using expressions"))
+    end
+    # "_etc" is appended when more than two columns are passed
+    tail = N > 2 ? "_etc" : ""
+    newname = Symbol(funname(byrow_expr.args[1]), "_", name1, "_", name2, tail)
+    _check_ind_and_add!(outidx, newname)
+    return ntuple(i -> outidx[tuplecols[i]], length(tuplecols)) => byrow_expr => newname
+end
+function normalize_modify!(outidx::Index, idx,
+    @nospecialize(sel::Pair{<:NTuple{N, ColumnIndex},
+                            <:Expr})
+                                ) where N
+    # no destination name is given, so one is built from the function name and the first two columns
+    N >= 2 || throw(ArgumentError("For multivariate functions (Tuple of column names), the number of input columns must be greater than 1"))
+    tuplecols = sel.first
+    byrow_expr = sel.second
+    pos1, pos2 = outidx[tuplecols[1]], outidx[tuplecols[2]]
+    name1, name2 = _names(outidx)[pos1], _names(outidx)[pos2]
+    if byrow_expr.head != :BYROW
+        throw(ArgumentError("only byrow is accepted when using expressions"))
+    end
+    # "_etc" is appended when more than two columns are passed
+    parts = (funname(byrow_expr.args[1]), "_", name1, "_", name2)
+    newname = N > 2 ? Symbol(parts..., "_etc") : Symbol(parts...)
+    _check_ind_and_add!(outidx, newname)
+    return ntuple(i -> outidx[tuplecols[i]], length(tuplecols)) => byrow_expr => newname
+end
+
 # col => fun, the job is to create col => fun => :colname
 function normalize_modify!(outidx::Index, idx,
                             @nospecialize(sel::Pair{<:ColumnIndex,
diff --git a/src/sort/groupby.jl b/src/sort/groupby.jl
@@ -177,7 +177,6 @@ function combine(gds::Union{GroupBy, GatherBy}, @nospecialize(args...); dropgrou
 	# if this is not the case, throw ArgumentError and ask user to use modify instead
 	newlookup, new_nm = _create_index_for_newds(gds.parent, ms, gds.groupcols)
 	!(_is_byrow_valid(Index(newlookup, new_nm, Dict{Int, Function}()), ms)) && throw(ArgumentError("`byrow` must be used for aggregated columns, use `modify` otherwise"))
-
 	if _fast_gatherby_reduction(gds, ms)
 		return _combine_fast_gatherby_reduction(gds, ms, newlookup, new_nm; dropgroupcols = dropgroupcols, threads = threads)
 	end
@@ -263,13 +262,13 @@ function combine(gds::Union{GroupBy, GatherBy}, @nospecialize(args...); dropgrou
 		end
 
 		if i == _first_vector_res
-			if ms[i].first isa Tuple
+			if ms[i].first isa Tuple && !(ms[i].second.first isa Expr)
 				_combine_f_barrier_special_tuple(special_res, ntuple(j-> view(_columns(gds.parent)[index(gds.parent)[ms[i].first[j]]], a[1]), length(ms[i].first)), newds, ms[i].first, ms[i].second.first, ms[i].second.second, newds_lookup, _first_vector_res,ngroups, new_lengths, total_lengths, threads)
 			else
 				_combine_f_barrier_special(special_res, view(_columns(gds.parent)[index(gds.parent)[ms[i].first]], a[1]), newds, ms[i].first, ms[i].second.first, ms[i].second.second, newds_lookup, _first_vector_res,ngroups, new_lengths, total_lengths, threads)
 			end
 		else
-			if ms[i].first isa Tuple
+			if ms[i].first isa Tuple && !(ms[i].second.first isa Expr)
 				_combine_f_barrier_tuple(ntuple(j-> _threaded_permute_for_groupby(_columns(gds.parent)[index(gds.parent)[ms[i].first[j]]], a[1], threads = threads), length(ms[i].first)), newds, ms[i].first, ms[i].second.first, ms[i].second.second, newds_lookup, starts, ngroups, new_lengths, total_lengths, threads)
 			else
 				_combine_f_barrier(!(ms[i].second.first isa Expr) && haskey(index(gds.parent), ms[i].first) ? curr_x : view(_columns(gds.parent)[1], a[1]), newds, ms[i].first, ms[i].second.first, ms[i].second.second, newds_lookup, starts, ngroups, new_lengths, total_lengths, threads)
diff --git a/test/byrow.jl b/test/byrow.jl
@@ -407,3 +407,11 @@ end
     @test all(byrow(ds2, issorted, :))
     @test Matrix(ds2) == sort(Matrix(ds), dims = 2)
 end
+
+@testset "byrow with NTuple as cols" begin
+    ds = Dataset(x1 = [1, 2, 1, 2], x2 = [1, -2, -3, 10], x3 = 1:4)
+    prod_or_ratio(a, b, c) = a == 1 ? b * c : b / c # three arguments, one per selected column
+    @test byrow(ds, prod_or_ratio, (1, 2, 3)) == [1, -1.0, -9, 2.5]
+    one_and_negative(a, b) = a == 1 && b < 0 # two arguments, columns selected by name
+    @test byrow(ds, one_and_negative, (:x1, :x2)) == [false, false, true, false]
+end
diff --git a/test/grouping.jl b/test/grouping.jl
@@ -612,5 +612,21 @@ end
 
 end
 
-
-
+@testset "byrow with tuple input" begin
+    ds = Dataset(x1 = [1,2,1,2], x2 = [1,-2,-3,10], x3 = 1:4)
+    res = modify(ds, (1,2) => byrow((x,y)-> x==1 && y<0 ? true : false)) # tuple of columns => byrow(fun), no destination name
+    @test res ==  Dataset(x1 = [1,2,1,2], x2 = [1,-2,-3,10], x3 = 1:4, function_x1_x2=[false, false, true, false]) # the new column is expected under the name function_x1_x2
+    res = modify(view(ds, [1,2,3,4], [1,2,3]), (1,2) => byrow((x,y)-> x==1 && y<0 ? true : false)) # same call on a view of ds
+    @test res ==  Dataset(x1 = [1,2,1,2], x2 = [1,-2,-3,10], x3 = 1:4, function_x1_x2=[false, false, true, false])
+    res = modify(groupby(ds, 3), (1,2) => byrow((x,y)-> x==1 && y<0 ? true : false)) # same call on groupby(ds, 3)
+    @test res ==  Dataset(x1 = [1,2,1,2], x2 = [1,-2,-3,10], x3 = 1:4, function_x1_x2=[false, false, true, false])
+    res = modify(gatherby(ds, 3), (1,2) => byrow((x,y)-> x==1 && y<0 ? true : false)) # same call on gatherby(ds, 3)
+    @test res ==  Dataset(x1 = [1,2,1,2], x2 = [1,-2,-3,10], x3 = 1:4, function_x1_x2=[false, false, true, false])
+    res = modify(gatherby(ds, 3), (1,2) => byrow((x,y)-> x==1 && y<0 ? true : false)=>:newvar) # explicit destination name :newvar
+    @test res ==  Dataset(x1 = [1,2,1,2], x2 = [1,-2,-3,10], x3 = 1:4, newvar=[false, false, true, false])
+
+    res = combine(groupby(ds, 1), 2 => IMD.minimum, :x3 => IMD.minimum, (:minimum_x2, :minimum_x3) => byrow((x,y)->x/y)) # byrow takes the columns minimum_x2 and minimum_x3 produced earlier in the same call
+    @test res == Dataset(x1 = [1,2], minimum_x2 = [-3,-2], minimum_x3 = [1,2], function_minimum_x2_minimum_x3 = [-3.0, -1.0]) # no destination given: expected name is function_minimum_x2_minimum_x3
+    res = combine(groupby(ds, 1), 2 => IMD.minimum, :x3 => IMD.minimum, (:minimum_x2, :minimum_x3) => byrow((x,y)->x/y) => :newvar) # same, with explicit destination name :newvar
+    @test res == Dataset(x1 = [1,2], minimum_x2 = [-3,-2], minimum_x3 = [1,2], newvar = [-3.0, -1.0]) # the result column uses the given name
+end