improve repeat/repeat! performance and allow passing the `view` keyword to repeat

sl-solution · sl-solution · commit 57503bc339ab · 2022-01-04T20:01:38.000+13:00
diff --git a/src/dataset/other.jl b/src/dataset/other.jl
@@ -104,69 +104,79 @@ disallowmissing!(ds::Dataset, cols::Colon=:; error::Bool=false) =
     disallowmissing!(ds, axes(ds, 2), error=error)
 
 """
-    repeat!(ds::Dataset; inner::Integer = 1, outer::Integer = 1)
+    repeat!(ds::Dataset; inner::Integer = 1, outer::Integer = 1, freq = nothing)
+    repeat!(ds::Dataset, count) = repeat!(ds, outer = count)
+
+Update a data set `ds` in-place by repeating its rows.
+
+* `inner` specifies how many times each row is repeated,
+* `outer` specifies how many times the full set of rows is repeated, and
+* `freq` allows the user to pass a vector of integers, or a column name or index, which indicates how many times the corresponding row should be repeated.
+When `freq` is passed, its values must be non-negative integers with no missing values (zero means the corresponding row is dropped).
 
-Update a data set `ds` in-place by repeating its rows. `inner` specifies how many
-times each row is repeated, and `outer` specifies how many times the full set
-of rows is repeated. Columns of `ds` are freshly allocated.
 
 # Example
 ```jldoctest
 julia> ds = Dataset(a = 1:2, b = 3:4)
 2×2 Dataset
- Row │ a      b
-     │ Int64  Int64
-─────┼──────────────
-   1 │     1      3
-   2 │     2      4
+ Row │ a         b
+     │ identity  identity
+     │ Int64?    Int64?
+─────┼────────────────────
+   1 │        1         3
+   2 │        2         4
 
 julia> repeat!(ds, inner = 2, outer = 3);
 
 julia> ds
 12×2 Dataset
- Row │ a      b
-     │ Int64  Int64
-─────┼──────────────
-   1 │     1      3
-   2 │     1      3
-   3 │     2      4
-   4 │     2      4
-   5 │     1      3
-   6 │     1      3
-   7 │     2      4
-   8 │     2      4
-   9 │     1      3
-  10 │     1      3
-  11 │     2      4
-  12 │     2      4
+ Row │ a         b
+     │ identity  identity
+     │ Int64?    Int64?
+─────┼────────────────────
+   1 │        1         3
+   2 │        1         3
+   3 │        2         4
+   4 │        2         4
+   5 │        1         3
+   6 │        1         3
+   7 │        2         4
+   8 │        2         4
+   9 │        1         3
+  10 │        1         3
+  11 │        2         4
+  12 │        2         4
+
+julia> ds = Dataset(a = 1:2, b = 3:4)
+2×2 Dataset
+ Row │ a         b
+     │ identity  identity
+     │ Int64?    Int64?
+─────┼────────────────────
+   1 │        1         3
+   2 │        2         4
+
+julia> repeat!(ds, freq = :a)
+3×2 Dataset
+ Row │ a         b
+     │ identity  identity
+     │ Int64?    Int64?
+─────┼────────────────────
+   1 │        1         3
+   2 │        2         4
+   3 │        2         4
 ```
 """
 function repeat!(ds::Dataset; inner::Integer = 1, outer::Integer = 1, freq::Union{AbstractVector, DatasetColumn, SubDatasetColumn, ColumnIndex, Nothing} = nothing)
-
+    T = nrow(ds) < typemax(Int8) ? Int8 : nrow(ds) < typemax(Int32) ? Int32 : Int64
 # Modify Dataset
     if freq === nothing
         inner <= 0 && throw(ArgumentError("inner keyword argument must be greater than zero"))
         outer <= 0 && throw(ArgumentError("outer keyword argument must be greater than zero"))
-        if outer == 1
-            for j in 1:ncol(ds)
-                _columns(ds)[j] = repeat(_columns(ds)[j], inner = Int(inner), outer = 1)
-            end
-            _reset_grouping_info!(ds)
-            # ngroups = index(ds).ngroups[]
-            # diffs = diff(index(ds).starts[1:ngroups]) .* inner
-            # @show diffs
-            # cumsum!(diffs, diffs)
-            # @show diffs
-            # for j in 2:ngroups
-            #     index(ds).starts[j] = diffs[j-1]
-            # end
-            # @show index(ds).starts
-        elseif outer > 1
-            for j in 1:ncol(ds)
-                _columns(ds)[j] = repeat(_columns(ds)[j], inner = Int(inner), outer = Int(outer))
-            end
-            _reset_grouping_info!(ds)
-        end
+        r_indx = repeat(T(1):T(nrow(ds)), inner = inner, outer = outer)
+        _permute_ds_after_sort!(ds, r_indx, check = false)
+        _reset_grouping_info!(ds)
+
         _modified(_attributes(ds))
         ds
     else
@@ -177,24 +187,14 @@ function repeat!(ds::Dataset; inner::Integer = 1, outer::Integer = 1, freq::Unio
         elseif freq isa AbstractVector
             lengths = freq
         end
-        if !(eltype(lengths) <: Union{Missing, Integer}) || any(ismissing, lengths) || any(x->isless(x, 1), lengths)
-            throw(ArgumentError("The column selected for repeating must be an Intger column with all values greater than zero and no missing value"))
+        if !(eltype(lengths) <: Union{Missing, Integer}) || any(ismissing, lengths) || any(x->isless(x, 0), lengths)
+            throw(ArgumentError("The column selected for repeating must be an integer column with all values greater than or equal to zero and with no missing values"))
         end
         if length(lengths) != nrow(ds)
-            throw(ArgumentError("The length of repeating weights must be the same as the number of row of the passed data set"))
-        end
-        lengths = copy(lengths)
-        total_new = sum(lengths)
-        for j in 1:ncol(ds)
-            if DataAPI.refpool(_columns(ds)[j]) !== nothing
-                _res = allocatecol(_columns(ds)[j], total_new, addmissing = false)
-                _columns(ds)[j].refs = repeat_lengths_v2!(_res.refs, DataAPI.refarray(_columns(ds)[j]), lengths)
-            else
-                _res = allocatecol(_columns(ds)[j], total_new)
-                _columns(ds)[j] = repeat_lengths_v2!(_res, _columns(ds)[j], lengths)
-            end
-
+            throw(ArgumentError("The length of frequencies must match the number of rows in passed data set"))
         end
+        r_index = _create_index_for_repeat(lengths, Val(T))
+        _permute_ds_after_sort!(ds, r_index, check = false)
         _reset_grouping_info!(ds)
         _modified(_attributes(ds))
         ds
@@ -203,62 +203,109 @@ end
 function _fill_index_for_repeat!(res, w)
     counter = 1
     for i in 1:length(w)
-
-        l = w[i]
-        fill!(view(res, counter:(counter + l - 1)),  i)
-        counter += l
+        for j in 1:w[i]
+            res[counter] = i
+            counter += 1
+        end
     end
 end
 # use this to create index for new data set
 # and then use getindex with the result of this function for repeating
 # This should be better for repeating large data set/ since getindex is threaded
-function _create_index_for_repeat(w)
-    res = Vector{Int}(undef, sum(w))
+function _create_index_for_repeat(w, ::Val{T}) where T
+    res = Vector{T}(undef, sum(w))
     _fill_index_for_repeat!(res, w)
     res
 end
 
+function repeat!(ds::Dataset, count::Integer)
+
+# Modify Dataset
+    count <= 0 && throw(ArgumentError("count must be greater than zero"))
+    repeat!(ds, inner = 1, outer = count)
+    ds
+end
+
 """
-    repeat!(ds::Dataset, count::Integer)
+    repeat(ds::AbstractDataset; inner::Integer = 1, outer::Integer = 1, freq = nothing, view = false)
+    repeat(ds::AbstractDataset, count; view = false) = repeat(ds, outer = count, view = view)
 
-Update a data set `ds` in-place by repeating its rows the number of times
-specified by `count`. Columns of `ds` are freshly allocated.
+Variant of `repeat!` which leaves `ds` unmodified and returns a fresh copy of the passed data set with its rows repeated. If `view = true`, a view of `ds` that selects the repeated rows is returned instead of a copy.
 
 # Example
 ```jldoctest
 julia> ds = Dataset(a = 1:2, b = 3:4)
 2×2 Dataset
- Row │ a      b
-     │ Int64  Int64
-─────┼──────────────
-   1 │     1      3
-   2 │     2      4
+ Row │ a         b
+     │ identity  identity
+     │ Int64?    Int64?
+─────┼────────────────────
+   1 │        1         3
+   2 │        2         4
 
-julia> repeat(ds, 2)
-4×2 Dataset
- Row │ a      b
-     │ Int64  Int64
-─────┼──────────────
-   1 │     1      3
-   2 │     2      4
-   3 │     1      3
-   4 │     2      4
+julia> repeat(ds, inner = 2, outer = 3)
+12×2 Dataset
+ Row │ a         b
+     │ identity  identity
+     │ Int64?    Int64?
+─────┼────────────────────
+   1 │        1         3
+   2 │        1         3
+   3 │        2         4
+   4 │        2         4
+   5 │        1         3
+   6 │        1         3
+   7 │        2         4
+   8 │        2         4
+   9 │        1         3
+  10 │        1         3
+  11 │        2         4
+  12 │        2         4
+
+julia> repeat(ds, freq = :a)
+3×2 Dataset
+ Row │ a         b
+     │ identity  identity
+     │ Int64?    Int64?
+─────┼────────────────────
+   1 │        1         3
+   2 │        2         4
+   3 │        2         4
 ```
 """
-function repeat!(ds::Dataset, count::Integer)
-
-# Modify Dataset
-    count <= 0 && throw(ArgumentError("count must be greater than zero"))
-    repeat!(ds, inner = 1, outer = count)
-    ds
-end
+Base.repeat(ds::AbstractDataset, count::Integer; view = false) = repeat(ds, outer = count, view = view)
+function Base.repeat(ds::AbstractDataset; inner::Integer = 1, outer::Integer = 1, freq = nothing, view = false)
+    T = nrow(ds) < typemax(Int8) ? Int8 : nrow(ds) < typemax(Int32) ? Int32 : Int64
 
-Base.repeat(ds::AbstractDataset, count::Integer) = repeat!(copy(ds), count)
-function Base.repeat(ds::AbstractDataset; inner::Integer = 1, outer::Integer = 1, freq = nothing)
     if freq === nothing
-        repeat!(copy(ds), inner = inner, outer = outer)
+        if view
+            inner <= 0 && throw(ArgumentError("inner keyword argument must be greater than zero"))
+            outer <= 0 && throw(ArgumentError("outer keyword argument must be greater than zero"))
+            r_indx = repeat(T(1):T(nrow(ds)), inner = inner, outer = outer)
+            Base.view(ds, r_indx, :)
+        else
+            repeat!(copy(ds), inner = inner, outer = outer)
+        end
     else
-        repeat!(copy(ds), freq = freq)
+        if view
+            if freq isa SubDatasetColumn || freq isa DatasetColumn
+                lengths = __!(freq)
+            elseif freq isa ColumnIndex
+                lengths = _columns(ds)[index(ds)[freq]]
+            elseif freq isa AbstractVector
+                lengths = freq
+            end
+            if !(eltype(lengths) <: Union{Missing, Integer}) || any(ismissing, lengths) || any(x->isless(x, 0), lengths)
+                throw(ArgumentError("The column selected for repeating must be an integer column with all values greater than or equal to zero and with no missing values"))
+            end
+            if length(lengths) != nrow(ds)
+                throw(ArgumentError("The length of frequencies must match the number of rows in passed data set"))
+            end
+            r_index = _create_index_for_repeat(lengths, Val(T))
+            Base.view(ds, r_index, :)
+        else
+            repeat!(copy(ds), freq = freq)
+        end
     end
 end
 
@@ -1169,3 +1216,30 @@ function mapcols(ds::AbstractDataset, f::Vector{T}, cols = :) where T <: Union{F
     end
     return Dataset(vs, names(ds, colsidx), copycols=false)
 end
+
+function _permute_ds_after_sort!(ds, perm; check = true, cols = :)
+    if check
+        @assert nrow(ds) == length(perm) "the length of perm and the nrow of the data set must match"
+
+        if issorted(perm)
+            return ds
+        end
+    end
+    colsidx = index(ds)[cols]
+    for j in 1:length(colsidx)
+        if DataAPI.refpool(_columns(ds)[colsidx[j]]) !== nothing
+            # if _columns(ds)[colsidx[j] isa PooledArray
+            #     pa = _columns(ds)[colsidx[j]
+            #     _columns(ds)[colsidx[j] = PooledArray(PooledArrays.RefArray(_threaded_permute(pa.refs, perm)), DataAPI.invrefpool(pa), DataAPI.refpool(pa), PooledArrays.refcount(pa))
+            # else
+            #     # TODO must be optimised
+            #     _columns(ds)[colsidx[j] = _columns(ds)[colsidx[j][perm]
+            # end
+            # since we don't support copycols for external usage it is safe to only permute refs
+            _columns(ds)[colsidx[j]].refs = _threaded_permute(_columns(ds)[colsidx[j]].refs, perm)
+        else
+            _columns(ds)[colsidx[j]] = _threaded_permute(_columns(ds)[colsidx[j]], perm)
+        end
+    end
+    _modified(_attributes(ds))
+end