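Clean up code: remove `flatten` and its helpers from abstractdataset.jl and `_permute_ds_after_sort!` from sort/util.jl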

sl-solution · sl-solution · commit 156a94393d16 · 2022-01-04T20:01:04.000+13:00
diff --git a/src/abstractdataset/abstractdataset.jl b/src/abstractdataset/abstractdataset.jl
@@ -1555,199 +1555,6 @@ julia> ncol(ds)
 """
 (nrow, ncol)
 
-
-"""
-    flatten(ds::AbstractDataset, cols)
-
-When columns `cols` of data set `ds` have iterable elements that define
-`length` (for example a `Vector` of `Vector`s), return a `Dataset` where each
-element of each `col` in `cols` is flattened, meaning the column corresponding
-to `col` becomes a longer vector where the original entries are concatenated.
-Elements of row `i` of `ds` in columns other than `cols` will be repeated
-according to the length of `ds[i, col]`. These lengths must therefore be the
-same for each `col` in `cols`, or else an error is raised. Note that these
-elements are not copied, and thus if they are mutable changing them in the
-returned `Dataset` will affect `ds`.
-
-`cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR).
-
-# Examples
-
-```jldoctest
-julia> ds1 = Dataset(a = [1, 2], b = [[1, 2], [3, 4]], c = [[5, 6], [7, 8]])
-2×3 Dataset
- Row │ a         b         c
-     │ identity  identity  identity
-     │ Int64?    Array…?   Array…?
-─────┼──────────────────────────────
-   1 │        1  [1, 2]    [5, 6]
-   2 │        2  [3, 4]    [7, 8]
-
-julia> flatten(ds1, :b)
-4×3 Dataset
- Row │ a         b         c
-     │ identity  identity  identity
-     │ Int64?    Int64?    Array…?
-─────┼──────────────────────────────
-   1 │        1         1  [5, 6]
-   2 │        1         2  [5, 6]
-   3 │        2         3  [7, 8]
-   4 │        2         4  [7, 8]
-
-julia> flatten(ds1, [:b, :c])
-4×3 Dataset
- Row │ a         b         c
-     │ identity  identity  identity
-     │ Int64?    Int64?    Int64?
-─────┼──────────────────────────────
-   1 │        1         1         5
-   2 │        1         2         6
-   3 │        2         3         7
-   4 │        2         4         8
-
-julia> ds2 = Dataset(a = [1, 2], b = [("p", "q"), ("r", "s")])
-2×2 Dataset
- Row │ a         b
-     │ identity  identity
-     │ Int64?    Tuple…?
-─────┼──────────────────────
-   1 │        1  ("p", "q")
-   2 │        2  ("r", "s")
-
-julia> flatten(ds2, :b)
-4×2 Dataset
- Row │ a         b
-     │ identity  identity
-     │ Int64?    String?
-─────┼────────────────────
-   1 │        1  p
-   2 │        1  q
-   3 │        2  r
-   4 │        2  s
-
-julia> ds3 = Dataset(a = [1, 2], b = [[1, 2], [3, 4]], c = [[5, 6], [7]])
-2×3 Dataset
- Row │ a         b         c
-     │ identity  identity  identity
-     │ Int64?    Array…?   Array…?
-─────┼──────────────────────────────
-   1 │        1  [1, 2]    [5, 6]
-   2 │        2  [3, 4]    [7]
-
-julia> flatten(ds3, [:b, :c])
-ERROR: ArgumentError: Lengths of iterables stored in columns :b and :c are not the same in row 2
-```
-"""
-flatten(ds, cols)
-#
-# function flatten(ds::AbstractDataset,
-#                  cols::Union{ColumnIndex, MultiColumnIndex})
-#     # Create Dataset
-#     _check_consistency(ds)
-#
-#     idxcols = index(ds)[cols]
-#     isempty(idxcols) && return copy(ds)
-#     col1 = first(idxcols)
-#     lengths = length.(_columns(ds)[col1])
-#     for col in idxcols
-#         v = _columns(ds)[col]
-#         if any(x -> length(x[1]) != x[2], zip(v, lengths))
-#             r = findfirst(x -> x != 0, length.(v) .- lengths)
-#             colnames = _names(ds)
-#             throw(ArgumentError("Lengths of iterables stored in columns :$(colnames[col1]) " *
-#                                 "and :$(colnames[col]) are not the same in row $r"))
-#         end
-#     end
-#
-#     new_ds = similar(ds[!, Not(cols)], sum(lengths))
-#     for name in _names(new_ds)
-#         repeat_lengths!(new_ds[!, name].val, ds[!, name].val, lengths)
-#     end
-#     length(idxcols) > 1 && sort!(idxcols)
-#     for col in idxcols
-#         col_to_flatten = _columns(ds)[col]
-#         flattened_col = col_to_flatten isa AbstractVector{<:AbstractVector} ?
-#             reduce(vcat, col_to_flatten) :
-#             collect(Iterators.flatten(col_to_flatten))
-#
-#         insertcols!(new_ds, col, _names(ds)[col] => flattened_col)
-#     end
-#     setformat!(new_ds, index(ds).format)
-#     setinfo!(new_ds, _attributes(ds).meta.info[])
-#     _reset_grouping_info!(new_ds)
-#     new_ds
-#     # TODO actually the grouping info can be kept but needs more work, since the starts would change
-#     # if idxcols ∈ Ref(index(ds).sortedcols)
-#     #     return new_ds
-#     # else
-#     #     _copy_grouping_info!(new_ds, ds)
-#     #     return new_ds
-#     # end
-# end
-
-
-_ELTYPE(x) = eltype(x)
-_ELTYPE(::Missing) = Missing
-_LENGTH(x) = length(x)
-_LENGTH(::Missing) = 1
-
-function flatten(ds::AbstractDataset,
-                 cols::Union{ColumnIndex, MultiColumnIndex})
-     _check_consistency(ds)
-
-     idxcols = index(ds)[cols]
-     isempty(idxcols) && return copy(ds)
-     col1 = first(idxcols)
-     lengths = _LENGTH.(_columns(ds)[col1])
-     for col in idxcols
-         v = _columns(ds)[col]
-         if any(x -> _LENGTH(x[1]) != x[2], zip(v, lengths))
-             r = findfirst(x -> x != 0, _LENGTH.(v) .- lengths)
-             colnames = _names(ds)
-             throw(ArgumentError("Lengths of iterables stored in columns :$(colnames[col1]) " *
-                                 "and :$(colnames[col]) are not the same in row $r"))
-         end
-     end
-     new_total = sum(lengths)
-     new_ds = similar(ds[!, Not(cols)], new_total)
-     for name in _names(new_ds)
-        repeat_lengths_v2!(new_ds[!, name].val, ds[!, name].val, lengths)
-     end
-     length(idxcols) > 1 && sort!(idxcols)
-     for col in idxcols
-         col_to_flatten = _columns(ds)[col]
-         T = mapreduce(_ELTYPE, promote_type, col_to_flatten)
-         _res = allocatecol(T, new_total)
-         _fill_flatten!(_res, col_to_flatten, lengths)
-         insertcols!(new_ds, col, _names(ds)[col] => _res)
-     end
-     setformat!(new_ds, copy(index(ds).format))
-     setinfo!(new_ds, _attributes(ds).meta.info[])
-     _reset_grouping_info!(new_ds)
-     new_ds
-end
-
-
-function _fill_flatten!_barrier(_res, val, counter)
-    for j in val
-        _res[counter] = j
-        counter += 1
-    end
-    counter
-end
-
-function _fill_flatten!(_res, col_to_flatten, lengths)
-    counter = 1
-    for i in 1:length(col_to_flatten)
-        if ismissing(col_to_flatten[i])
-            _res[counter] = missing
-            counter += 1
-        else
-            counter = _fill_flatten!_barrier(_res, col_to_flatten[i], counter)
-        end
-    end
-end
-
 function repeat_lengths_v2!(longnew::AbstractVector, shortold::AbstractVector,
                          lengths)
     counter = 1
diff --git a/src/sort/util.jl b/src/sort/util.jl
@@ -118,26 +118,3 @@ function _mark_start_of_groups_sorted!(inbits, x, lo, hi, o, ::Val{T}) where T
         end
     end
 end
-
-function _permute_ds_after_sort!(ds, perm)
-    @assert nrow(ds) == length(perm) "the length of perm and the nrow of the data set must match"
-    if issorted(perm)
-        return ds
-    end
-    for j in 1:ncol(ds)
-        if DataAPI.refpool(_columns(ds)[j]) !== nothing
-            # if _columns(ds)[j] isa PooledArray
-            #     pa = _columns(ds)[j]
-            #     _columns(ds)[j] = PooledArray(PooledArrays.RefArray(_threaded_permute(pa.refs, perm)), DataAPI.invrefpool(pa), DataAPI.refpool(pa), PooledArrays.refcount(pa))
-            # else
-            #     # TODO must be optimised
-            #     _columns(ds)[j] = _columns(ds)[j][perm]
-            # end
-            # since we don't support copycols for external usage it is safe to only permute refs
-            _columns(ds)[j].refs = _threaded_permute(_columns(ds)[j].refs, perm)
-        else
-            _columns(ds)[j] = _threaded_permute(_columns(ds)[j], perm)
-        end
-    end
-    _modified(_attributes(ds))
-end