Skip to content

Commit 156a943

Browse files
committed
clean up code
1 parent ba3a9fc commit 156a943

File tree

2 files changed

+0
-216
lines changed

2 files changed

+0
-216
lines changed

src/abstractdataset/abstractdataset.jl

Lines changed: 0 additions & 193 deletions
Original file line numberDiff line numberDiff line change
@@ -1555,199 +1555,6 @@ julia> ncol(ds)
15551555
"""
15561556
(nrow, ncol)
15571557

1558-
1559-
"""
1560-
flatten(ds::AbstractDataset, cols)
1561-
1562-
When columns `cols` of data set `ds` have iterable elements that define
1563-
`length` (for example a `Vector` of `Vector`s), return a `Dataset` where each
1564-
element of each `col` in `cols` is flattened, meaning the column corresponding
1565-
to `col` becomes a longer vector where the original entries are concatenated.
1566-
Elements of row `i` of `ds` in columns other than `cols` will be repeated
1567-
according to the length of `ds[i, col]`. These lengths must therefore be the
1568-
same for each `col` in `cols`, or else an error is raised. Note that these
1569-
elements are not copied, and thus if they are mutable changing them in the
1570-
returned `Dataset` will affect `ds`.
1571-
1572-
`cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR).
1573-
1574-
# Examples
1575-
1576-
```jldoctest
1577-
julia> ds1 = Dataset(a = [1, 2], b = [[1, 2], [3, 4]], c = [[5, 6], [7, 8]])
1578-
2×3 Dataset
1579-
Row │ a b c
1580-
│ identity identity identity
1581-
│ Int64? Array…? Array…?
1582-
─────┼──────────────────────────────
1583-
1 │ 1 [1, 2] [5, 6]
1584-
2 │ 2 [3, 4] [7, 8]
1585-
1586-
julia> flatten(ds1, :b)
1587-
4×3 Dataset
1588-
Row │ a b c
1589-
│ identity identity identity
1590-
│ Int64? Int64? Array…?
1591-
─────┼──────────────────────────────
1592-
1 │ 1 1 [5, 6]
1593-
2 │ 1 2 [5, 6]
1594-
3 │ 2 3 [7, 8]
1595-
4 │ 2 4 [7, 8]
1596-
1597-
julia> flatten(ds1, [:b, :c])
1598-
4×3 Dataset
1599-
Row │ a b c
1600-
│ identity identity identity
1601-
│ Int64? Int64? Int64?
1602-
─────┼──────────────────────────────
1603-
1 │ 1 1 5
1604-
2 │ 1 2 6
1605-
3 │ 2 3 7
1606-
4 │ 2 4 8
1607-
1608-
julia> ds2 = Dataset(a = [1, 2], b = [("p", "q"), ("r", "s")])
1609-
2×2 Dataset
1610-
Row │ a b
1611-
│ identity identity
1612-
│ Int64? Tuple…?
1613-
─────┼──────────────────────
1614-
1 │ 1 ("p", "q")
1615-
2 │ 2 ("r", "s")
1616-
1617-
julia> flatten(ds2, :b)
1618-
4×2 Dataset
1619-
Row │ a b
1620-
│ identity identity
1621-
│ Int64? String?
1622-
─────┼────────────────────
1623-
1 │ 1 p
1624-
2 │ 1 q
1625-
3 │ 2 r
1626-
4 │ 2 s
1627-
1628-
julia> ds3 = Dataset(a = [1, 2], b = [[1, 2], [3, 4]], c = [[5, 6], [7]])
1629-
2×3 Dataset
1630-
Row │ a b c
1631-
│ identity identity identity
1632-
│ Int64? Array…? Array…?
1633-
─────┼──────────────────────────────
1634-
1 │ 1 [1, 2] [5, 6]
1635-
2 │ 2 [3, 4] [7]
1636-
1637-
julia> flatten(ds3, [:b, :c])
1638-
ERROR: ArgumentError: Lengths of iterables stored in columns :b and :c are not the same in row 2
1639-
```
1640-
"""
1641-
flatten(ds, cols)
1642-
#
1643-
# function flatten(ds::AbstractDataset,
1644-
# cols::Union{ColumnIndex, MultiColumnIndex})
1645-
# # Create Dataset
1646-
# _check_consistency(ds)
1647-
#
1648-
# idxcols = index(ds)[cols]
1649-
# isempty(idxcols) && return copy(ds)
1650-
# col1 = first(idxcols)
1651-
# lengths = length.(_columns(ds)[col1])
1652-
# for col in idxcols
1653-
# v = _columns(ds)[col]
1654-
# if any(x -> length(x[1]) != x[2], zip(v, lengths))
1655-
# r = findfirst(x -> x != 0, length.(v) .- lengths)
1656-
# colnames = _names(ds)
1657-
# throw(ArgumentError("Lengths of iterables stored in columns :$(colnames[col1]) " *
1658-
# "and :$(colnames[col]) are not the same in row $r"))
1659-
# end
1660-
# end
1661-
#
1662-
# new_ds = similar(ds[!, Not(cols)], sum(lengths))
1663-
# for name in _names(new_ds)
1664-
# repeat_lengths!(new_ds[!, name].val, ds[!, name].val, lengths)
1665-
# end
1666-
# length(idxcols) > 1 && sort!(idxcols)
1667-
# for col in idxcols
1668-
# col_to_flatten = _columns(ds)[col]
1669-
# flattened_col = col_to_flatten isa AbstractVector{<:AbstractVector} ?
1670-
# reduce(vcat, col_to_flatten) :
1671-
# collect(Iterators.flatten(col_to_flatten))
1672-
#
1673-
# insertcols!(new_ds, col, _names(ds)[col] => flattened_col)
1674-
# end
1675-
# setformat!(new_ds, index(ds).format)
1676-
# setinfo!(new_ds, _attributes(ds).meta.info[])
1677-
# _reset_grouping_info!(new_ds)
1678-
# new_ds
1679-
# # TODO actually the grouping info can be kept but needs more work, since the starts would change
1680-
# # if idxcols ∈ Ref(index(ds).sortedcols)
1681-
# # return new_ds
1682-
# # else
1683-
# # _copy_grouping_info!(new_ds, ds)
1684-
# # return new_ds
1685-
# # end
1686-
# end
1687-
1688-
1689-
# Element type contributed by a single cell of a column that is being flattened:
# a `missing` cell contributes `Missing`, any other cell contributes its `eltype`.
function _ELTYPE(::Missing)
    return Missing
end
function _ELTYPE(x)
    return eltype(x)
end

# Number of entries a single cell occupies after flattening:
# a `missing` cell counts as one entry, any other cell counts `length(cell)` entries.
function _LENGTH(::Missing)
    return 1
end
function _LENGTH(x)
    return length(x)
end
1693-
1694-
# Flatten the iterable cells stored in columns `cols` of `ds` into a new data set.
#
# Row by row, the cells of every selected column must have the same `_LENGTH` as the
# cells of the first selected column (a `missing` cell counts as length 1); otherwise an
# `ArgumentError` reporting the first mismatching row is thrown. When `cols` selects no
# column a copy of `ds` is returned. The result is built from the non-selected columns
# (filled via `repeat_lengths_v2!` with the per-row lengths) plus one flattened column per
# selected column, inserted at the column's index in `ds`. The format of `ds` is copied to
# the result, its info is set on the result, and `_reset_grouping_info!` is applied.
function flatten(ds::AbstractDataset,
                 cols::Union{ColumnIndex, MultiColumnIndex})
    _check_consistency(ds)

    idxcols = index(ds)[cols]
    isempty(idxcols) && return copy(ds)
    col1 = first(idxcols)
    lengths = _LENGTH.(_columns(ds)[col1])
    for col in idxcols
        v = _columns(ds)[col]
        # One pass per column: find the first row whose cell length differs from `col1`.
        r = findfirst(i -> _LENGTH(v[i]) != lengths[i], eachindex(v, lengths))
        if r !== nothing
            colnames = _names(ds)
            throw(ArgumentError("Lengths of iterables stored in columns :$(colnames[col1]) " *
                                "and :$(colnames[col]) are not the same in row $r"))
        end
    end
    new_total = sum(lengths)
    new_ds = similar(ds[!, Not(cols)], new_total)
    for name in _names(new_ds)
        repeat_lengths_v2!(new_ds[!, name].val, ds[!, name].val, lengths)
    end
    # Insert in ascending index order so each column lands at its original index.
    length(idxcols) > 1 && sort!(idxcols)
    for col in idxcols
        col_to_flatten = _columns(ds)[col]
        # `Union{}` is the neutral element of `promote_type`: non-empty columns give the
        # same `T` as before, while a zero-row column no longer makes the reduction throw.
        T = mapreduce(_ELTYPE, promote_type, col_to_flatten; init = Union{})
        _res = allocatecol(T, new_total)
        _fill_flatten!(_res, col_to_flatten, lengths)
        insertcols!(new_ds, col, _names(ds)[col] => _res)
    end
    setformat!(new_ds, copy(index(ds).format))
    setinfo!(new_ds, _attributes(ds).meta.info[])
    _reset_grouping_info!(new_ds)
    return new_ds
end
1729-
1730-
1731-
# Function barrier for `_fill_flatten!`: write the items of one cell `val` into `_res`
# at consecutive positions starting at `counter`, and return the position that follows
# the last item written (`counter` itself when `val` yields nothing).
function _fill_flatten!_barrier(_res, val, counter)
    return foldl(val; init = counter) do pos, item
        _res[pos] = item
        pos + 1
    end
end
1738-
1739-
# Fill `_res` with the concatenated contents of the cells of `col_to_flatten`.
# A `missing` cell occupies a single slot holding `missing`; every other cell is written
# item by item through `_fill_flatten!_barrier`. `lengths` is accepted but not read here.
# Returns `nothing`.
function _fill_flatten!(_res, col_to_flatten, lengths)
    pos = 1
    for cell in col_to_flatten
        if cell === missing
            _res[pos] = missing
            pos += 1
        else
            pos = _fill_flatten!_barrier(_res, cell, pos)
        end
    end
    return nothing
end
1750-
17511558
function repeat_lengths_v2!(longnew::AbstractVector, shortold::AbstractVector,
17521559
lengths)
17531560
counter = 1

src/sort/util.jl

Lines changed: 0 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -118,26 +118,3 @@ function _mark_start_of_groups_sorted!(inbits, x, lo, hi, o, ::Val{T}) where T
118118
end
119119
end
120120
end
121-
122-
# Apply the permutation `perm` to every column of `ds` via `_threaded_permute`.
#
# `perm` must have one entry per row of `ds`. When `perm` is already sorted nothing is
# changed and `ds` is returned; otherwise the columns are replaced/updated in place and
# the value of `_modified(_attributes(ds))` is returned.
function _permute_ds_after_sort!(ds, perm)
    @assert nrow(ds) == length(perm) "the length of perm and the nrow of the data set must match"
    issorted(perm) && return ds
    for j in 1:ncol(ds)
        col = _columns(ds)[j]
        if DataAPI.refpool(col) === nothing
            # No reference pool: store the permuted column in place of the old one.
            _columns(ds)[j] = _threaded_permute(col, perm)
        else
            # since we don't support copycols for external usage it is safe to only permute refs
            col.refs = _threaded_permute(col.refs, perm)
        end
    end
    return _modified(_attributes(ds))
end

0 commit comments

Comments
 (0)