@@ -1555,199 +1555,6 @@ julia> ncol(ds)
15551555"""
15561556(nrow, ncol)
15571557
1558-
1559- """
1560- flatten(ds::AbstractDataset, cols)
1561-
1562- When columns `cols` of data set `ds` have iterable elements that define
1563- `length` (for example a `Vector` of `Vector`s), return a `Dataset` where each
1564- element of each `col` in `cols` is flattened, meaning the column corresponding
1565- to `col` becomes a longer vector where the original entries are concatenated.
1566- Elements of row `i` of `ds` in columns other than `cols` will be repeated
1567- according to the length of `ds[i, col]`. These lengths must therefore be the
1568- same for each `col` in `cols`, or else an error is raised. Note that these
1569- elements are not copied, and thus if they are mutable changing them in the
1570- returned `Dataset` will affect `ds`.
1571-
1572- `cols` can be any column selector ($COLUMNINDEX_STR ; $MULTICOLUMNINDEX_STR ).
1573-
1574- # Examples
1575-
1576- ```jldoctest
1577- julia> ds1 = Dataset(a = [1, 2], b = [[1, 2], [3, 4]], c = [[5, 6], [7, 8]])
1578- 2×3 Dataset
1579- Row │ a b c
1580- │ identity identity identity
1581- │ Int64? Array…? Array…?
1582- ─────┼──────────────────────────────
1583- 1 │ 1 [1, 2] [5, 6]
1584- 2 │ 2 [3, 4] [7, 8]
1585-
1586- julia> flatten(ds1, :b)
1587- 4×3 Dataset
1588- Row │ a b c
1589- │ identity identity identity
1590- │ Int64? Int64? Array…?
1591- ─────┼──────────────────────────────
1592- 1 │ 1 1 [5, 6]
1593- 2 │ 1 2 [5, 6]
1594- 3 │ 2 3 [7, 8]
1595- 4 │ 2 4 [7, 8]
1596-
1597- julia> flatten(ds1, [:b, :c])
1598- 4×3 Dataset
1599- Row │ a b c
1600- │ identity identity identity
1601- │ Int64? Int64? Int64?
1602- ─────┼──────────────────────────────
1603- 1 │ 1 1 5
1604- 2 │ 1 2 6
1605- 3 │ 2 3 7
1606- 4 │ 2 4 8
1607-
1608- julia> ds2 = Dataset(a = [1, 2], b = [("p", "q"), ("r", "s")])
1609- 2×2 Dataset
1610- Row │ a b
1611- │ identity identity
1612- │ Int64? Tuple…?
1613- ─────┼──────────────────────
1614- 1 │ 1 ("p", "q")
1615- 2 │ 2 ("r", "s")
1616-
1617- julia> flatten(ds2, :b)
1618- 4×2 Dataset
1619- Row │ a b
1620- │ identity identity
1621- │ Int64? String?
1622- ─────┼────────────────────
1623- 1 │ 1 p
1624- 2 │ 1 q
1625- 3 │ 2 r
1626- 4 │ 2 s
1627-
1628- julia> ds3 = Dataset(a = [1, 2], b = [[1, 2], [3, 4]], c = [[5, 6], [7]])
1629- 2×3 Dataset
1630- Row │ a b c
1631- │ identity identity identity
1632- │ Int64? Array…? Array…?
1633- ─────┼──────────────────────────────
1634- 1 │ 1 [1, 2] [5, 6]
1635- 2 │ 2 [3, 4] [7]
1636-
1637- julia> flatten(ds3, [:b, :c])
1638- ERROR: ArgumentError: Lengths of iterables stored in columns :b and :c are not the same in row 2
1639- ```
1640- """
1641- flatten (ds, cols)
1642- #
1643- # function flatten(ds::AbstractDataset,
1644- # cols::Union{ColumnIndex, MultiColumnIndex})
1645- # # Create Dataset
1646- # _check_consistency(ds)
1647- #
1648- # idxcols = index(ds)[cols]
1649- # isempty(idxcols) && return copy(ds)
1650- # col1 = first(idxcols)
1651- # lengths = length.(_columns(ds)[col1])
1652- # for col in idxcols
1653- # v = _columns(ds)[col]
1654- # if any(x -> length(x[1]) != x[2], zip(v, lengths))
1655- # r = findfirst(x -> x != 0, length.(v) .- lengths)
1656- # colnames = _names(ds)
1657- # throw(ArgumentError("Lengths of iterables stored in columns :$(colnames[col1]) " *
1658- # "and :$(colnames[col]) are not the same in row $r"))
1659- # end
1660- # end
1661- #
1662- # new_ds = similar(ds[!, Not(cols)], sum(lengths))
1663- # for name in _names(new_ds)
1664- # repeat_lengths!(new_ds[!, name].val, ds[!, name].val, lengths)
1665- # end
1666- # length(idxcols) > 1 && sort!(idxcols)
1667- # for col in idxcols
1668- # col_to_flatten = _columns(ds)[col]
1669- # flattened_col = col_to_flatten isa AbstractVector{<:AbstractVector} ?
1670- # reduce(vcat, col_to_flatten) :
1671- # collect(Iterators.flatten(col_to_flatten))
1672- #
1673- # insertcols!(new_ds, col, _names(ds)[col] => flattened_col)
1674- # end
1675- # setformat!(new_ds, index(ds).format)
1676- # setinfo!(new_ds, _attributes(ds).meta.info[])
1677- # _reset_grouping_info!(new_ds)
1678- # new_ds
1679- # # TODO actually the grouping info can be kept but needs more work, since the starts would change
1680- # # if idxcols ∈ Ref(index(ds).sortedcols)
1681- # # return new_ds
1682- # # else
1683- # # _copy_grouping_info!(new_ds, ds)
1684- # # return new_ds
1685- # # end
1686- # end
1687-
1688-
# Element type contributed by a single entry of a column being flattened.
# A `missing` entry has no elements to inspect, so it contributes `Missing`.
function _ELTYPE(x)
    return eltype(x)
end

function _ELTYPE(::Missing)
    return Missing
end
# Number of output rows produced by a single entry of a column being flattened.
# A `missing` entry is counted as one element.
function _LENGTH(x)
    return length(x)
end

function _LENGTH(::Missing)
    return 1
end
1693-
# Flatten the iterable entries stored in columns `cols` of `ds` into a new data set.
#
# Per-row entry lengths are measured with `_LENGTH`, so a `missing` entry counts as
# a single element. Every selected column must have the same per-row lengths as the
# first selected column; otherwise an `ArgumentError` naming the two columns and the
# first offending row is thrown. If `cols` selects nothing, a copy of `ds` is returned.
function flatten(ds::AbstractDataset,
                 cols::Union{ColumnIndex, MultiColumnIndex})
    _check_consistency(ds)

    idxcols = index(ds)[cols]
    if isempty(idxcols)
        return copy(ds)
    end

    # The first selected column provides the reference lengths for all the others.
    col1 = first(idxcols)
    lengths = _LENGTH.(_columns(ds)[col1])
    for col in idxcols
        entries = _columns(ds)[col]
        mismatch = any(((entry, len),) -> _LENGTH(entry) != len, zip(entries, lengths))
        if mismatch
            # Locate the first row whose length differs, for the error message.
            r = findfirst(!=(0), _LENGTH.(entries) .- lengths)
            colnames = _names(ds)
            throw(ArgumentError("Lengths of iterables stored in columns :$(colnames[col1]) " *
                                "and :$(colnames[col]) are not the same in row $r"))
        end
    end

    total_rows = sum(lengths)

    # Start from the columns that are not flattened, sized to the new row count,
    # and fill each of them from the original column via `repeat_lengths_v2!`.
    new_ds = similar(ds[!, Not(cols)], total_rows)
    for name in _names(new_ds)
        repeat_lengths_v2!(new_ds[!, name].val, ds[!, name].val, lengths)
    end

    # Insert flattened columns in increasing position order so each one is placed
    # back at its original column index.
    if length(idxcols) > 1
        sort!(idxcols)
    end
    for col in idxcols
        source = _columns(ds)[col]
        # Common element type across all entries (`missing` contributes `Missing`).
        T = mapreduce(_ELTYPE, promote_type, source)
        flattened = allocatecol(T, total_rows)
        _fill_flatten!(flattened, source, lengths)
        insertcols!(new_ds, col, _names(ds)[col] => flattened)
    end

    # Carry over formats (copied, not shared) and the info attribute of `ds`.
    setformat!(new_ds, copy(index(ds).format))
    setinfo!(new_ds, _attributes(ds).meta.info[])
    # NOTE(review): presumably clears grouping metadata because row positions changed — confirm.
    _reset_grouping_info!(new_ds)
    return new_ds
end
1729-
1730-
# Write every element of `val` into `_res`, starting at position `counter`, and
# return the position immediately after the last element written.
# Kept as a separate function so the inner loop is compiled for the concrete
# type of `val`.
function _fill_flatten!_barrier(_res, val, counter)
    pos = counter
    for element in val
        _res[pos] = element
        pos += 1
    end
    return pos
end
1738-
# Fill `_res` with the concatenated elements of every entry of `col_to_flatten`.
# A `missing` entry occupies exactly one slot, which is set to `missing`; any other
# entry is expanded element by element through `_fill_flatten!_barrier`.
# `lengths` is accepted but not read here.
function _fill_flatten!(_res, col_to_flatten, lengths)
    pos = 1
    for entry in col_to_flatten
        if ismissing(entry)
            _res[pos] = missing
            pos += 1
        else
            pos = _fill_flatten!_barrier(_res, entry, pos)
        end
    end
end
1750-
17511558function repeat_lengths_v2! (longnew:: AbstractVector , shortold:: AbstractVector ,
17521559 lengths)
17531560 counter = 1
0 commit comments