Skip to content

Commit d9be560

Browse files
committed
Revert "Revert "support mapformats in flatten/!""
This reverts commit 53788e6.
1 parent 53788e6 commit d9be560

File tree

3 files changed

+169
-26
lines changed

3 files changed

+169
-26
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,15 @@
55
* Users can now choose to include the observation ids of the left data set and/or the right data set in the output data set.
66
* Add a new function `eachgroup`. It allows iteration over each group of a grouped data set.
77
* `op` is a new keyword argument for the `update/!` functions which allows passing a user defined function to control how the value of the main data set should be updated by the values from the transaction data set. ([issue #55](https://github.com/sl-solution/InMemoryDatasets.jl/issues/55))
8+
* Support the `mapformats` keyword argument in `flatten/!`. Users can now flatten a data set based on the formatted values.
89

910
## Fixes
1011

1112
* The `combine` function now works correctly when a view of a data set is passed.
1213
* For the join functions, the `makeunique` argument is now passed correctly to the internal functions.
1314
* `update` and `update!` have the same `mode` option by default.
15+
* Fix the problem with preserving format of `SubDataset` in `flatten/!`
16+
* Fix the problem that caused `flatten!` to produce a copy of the data when an empty data set was passed to it.
1417

1518
# Version 0.7.0
1619

src/dataset/transpose.jl

Lines changed: 156 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -593,7 +593,7 @@ Base.transpose(ds::Union{GroupBy, GatherBy}, cols::Tuple; id = nothing, renameco
593593

594594

595595
"""
596-
flatten(ds::AbstractDataset, cols)
596+
flatten(ds::AbstractDataset, cols; mapformats = false)
597597
598598
When columns `cols` of data set `ds` have iterable elements that define
599599
`length` (for example a `Vector` of `Vector`s), return a `Dataset` where each
@@ -605,6 +605,8 @@ same for each `col` in `cols`, or else an error is raised. Note that these
605605
elements are not copied, and thus if they are mutable changing them in the
606606
returned `Dataset` will affect `ds`.
607607
608+
When `mapformats = true`, the function uses the formatted values of `cols`.
609+
608610
`cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR).
609611
610612
See [`flatten!`](@ref)
@@ -674,34 +676,140 @@ julia> ds3 = Dataset(a = [1, 2], b = [[1, 2], [3, 4]], c = [[5, 6], [7]])
674676
675677
julia> flatten(ds3, [:b, :c])
676678
ERROR: ArgumentError: Lengths of iterables stored in columns :b and :c are not the same in row 2
679+
680+
julia> ds = Dataset(x=1:3, y=["ab,cd", "e", missing], z=[[1,2], 2, 3])
681+
3×3 Dataset
682+
Row │ x y z
683+
│ identity identity identity
684+
│ Int64? String? Any
685+
─────┼──────────────────────────────
686+
1 │ 1 ab,cd [1, 2]
687+
2 │ 2 e 2
688+
3 │ 3 missing 3
689+
690+
julia> fmt(x) = split(x, ",")
691+
fmt (generic function with 1 method)
692+
693+
julia> fmt(::Missing) = missing
694+
fmt (generic function with 2 methods)
695+
696+
julia> setformat!(ds, :y => fmt)
697+
3×3 Dataset
698+
Row │ x y z
699+
│ identity fmt identity
700+
│ Int64? String? Any
701+
─────┼───────────────────────────────────────────────────
702+
1 │ 1 SubString{String}["ab", "cd"] [1, 2]
703+
2 │ 2 SubString{String}["e"] 2
704+
3 │ 3 missing 3
705+
706+
julia> flatten(ds, :y)
707+
7×3 Dataset
708+
Row │ x y z
709+
│ identity identity identity
710+
│ Int64? Char? Any
711+
─────┼──────────────────────────────
712+
1 │ 1 a [1, 2]
713+
2 │ 1 b [1, 2]
714+
3 │ 1 , [1, 2]
715+
4 │ 1 c [1, 2]
716+
5 │ 1 d [1, 2]
717+
6 │ 2 e 2
718+
7 │ 3 missing 3
719+
720+
julia> flatten(ds, :y, mapformats = true)
721+
4×3 Dataset
722+
Row │ x y z
723+
│ identity identity identity
724+
│ Int64? SubStrin…? Any
725+
─────┼────────────────────────────────
726+
1 │ 1 ab [1, 2]
727+
2 │ 1 cd [1, 2]
728+
3 │ 2 e 2
729+
4 │ 3 missing 3
730+
731+
julia> flatten(ds, 2:3, mapformats = true)
732+
4×3 Dataset
733+
Row │ x y z
734+
│ identity identity identity
735+
│ Int64? SubStrin…? Int64?
736+
─────┼────────────────────────────────
737+
1 │ 1 ab 1
738+
2 │ 1 cd 2
739+
3 │ 2 e 2
740+
4 │ 3 missing 3
677741
```
678742
"""
679743
flatten(ds, cols)
680744

681745
"""
682-
flatten!(ds, cols)
746+
flatten!(ds, cols; mapformats = false)
683747
684748
Variant of `flatten` that flattens `ds` in-place.
685749
"""
686750
flatten!
687751

688-
_ELTYPE(x) = eltype(x)
689-
_ELTYPE(::Missing) = Missing
690-
_LENGTH(x) = length(x)
691-
_LENGTH(::Missing) = 1
752+
"""
    _ELTYPE(x; fmt = identity)

Return the element type that the cell `x` contributes when its column is
flattened.

`fmt` is applied to `x` first (the default `identity` leaves `x` unchanged).
A formatted value that is `missing` contributes `Missing`; any other formatted
value contributes its `eltype`.
"""
_ELTYPE(x; fmt = identity) = _eltype_or_missing(fmt(x))

# Dispatch on the *formatted* value rather than on the raw cell, so a format
# that maps a non-missing value to `missing` yields `Missing` here, matching
# the `ismissing(fmt(...))` test used when the flattened column is filled.
_eltype_or_missing(x) = eltype(x)
_eltype_or_missing(::Missing) = Missing
768+
769+
770+
"""
    _LENGTH(x; fmt = identity)

Return the number of rows the cell `x` occupies after flattening.

`fmt` is applied to `x` first (the default `identity` leaves `x` unchanged).
A formatted value that is `missing` occupies exactly one row; any other
formatted value occupies `length(fmt(x))` rows.
"""
_LENGTH(x; fmt = identity) = _length_or_one(fmt(x))

# Dispatch on the *formatted* value rather than on the raw cell, so a format
# that maps a non-missing value to `missing` counts as one row instead of
# calling `length(missing)`, which has no method.
_length_or_one(x) = length(x)
_length_or_one(::Missing) = 1
789+
692790

693791
function flatten!(ds::Dataset,
694-
cols::Union{ColumnIndex, MultiColumnIndex})
792+
cols::Union{ColumnIndex, MultiColumnIndex}; mapformats = false)
695793
_check_consistency(ds)
696794

697795
idxcols = index(ds)[cols]
698-
isempty(idxcols) && return copy(ds)
796+
isempty(idxcols) && return ds
699797
col1 = first(idxcols)
700-
lengths = _LENGTH.(_columns(ds)[col1])
798+
if mapformats
799+
f_fmt = getformat(ds, col1)
800+
lengths = _LENGTH.(_columns(ds)[col1], fmt = f_fmt)
801+
else
802+
lengths = _LENGTH.(_columns(ds)[col1])
803+
end
701804
for col in idxcols
702805
v = _columns(ds)[col]
703-
if any(x -> _LENGTH(x[1]) != x[2], zip(v, lengths))
704-
r = findfirst(x -> x != 0, _LENGTH.(v) .- lengths)
806+
if mapformats
807+
f_fmt = getformat(ds, col)
808+
else
809+
f_fmt = identity
810+
end
811+
if any(x -> _LENGTH(x[1], fmt = f_fmt) != x[2], zip(v, lengths))
812+
r = findfirst(x -> x != 0, _LENGTH.(v, fmt = f_fmt) .- lengths)
705813
colnames = _names(ds)
706814
throw(ArgumentError("Lengths of iterables stored in columns :$(colnames[col1]) " *
707815
"and :$(colnames[col]) are not the same in row $r"))
@@ -713,9 +821,14 @@ function flatten!(ds::Dataset,
713821
length(idxcols) > 1 && sort!(idxcols)
714822
for col in idxcols
715823
col_to_flatten = _columns(ds)[col]
716-
T = mapreduce(_ELTYPE, promote_type, col_to_flatten)
824+
if mapformats
825+
f_fmt = getformat(ds, col)
826+
else
827+
f_fmt = identity
828+
end
829+
T = mapreduce(x->_ELTYPE(x, fmt = f_fmt), promote_type, col_to_flatten)
717830
_res = allocatecol(T, new_total)
718-
_fill_flatten!(_res, col_to_flatten, lengths)
831+
_fill_flatten!(_res, col_to_flatten, lengths; fmt = f_fmt)
719832
if length(idxcols) == ncol(ds)
720833
_columns(ds)[col] = _res
721834
else
@@ -729,17 +842,27 @@ end
729842

730843

731844
function flatten(ds::AbstractDataset,
732-
cols::Union{ColumnIndex, MultiColumnIndex})
845+
cols::Union{ColumnIndex, MultiColumnIndex}; mapformats = false)
733846
_check_consistency(ds)
734847

735848
idxcols = index(ds)[cols]
736849
isempty(idxcols) && return copy(ds)
737850
col1 = first(idxcols)
738-
lengths = _LENGTH.(_columns(ds)[col1])
851+
if mapformats
852+
f_fmt = getformat(ds, col1)
853+
lengths = _LENGTH.(_columns(ds)[col1], fmt = f_fmt)
854+
else
855+
lengths = _LENGTH.(_columns(ds)[col1])
856+
end
739857
for col in idxcols
740858
v = _columns(ds)[col]
741-
if any(x -> _LENGTH(x[1]) != x[2], zip(v, lengths))
742-
r = findfirst(x -> x != 0, _LENGTH.(v) .- lengths)
859+
if mapformats
860+
f_fmt = getformat(ds, col)
861+
else
862+
f_fmt = identity
863+
end
864+
if any(x -> _LENGTH(x[1], fmt = f_fmt) != x[2], zip(v, lengths))
865+
r = findfirst(x -> x != 0, _LENGTH.(v, fmt = f_fmt) .- lengths)
743866
colnames = _names(ds)
744867
throw(ArgumentError("Lengths of iterables stored in columns :$(colnames[col1]) " *
745868
"and :$(colnames[col]) are not the same in row $r"))
@@ -753,34 +876,41 @@ function flatten(ds::AbstractDataset,
753876
length(idxcols) > 1 && sort!(idxcols)
754877
for col in idxcols
755878
col_to_flatten = _columns(ds)[col]
756-
T = mapreduce(_ELTYPE, promote_type, col_to_flatten)
879+
if mapformats
880+
f_fmt = getformat(ds, col)
881+
else
882+
f_fmt = identity
883+
end
884+
T = mapreduce(x->_ELTYPE(x, fmt = f_fmt), promote_type, col_to_flatten)
757885
_res = allocatecol(T, new_total)
758-
_fill_flatten!(_res, col_to_flatten, lengths)
759-
insertcols!(new_ds, col, _names(ds)[col] => _res)
886+
_fill_flatten!(_res, col_to_flatten, lengths; fmt = f_fmt)
887+
insertcols!(new_ds, col, _names(ds)[col] => _res, unsupported_copy_cols = false)
888+
end
889+
for j in setdiff(1:ncol(ds), idxcols)
890+
setformat!(new_ds, j=>getformat(ds, j))
760891
end
761-
setformat!(new_ds, copy(index(ds).format))
762892
setinfo!(new_ds, _attributes(ds).meta.info[])
763893
_reset_grouping_info!(new_ds)
764894
new_ds
765895
end
766896

767897

768-
"""
    _fill_flatten!_barrier(_res, val, counter; fmt = identity)

Write every element of `fmt(val)` into `_res`, starting at index `counter`,
and return the index following the last element written.

This is a function barrier: it is called once per cell so that the inner loop
can be compiled for the concrete type of the value being iterated.
"""
function _fill_flatten!_barrier(_res, val, counter; fmt = identity)
    for item in fmt(val)
        _res[counter] = item
        counter += 1
    end
    return counter
end

"""
    _fill_flatten!(_res, col_to_flatten, lengths; fmt = identity)

Fill `_res` with the flattened contents of `col_to_flatten`.

Each cell is passed through `fmt` exactly once. A formatted value that is
`missing` occupies a single slot holding `missing`; any other formatted value
is iterated and its elements are written consecutively. `lengths` is accepted
for call compatibility and is not read here. Returns `nothing`.
"""
function _fill_flatten!(_res, col_to_flatten, lengths; fmt = identity)
    counter = 1
    for cell in col_to_flatten
        # Format once and reuse the result for both the missing test and the
        # copy, instead of evaluating `fmt` a second time inside the barrier.
        formatted = fmt(cell)
        if ismissing(formatted)
            _res[counter] = missing
            counter += 1
        else
            counter = _fill_flatten!_barrier(_res, formatted, counter)
        end
    end
    return nothing
end

test/transpose.jl

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -579,6 +579,16 @@ end
579579
@test ds_flat_cat == ref_cat
580580
flatten!(ds_cat, :b)
581581
@test ds_cat == ref_cat
582+
583+
ds = Dataset(x=1:4, y=["ab,bc","d","ef,gh",missing])
584+
fmt(x) = split(x, ",")
585+
fmt(::Missing) = missing
586+
setformat!(ds, 2 => fmt)
587+
@test flatten(ds, :y) == Dataset([[1, 1, 1, 1, 1, 2, 3, 3, 3, 3, 3, 4], Union{Missing, Char}['a', 'b', ',', 'b', 'c', 'd', 'e', 'f', ',', 'g', 'h', missing]], [:x, :y])
588+
@test flatten(ds, :y, mapformats = true) == Dataset([Union{Missing, Int64}[1, 1, 2, 3, 3, 4], Union{Missing, SubString{String}}["ab", "bc", "d", "ef", "gh", missing]], [:x, :y])
589+
@test flatten(view(ds, :, [2,1]), :y, mapformats = true) == Dataset(reverse([Union{Missing, Int64}[1, 1, 2, 3, 3, 4], Union{Missing, SubString{String}}["ab", "bc", "d", "ef", "gh", missing]]), reverse([:x, :y]))
590+
flatten!(ds, :y, mapformats = true)
591+
@test ds == Dataset([Union{Missing, Int64}[1, 1, 2, 3, 3, 4], Union{Missing, SubString{String}}["ab", "bc", "d", "ef", "gh", missing]], [:x, :y])
582592
end
583593

584594
@testset "transpose - views" begin

0 commit comments

Comments
 (0)