Skip to content

Commit 53788e6

Browse files
committed
Revert "support mapformats in flatten/!"
This reverts commit 56e3210.
1 parent 56e3210 commit 53788e6

File tree

3 files changed

+26
-169
lines changed

3 files changed

+26
-169
lines changed

CHANGELOG.md

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,12 @@
55
* Users can now choose whether the observation ids of the left data set and/or the right data set are included in the output data set.
66
* Add a new function `eachgroup`. It allows iteration over each group of a grouped data set.
77
* `op` is a new keyword argument for the `update/!` functions which allows passing a user defined function to control how the value of the main data set should be updated by the values from the transaction data set. ([issue #55](https://github.com/sl-solution/InMemoryDatasets.jl/issues/55))
8-
* Support for the `mapformats` keyword argument in `flatten/!`. Users can now flatten a data set based on the formatted values.
98

109
## Fixes
1110

1211
* The `combine` function will now work fine when a view of a data set is passed
1312
* For the join functions the `makeunique` argument is now passed correctly to the inside functions.
1413
* `update` and `update!` have the same `mode` option by default.
15-
* Fix the problem with preserving format of `SubDataset` in `flatten/!`
16-
* Fix the problem that caused `flatten!` to produce a copy of data when an empty data set was passed to it.
1714

1815
# Version 0.7.0
1916

src/dataset/transpose.jl

Lines changed: 26 additions & 156 deletions
Original file line numberDiff line numberDiff line change
@@ -593,7 +593,7 @@ Base.transpose(ds::Union{GroupBy, GatherBy}, cols::Tuple; id = nothing, renameco
593593

594594

595595
"""
596-
flatten(ds::AbstractDataset, cols; mapformats = false)
596+
flatten(ds::AbstractDataset, cols)
597597
598598
When columns `cols` of data set `ds` have iterable elements that define
599599
`length` (for example a `Vector` of `Vector`s), return a `Dataset` where each
@@ -605,8 +605,6 @@ same for each `col` in `cols`, or else an error is raised. Note that these
605605
elements are not copied, and thus if they are mutable changing them in the
606606
returned `Dataset` will affect `ds`.
607607
608-
When `mapformats = true`, the function uses the formatted values of `cols`.
609-
610608
`cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR).
611609
612610
See [`flatten!`](@ref)
@@ -676,140 +674,34 @@ julia> ds3 = Dataset(a = [1, 2], b = [[1, 2], [3, 4]], c = [[5, 6], [7]])
676674
677675
julia> flatten(ds3, [:b, :c])
678676
ERROR: ArgumentError: Lengths of iterables stored in columns :b and :c are not the same in row 2
679-
680-
julia> ds = Dataset(x=1:3, y=["ab,cd", "e", missing], z=[[1,2], 2, 3])
681-
3×3 Dataset
682-
Row │ x y z
683-
│ identity identity identity
684-
│ Int64? String? Any
685-
─────┼──────────────────────────────
686-
1 │ 1 ab,cd [1, 2]
687-
2 │ 2 e 2
688-
3 │ 3 missing 3
689-
690-
julia> fmt(x) = split(x, ",")
691-
fmt (generic function with 2 methods)
692-
693-
julia> fmt(::Missing) = missing
694-
fmt (generic function with 2 methods)
695-
696-
julia> setformat!(ds, :y => fmt)
697-
3×3 Dataset
698-
Row │ x y z
699-
│ identity fmt identity
700-
│ Int64? String? Any
701-
─────┼───────────────────────────────────────────────────
702-
1 │ 1 SubString{String}["ab", "cd"] [1, 2]
703-
2 │ 2 SubString{String}["e"] 2
704-
3 │ 3 missing 3
705-
706-
julia> flatten(ds, :y)
707-
7×3 Dataset
708-
Row │ x y z
709-
│ identity identity identity
710-
│ Int64? Char? Any
711-
─────┼──────────────────────────────
712-
1 │ 1 a [1, 2]
713-
2 │ 1 b [1, 2]
714-
3 │ 1 , [1, 2]
715-
4 │ 1 c [1, 2]
716-
5 │ 1 d [1, 2]
717-
6 │ 2 e 2
718-
7 │ 3 missing 3
719-
720-
julia> flatten(ds, :y, mapformats = true)
721-
4×3 Dataset
722-
Row │ x y z
723-
│ identity identity identity
724-
│ Int64? SubStrin…? Any
725-
─────┼────────────────────────────────
726-
1 │ 1 ab [1, 2]
727-
2 │ 1 cd [1, 2]
728-
3 │ 2 e 2
729-
4 │ 3 missing 3
730-
731-
julia> flatten(ds, 2:3, mapformats = true)
732-
4×3 Dataset
733-
Row │ x y z
734-
│ identity identity identity
735-
│ Int64? SubStrin…? Int64?
736-
─────┼────────────────────────────────
737-
1 │ 1 ab 1
738-
2 │ 1 cd 2
739-
3 │ 2 e 2
740-
4 │ 3 missing 3
741677
```
742678
"""
743679
flatten(ds, cols)
744680

745681
"""
746-
flatten!(ds, cols; mapformats = false)
682+
flatten!(ds, cols)
747683
748684
Variant of `flatten` that does flatten `ds` in-place.
749685
"""
750686
flatten!
751687

752-
function _ELTYPE(x; fmt = identity)
753-
if fmt == identity
754-
eltype(x)
755-
else
756-
eltype(fmt(x))
757-
end
758-
end
759-
function _ELTYPE(x::Missing; fmt = identity)
760-
if fmt == identity
761-
Missing
762-
elseif ismissing(fmt(x))
763-
Missing
764-
else
765-
eltype(fmt(x))
766-
end
767-
end
768-
769-
770-
function _LENGTH(x; fmt = identity)
771-
if fmt == identity
772-
res = length(x)
773-
else
774-
res = length(fmt(x))
775-
end
776-
res
777-
end
778-
779-
function _LENGTH(x::Missing; fmt = identity)
780-
if fmt == identity
781-
res = 1
782-
elseif ismissing(fmt(x))
783-
res = 1
784-
else
785-
res = length(fmt(x))
786-
end
787-
res
788-
end
789-
688+
_ELTYPE(x) = eltype(x)
689+
_ELTYPE(::Missing) = Missing
690+
_LENGTH(x) = length(x)
691+
_LENGTH(::Missing) = 1
790692

791693
function flatten!(ds::Dataset,
792-
cols::Union{ColumnIndex, MultiColumnIndex}; mapformats = false)
694+
cols::Union{ColumnIndex, MultiColumnIndex})
793695
_check_consistency(ds)
794696

795697
idxcols = index(ds)[cols]
796-
isempty(idxcols) && return ds
698+
isempty(idxcols) && return copy(ds)
797699
col1 = first(idxcols)
798-
if mapformats
799-
f_fmt = getformat(ds, col1)
800-
lengths = _LENGTH.(_columns(ds)[col1], fmt = f_fmt)
801-
else
802-
lengths = _LENGTH.(_columns(ds)[col1])
803-
end
700+
lengths = _LENGTH.(_columns(ds)[col1])
804701
for col in idxcols
805702
v = _columns(ds)[col]
806-
if mapformats
807-
f_fmt = getformat(ds, col)
808-
else
809-
f_fmt = identity
810-
end
811-
if any(x -> _LENGTH(x[1], fmt = f_fmt) != x[2], zip(v, lengths))
812-
r = findfirst(x -> x != 0, _LENGTH.(v, fmt = f_fmt) .- lengths)
703+
if any(x -> _LENGTH(x[1]) != x[2], zip(v, lengths))
704+
r = findfirst(x -> x != 0, _LENGTH.(v) .- lengths)
813705
colnames = _names(ds)
814706
throw(ArgumentError("Lengths of iterables stored in columns :$(colnames[col1]) " *
815707
"and :$(colnames[col]) are not the same in row $r"))
@@ -821,14 +713,9 @@ function flatten!(ds::Dataset,
821713
length(idxcols) > 1 && sort!(idxcols)
822714
for col in idxcols
823715
col_to_flatten = _columns(ds)[col]
824-
if mapformats
825-
f_fmt = getformat(ds, col)
826-
else
827-
f_fmt = identity
828-
end
829-
T = mapreduce(x->_ELTYPE(x, fmt = f_fmt), promote_type, col_to_flatten)
716+
T = mapreduce(_ELTYPE, promote_type, col_to_flatten)
830717
_res = allocatecol(T, new_total)
831-
_fill_flatten!(_res, col_to_flatten, lengths; fmt = f_fmt)
718+
_fill_flatten!(_res, col_to_flatten, lengths)
832719
if length(idxcols) == ncol(ds)
833720
_columns(ds)[col] = _res
834721
else
@@ -842,27 +729,17 @@ end
842729

843730

844731
function flatten(ds::AbstractDataset,
845-
cols::Union{ColumnIndex, MultiColumnIndex}; mapformats = false)
732+
cols::Union{ColumnIndex, MultiColumnIndex})
846733
_check_consistency(ds)
847734

848735
idxcols = index(ds)[cols]
849736
isempty(idxcols) && return copy(ds)
850737
col1 = first(idxcols)
851-
if mapformats
852-
f_fmt = getformat(ds, col1)
853-
lengths = _LENGTH.(_columns(ds)[col1], fmt = f_fmt)
854-
else
855-
lengths = _LENGTH.(_columns(ds)[col1])
856-
end
738+
lengths = _LENGTH.(_columns(ds)[col1])
857739
for col in idxcols
858740
v = _columns(ds)[col]
859-
if mapformats
860-
f_fmt = getformat(ds, col)
861-
else
862-
f_fmt = identity
863-
end
864-
if any(x -> _LENGTH(x[1], fmt = f_fmt) != x[2], zip(v, lengths))
865-
r = findfirst(x -> x != 0, _LENGTH.(v, fmt = f_fmt) .- lengths)
741+
if any(x -> _LENGTH(x[1]) != x[2], zip(v, lengths))
742+
r = findfirst(x -> x != 0, _LENGTH.(v) .- lengths)
866743
colnames = _names(ds)
867744
throw(ArgumentError("Lengths of iterables stored in columns :$(colnames[col1]) " *
868745
"and :$(colnames[col]) are not the same in row $r"))
@@ -876,41 +753,34 @@ function flatten(ds::AbstractDataset,
876753
length(idxcols) > 1 && sort!(idxcols)
877754
for col in idxcols
878755
col_to_flatten = _columns(ds)[col]
879-
if mapformats
880-
f_fmt = getformat(ds, col)
881-
else
882-
f_fmt = identity
883-
end
884-
T = mapreduce(x->_ELTYPE(x, fmt = f_fmt), promote_type, col_to_flatten)
756+
T = mapreduce(_ELTYPE, promote_type, col_to_flatten)
885757
_res = allocatecol(T, new_total)
886-
_fill_flatten!(_res, col_to_flatten, lengths; fmt = f_fmt)
887-
insertcols!(new_ds, col, _names(ds)[col] => _res, unsupported_copy_cols = false)
888-
end
889-
for j in setdiff(1:ncol(ds), idxcols)
890-
setformat!(new_ds, j=>getformat(ds, j))
758+
_fill_flatten!(_res, col_to_flatten, lengths)
759+
insertcols!(new_ds, col, _names(ds)[col] => _res)
891760
end
761+
setformat!(new_ds, copy(index(ds).format))
892762
setinfo!(new_ds, _attributes(ds).meta.info[])
893763
_reset_grouping_info!(new_ds)
894764
new_ds
895765
end
896766

897767

898-
function _fill_flatten!_barrier(_res, val, counter; fmt = identity)
899-
for j in fmt(val)
768+
function _fill_flatten!_barrier(_res, val, counter)
769+
for j in val
900770
_res[counter] = j
901771
counter += 1
902772
end
903773
counter
904774
end
905775

906-
function _fill_flatten!(_res, col_to_flatten, lengths; fmt = identity)
776+
function _fill_flatten!(_res, col_to_flatten, lengths)
907777
counter = 1
908778
for i in 1:length(col_to_flatten)
909-
if ismissing(fmt(col_to_flatten[i]))
779+
if ismissing(col_to_flatten[i])
910780
_res[counter] = missing
911781
counter += 1
912782
else
913-
counter = _fill_flatten!_barrier(_res, col_to_flatten[i], counter; fmt = fmt)
783+
counter = _fill_flatten!_barrier(_res, col_to_flatten[i], counter)
914784
end
915785
end
916786
end

test/transpose.jl

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -579,16 +579,6 @@ end
579579
@test ds_flat_cat == ref_cat
580580
flatten!(ds_cat, :b)
581581
@test ds_cat == ref_cat
582-
583-
ds = Dataset(x=1:4, y=["ab,bc","d","ef,gh",missing])
584-
fmt(x) = split(x, ",")
585-
fmt(::Missing) = missing
586-
setformat!(ds, 2 => fmt)
587-
@test flatten(ds, :y) == Dataset([[1, 1, 1, 1, 1, 2, 3, 3, 3, 3, 3, 4], Union{Missing, Char}['a', 'b', ',', 'b', 'c', 'd', 'e', 'f', ',', 'g', 'h', missing]], [:x, :y])
588-
@test flatten(ds, :y, mapformats = true) == Dataset([Union{Missing, Int64}[1, 1, 2, 3, 3, 4], Union{Missing, SubString{String}}["ab", "bc", "d", "ef", "gh", missing]], [:x, :y])
589-
@test flatten(view(ds, :, [2,1]), :y, mapformats = true) == Dataset(reverse([Union{Missing, Int64}[1, 1, 2, 3, 3, 4], Union{Missing, SubString{String}}["ab", "bc", "d", "ef", "gh", missing]]), reverse([:x, :y]))
590-
flatten!(ds, :y, mapformats = true)
591-
@test ds == Dataset([Union{Missing, Int64}[1, 1, 2, 3, 3, 4], Union{Missing, SubString{String}}["ab", "bc", "d", "ef", "gh", missing]], [:x, :y])
592582
end
593583

594584
@testset "transpose - views" begin

0 commit comments

Comments
 (0)