@@ -593,7 +593,7 @@ Base.transpose(ds::Union{GroupBy, GatherBy}, cols::Tuple; id = nothing, renameco
593593
594594
595595"""
596- flatten(ds::AbstractDataset, cols)
596+ flatten(ds::AbstractDataset, cols; mapformats = false )
597597
598598When columns `cols` of data set `ds` have iterable elements that define
599599`length` (for example a `Vector` of `Vector`s), return a `Dataset` where each
@@ -605,6 +605,8 @@ same for each `col` in `cols`, or else an error is raised. Note that these
605605elements are not copied, and thus if they are mutable changing them in the
606606returned `Dataset` will affect `ds`.
607607
608+ When `mapformats = true`, each value in `cols` is first passed through the format of its column, and the formatted values (rather than the raw stored values) are flattened.
609+
608610`cols` can be any column selector ($COLUMNINDEX_STR ; $MULTICOLUMNINDEX_STR ).
609611
610612See [`flatten!`](@ref)
@@ -674,34 +676,140 @@ julia> ds3 = Dataset(a = [1, 2], b = [[1, 2], [3, 4]], c = [[5, 6], [7]])
674676
675677julia> flatten(ds3, [:b, :c])
676678ERROR: ArgumentError: Lengths of iterables stored in columns :b and :c are not the same in row 2
679+
680+ julia> ds = Dataset(x=1:3, y=["ab,cd", "e", missing], z=[[1,2], 2, 3])
681+ 3×3 Dataset
682+ Row │ x y z
683+ │ identity identity identity
684+ │ Int64? String? Any
685+ ─────┼──────────────────────────────
686+ 1 │ 1 ab,cd [1, 2]
687+ 2 │ 2 e 2
688+ 3 │ 3 missing 3
689+
690+ julia> fmt(x) = split(x, ",")
691+ fmt (generic function with 1 method)
692+
693+ julia> fmt(::Missing) = missing
694+ fmt (generic function with 2 methods)
695+
696+ julia> setformat!(ds, :y => fmt)
697+ 3×3 Dataset
698+ Row │ x y z
699+ │ identity fmt identity
700+ │ Int64? String? Any
701+ ─────┼───────────────────────────────────────────────────
702+ 1 │ 1 SubString{String}["ab", "cd"] [1, 2]
703+ 2 │ 2 SubString{String}["e"] 2
704+ 3 │ 3 missing 3
705+
706+ julia> flatten(ds, :y)
707+ 7×3 Dataset
708+ Row │ x y z
709+ │ identity identity identity
710+ │ Int64? Char? Any
711+ ─────┼──────────────────────────────
712+ 1 │ 1 a [1, 2]
713+ 2 │ 1 b [1, 2]
714+ 3 │ 1 , [1, 2]
715+ 4 │ 1 c [1, 2]
716+ 5 │ 1 d [1, 2]
717+ 6 │ 2 e 2
718+ 7 │ 3 missing 3
719+
720+ julia> flatten(ds, :y, mapformats = true)
721+ 4×3 Dataset
722+ Row │ x y z
723+ │ identity identity identity
724+ │ Int64? SubStrin…? Any
725+ ─────┼────────────────────────────────
726+ 1 │ 1 ab [1, 2]
727+ 2 │ 1 cd [1, 2]
728+ 3 │ 2 e 2
729+ 4 │ 3 missing 3
730+
731+ julia> flatten(ds, 2:3, mapformats = true)
732+ 4×3 Dataset
733+ Row │ x y z
734+ │ identity identity identity
735+ │ Int64? SubStrin…? Int64?
736+ ─────┼────────────────────────────────
737+ 1 │ 1 ab 1
738+ 2 │ 1 cd 2
739+ 3 │ 2 e 2
740+ 4 │ 3 missing 3
677741```
678742"""
679743flatten (ds, cols)
680744
681745"""
682- flatten!(ds, cols)
746+ flatten!(ds, cols; mapformats = false )
683747
684748Variant of `flatten` that flattens `ds` in place.
685749"""
686750flatten!
687751
688- _ELTYPE (x) = eltype (x)
689- _ELTYPE (:: Missing ) = Missing
690- _LENGTH (x) = length (x)
691- _LENGTH (:: Missing ) = 1
"""
    _ELTYPE(x; fmt = identity)

Return the element type that the cell `x` contributes to a flattened column.

`fmt` is applied to `x` exactly once. When the formatted value is `missing`
the result is `Missing`; otherwise it is `eltype` of the formatted value.
With the default `fmt = identity` this is `Missing` for a `missing` cell and
`eltype(x)` for anything else.

The `missing` check is done on the *formatted* value, so a format that maps a
non-missing cell to `missing` is reported as `Missing` as well. This matches
`_fill_flatten!`, which writes a single `missing` for such a cell.
"""
function _ELTYPE(x; fmt = identity)
    # Evaluate the format once; user formats may be expensive.
    formatted = fmt(x)
    return ismissing(formatted) ? Missing : eltype(formatted)
end
768+
769+
"""
    _LENGTH(x; fmt = identity)

Return the number of rows the cell `x` occupies after flattening.

`fmt` is applied to `x` exactly once. When the formatted value is `missing`
the cell occupies one row; otherwise it occupies `length` of the formatted
value. With the default `fmt = identity` this is `1` for a `missing` cell and
`length(x)` for anything else.

The `missing` check is done on the *formatted* value, so a format that maps a
non-missing cell to `missing` yields `1` instead of failing on
`length(missing)`. This matches `_fill_flatten!`, which writes a single
`missing` for such a cell.
"""
function _LENGTH(x; fmt = identity)
    # Evaluate the format once; user formats may be expensive.
    formatted = fmt(x)
    return ismissing(formatted) ? 1 : length(formatted)
end
789+
692790
693791function flatten! (ds:: Dataset ,
694- cols:: Union{ColumnIndex, MultiColumnIndex} )
792+ cols:: Union{ColumnIndex, MultiColumnIndex} ; mapformats = false )
695793 _check_consistency (ds)
696794
697795 idxcols = index (ds)[cols]
698- isempty (idxcols) && return copy (ds)
796+ isempty (idxcols) && return ds
699797 col1 = first (idxcols)
700- lengths = _LENGTH .(_columns (ds)[col1])
798+ if mapformats
799+ f_fmt = getformat (ds, col1)
800+ lengths = _LENGTH .(_columns (ds)[col1], fmt = f_fmt)
801+ else
802+ lengths = _LENGTH .(_columns (ds)[col1])
803+ end
701804 for col in idxcols
702805 v = _columns (ds)[col]
703- if any (x -> _LENGTH (x[1 ]) != x[2 ], zip (v, lengths))
704- r = findfirst (x -> x != 0 , _LENGTH .(v) .- lengths)
806+ if mapformats
807+ f_fmt = getformat (ds, col)
808+ else
809+ f_fmt = identity
810+ end
811+ if any (x -> _LENGTH (x[1 ], fmt = f_fmt) != x[2 ], zip (v, lengths))
812+ r = findfirst (x -> x != 0 , _LENGTH .(v, fmt = f_fmt) .- lengths)
705813 colnames = _names (ds)
706814 throw (ArgumentError (" Lengths of iterables stored in columns :$(colnames[col1]) " *
707815 " and :$(colnames[col]) are not the same in row $r " ))
@@ -713,9 +821,14 @@ function flatten!(ds::Dataset,
713821 length (idxcols) > 1 && sort! (idxcols)
714822 for col in idxcols
715823 col_to_flatten = _columns (ds)[col]
716- T = mapreduce (_ELTYPE, promote_type, col_to_flatten)
824+ if mapformats
825+ f_fmt = getformat (ds, col)
826+ else
827+ f_fmt = identity
828+ end
829+ T = mapreduce (x-> _ELTYPE (x, fmt = f_fmt), promote_type, col_to_flatten)
717830 _res = allocatecol (T, new_total)
718- _fill_flatten! (_res, col_to_flatten, lengths)
831+ _fill_flatten! (_res, col_to_flatten, lengths; fmt = f_fmt )
719832 if length (idxcols) == ncol (ds)
720833 _columns (ds)[col] = _res
721834 else
@@ -729,17 +842,27 @@ end
729842
730843
731844function flatten (ds:: AbstractDataset ,
732- cols:: Union{ColumnIndex, MultiColumnIndex} )
845+ cols:: Union{ColumnIndex, MultiColumnIndex} ; mapformats = false )
733846 _check_consistency (ds)
734847
735848 idxcols = index (ds)[cols]
736849 isempty (idxcols) && return copy (ds)
737850 col1 = first (idxcols)
738- lengths = _LENGTH .(_columns (ds)[col1])
851+ if mapformats
852+ f_fmt = getformat (ds, col1)
853+ lengths = _LENGTH .(_columns (ds)[col1], fmt = f_fmt)
854+ else
855+ lengths = _LENGTH .(_columns (ds)[col1])
856+ end
739857 for col in idxcols
740858 v = _columns (ds)[col]
741- if any (x -> _LENGTH (x[1 ]) != x[2 ], zip (v, lengths))
742- r = findfirst (x -> x != 0 , _LENGTH .(v) .- lengths)
859+ if mapformats
860+ f_fmt = getformat (ds, col)
861+ else
862+ f_fmt = identity
863+ end
864+ if any (x -> _LENGTH (x[1 ], fmt = f_fmt) != x[2 ], zip (v, lengths))
865+ r = findfirst (x -> x != 0 , _LENGTH .(v, fmt = f_fmt) .- lengths)
743866 colnames = _names (ds)
744867 throw (ArgumentError (" Lengths of iterables stored in columns :$(colnames[col1]) " *
745868 " and :$(colnames[col]) are not the same in row $r " ))
@@ -753,34 +876,41 @@ function flatten(ds::AbstractDataset,
753876 length (idxcols) > 1 && sort! (idxcols)
754877 for col in idxcols
755878 col_to_flatten = _columns (ds)[col]
756- T = mapreduce (_ELTYPE, promote_type, col_to_flatten)
879+ if mapformats
880+ f_fmt = getformat (ds, col)
881+ else
882+ f_fmt = identity
883+ end
884+ T = mapreduce (x-> _ELTYPE (x, fmt = f_fmt), promote_type, col_to_flatten)
757885 _res = allocatecol (T, new_total)
758- _fill_flatten! (_res, col_to_flatten, lengths)
759- insertcols! (new_ds, col, _names (ds)[col] => _res)
886+ _fill_flatten! (_res, col_to_flatten, lengths; fmt = f_fmt)
887+ insertcols! (new_ds, col, _names (ds)[col] => _res, unsupported_copy_cols = false )
888+ end
889+ for j in setdiff (1 : ncol (ds), idxcols)
890+ setformat! (new_ds, j=> getformat (ds, j))
760891 end
761- setformat! (new_ds, copy (index (ds). format))
762892 setinfo! (new_ds, _attributes (ds). meta. info[])
763893 _reset_grouping_info! (new_ds)
764894 new_ds
765895end
766896
767897
"""
    _fill_flatten!_barrier(_res, val, counter; fmt = identity)

Write every element of `fmt(val)` into `_res`, starting at index `counter`,
and return the index that follows the last element written.

If `fmt(val)` is empty, nothing is written and `counter` is returned as is.
"""
function _fill_flatten!_barrier(_res, val, counter; fmt = identity)
    next_slot = counter
    for element in fmt(val)
        _res[next_slot] = element
        next_slot += 1
    end
    return next_slot
end
775905
"""
    _fill_flatten!(_res, col_to_flatten, lengths; fmt = identity)

Fill `_res` with the flattened contents of `col_to_flatten`.

Each cell is passed through `fmt` exactly once. A cell whose formatted value
is `missing` occupies one slot of `_res`, which is set to `missing`. Any other
formatted value is iterated and its elements are written to consecutive slots.

`lengths` is accepted for interface compatibility; it is not read here.
Returns `nothing`.
"""
function _fill_flatten!(_res, col_to_flatten, lengths; fmt = identity)
    counter = 1
    for i in eachindex(col_to_flatten)
        # Format once and reuse the result for both the `missing` test and the
        # copy, so the two cannot disagree and the format is not run twice.
        formatted = fmt(col_to_flatten[i])
        if ismissing(formatted)
            _res[counter] = missing
            counter += 1
        else
            # The value is already formatted, so the barrier is called with its
            # default `identity` format; it specializes on typeof(formatted).
            counter = _fill_flatten!_barrier(_res, formatted, counter)
        end
    end
    return nothing
end
0 commit comments