some updates for eachgroup

sl-solution · sl-solution · commit 4d25c23f88ff · 2022-05-15T22:35:02.000+12:00
diff --git a/docs/src/man/grouping.md b/docs/src/man/grouping.md
@@ -336,3 +336,57 @@ Similar to `groupby!/groupby` functions, `gatherby` can be passed to functions w
 As mentioned before, the result of `gatherby` is stable, i.e. the observations order within each group will be the order of their appearance in the original data set. However, when this stability is not needed and there are many groups in the data set, passing `stable = false` improves the performance by sacrificing the stability.
 
 The `gatherby` function has two extra keyword arguments, `isgathered` and `eachrow`, which by default are set to `false`. When the `isgathered` argument is set to `true`, InMemoryDatasets assumes that the observations are currently gathered by some rules and it only finds the starts and ends of each group and marks the data set as gathered. So users can manually group observations by setting this keyword argument. When the `eachrow` argument is set to `true`, InMemoryDatasets does the gathering and then mark each row of the input data set as an individual group. This option is handy for transposing data sets.
+
+## Iterate `eachgroup`
+
+Users can call `eachgroup` to iterate over the groups of a grouped data set. Each element produced by the iterator is a `SubDataset`.
+
+
+### Examples
+
+```jldoctest
+julia> ds = Dataset(rand(1:10, 10, 3), :auto)
+10×3 Dataset
+ Row │ x1        x2        x3       
+     │ identity  identity  identity 
+     │ Int64?    Int64?    Int64?   
+─────┼──────────────────────────────
+   1 │        7         8        10
+   2 │        4         1         5
+   3 │        7         2         5
+   4 │        4         7         4
+   5 │        5         9         6
+   6 │        9         5         3
+   7 │        9         8         2
+   8 │        7         9         6
+   9 │        2         3         8
+  10 │        1         6         2
+
+julia> i_gds = eachgroup(groupby(ds, 1));
+
+julia> map(nrow, i_gds)
+6-element Vector{Int64}:
+ 1
+ 1
+ 2
+ 1
+ 3
+ 2
+
+julia> i_gds[1]
+1×3 SubDataset
+ Row │ x1        x2        x3       
+     │ identity  identity  identity 
+     │ Int64?    Int64?    Int64?   
+─────┼──────────────────────────────
+   1 │        1         6         2
+
+julia> i_gds[end]
+2×3 SubDataset
+ Row │ x1        x2        x3       
+     │ identity  identity  identity 
+     │ Int64?    Int64?    Int64?   
+─────┼──────────────────────────────
+   1 │        9         5         3
+   2 │        9         8         2
+``` 
diff --git a/src/abstractdataset/iteration.jl b/src/abstractdataset/iteration.jl
@@ -152,28 +152,34 @@ end
 
 
 Base.IndexStyle(::Type{<:GroupedDataset}) = Base.IndexLinear()
-Base.size(itr::GroupedDataset{Dataset}) = (index(itr.ds).ngroups[], )
-Base.size(itr::GroupedDataset{<:Union{GroupBy, GatherBy}}) = (itr.ds.lastvalid, )
-Base.length(itr::GroupedDataset{Dataset}) = index(itr.ds).ngroups[]
-Base.length(itr::GroupedDataset{<:Union{GroupBy, GatherBy}}) = itr.ds.lastvalid
+Base.size(itr::GroupedDataset{Dataset})::Tuple{Int64} = (index(itr.ds).ngroups[], )  # 1-tuple holding the `ngroups` value read from the data set's index
+Base.size(itr::GroupedDataset{<:Union{GroupBy, GatherBy}})::Tuple{Int64} = (itr.ds.lastvalid, )  # 1-tuple holding `lastvalid`; presumably the number of groups - confirm
+Base.length(itr::GroupedDataset{Dataset})::Int64 = index(itr.ds).ngroups[]  # same value as size(itr)[1], converted to Int64
+Base.length(itr::GroupedDataset{<:Union{GroupBy, GatherBy}})::Int64 = itr.ds.lastvalid  # same value as size(itr)[1], converted to Int64
 Base.iterate(itr::GroupedDataset, i::Integer=1) =
     i <= length(itr) ? (itr[i], i + 1) : nothing
-function Base.getindex(itr::GroupedDataset{Dataset}, i::Int)
+function Base.getindex(itr::GroupedDataset{Dataset}, i::Integer)  # group `i` as a view of rows st[i]:hi of itr.ds; NOTE(review): only i > size is checked explicitly, i < 1 is rejected only when st[i] is indexed
     i > size(itr)[1] && throw(BoundsError(itr, i))
     st = index(itr.ds).starts
     i == size(itr)[1] ? hi = nrow(itr.ds) : hi = st[i+1]-1
     lo = st[i]
     view(itr.ds, lo:hi, :)
 end
-function Base.getindex(itr::GroupedDataset{<:Union{GroupBy, GatherBy}}, i::Int)
+function Base.getindex(itr::GroupedDataset{<:Union{GroupBy, GatherBy}}, i::Integer)  # group `i` as a view of parent(itr.ds), rows selected through prm[lo:hi]; NOTE(review): only i > size is checked explicitly, i < 1 is rejected only when st[i] is indexed
     i > size(itr)[1] && throw(BoundsError(itr, i))
     st = _group_starts(itr.ds)
     prm = _get_perms(itr.ds)
     i == size(itr)[1] ? hi = nrow(parent(itr.ds)) : hi = st[i+1]-1
     lo = st[i]
     view(parent(itr.ds), view(prm, lo:hi), :)
 end
-
+Base.firstindex(::GroupedDataset) = 1  # groups are numbered from 1, so `itr[begin]` resolves to the first group
+Base.lastindex(itr::GroupedDataset) = length(itr)  # lets `itr[end]` resolve to the last group
+Base.eltype(::Type{<:GroupedDataset}) = SubDataset  # defined on the type, as the iteration interface expects; eltype(itr) reaches it via Base's fallback
+Base.keys(itr::GroupedDataset) = LinearIndices(itr)
+Base.pairs(itr::GroupedDataset) = Base.Iterators.Pairs(itr, keys(itr))
+Base.axes(itr::GroupedDataset) = (Base.OneTo(length(itr)), )  # a single dimension covering 1:length(itr)
+Base.LinearIndices(itr::GroupedDataset) = LinearIndices(axes(itr))
 # Iteration by columns
 
 const DATASETCOLUMNS_DOCSTR = """
diff --git a/test/grouping.jl b/test/grouping.jl
@@ -590,3 +590,27 @@ end
     @test combine(gatherby(sds, 1), r"x"=>var) == combine(groupby(sds, 1), r"x"=>var)
 
 end
+
+@testset "eachgroup iterator" begin
+    # groupby puts the x == 1 group first; gatherby yields the same two groups in the opposite order
+    ds = Dataset(x = [2,1,1,2], y = [1.0, -10.0, 2.0, 0.5])
+    grouped = eachgroup(groupby(ds, 1))
+    gathered = eachgroup(gatherby(ds, 1))
+    @test length(grouped) == length(gathered) == 2
+    @test grouped[1] == Dataset(x = [1,1], y = [-10.0, 2.0])
+    @test grouped[1] == gathered[2]
+    @test grouped[end] == Dataset(x = [2,2], y = [1.0, 0.5])
+    @test gathered[begin] == grouped[end]
+    # same checks on a view, indexing with several Integer widths and past both ends
+    sub = view(ds, [2,3,4], [2,1])
+    subgroups = eachgroup(groupby(sub, 2))
+    @test length(subgroups) == 2
+    @test subgroups[Int32(1)] == Dataset(y = [-10.0, 2.0], x = [1,1])
+    @test subgroups[Int8(2)] == subgroups[Int16(2)] == subgroups[2] == Dataset(y = [0.5], x = [2])
+    @test_throws BoundsError subgroups[3]
+    @test_throws BoundsError subgroups[0]
+
+end
+
+
+