From 5068a6d5cbc6d273bd588b08906e24e402637120 Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Fri, 31 Oct 2025 14:44:23 +0100 Subject: [PATCH 1/2] Made sure aggregating an empty grouped dataframe still generates aggregation columns with test. Issue #1531 --- .../kotlinx/dataframe/impl/GroupByImpl.kt | 29 ++++++++++++--- .../testSets/person/DataFrameTests.kt | 35 +++++++++++++++++++ 2 files changed, 60 insertions(+), 4 deletions(-) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/GroupByImpl.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/GroupByImpl.kt index eab8acff76..a31e524151 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/GroupByImpl.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/GroupByImpl.kt @@ -8,7 +8,9 @@ import org.jetbrains.kotlinx.dataframe.aggregation.AggregateGroupedBody import org.jetbrains.kotlinx.dataframe.aggregation.NamedValue import org.jetbrains.kotlinx.dataframe.api.GroupBy import org.jetbrains.kotlinx.dataframe.api.GroupedRowFilter +import org.jetbrains.kotlinx.dataframe.api.asFrameColumn import org.jetbrains.kotlinx.dataframe.api.asGroupBy +import org.jetbrains.kotlinx.dataframe.api.cast import org.jetbrains.kotlinx.dataframe.api.concat import org.jetbrains.kotlinx.dataframe.api.convert import org.jetbrains.kotlinx.dataframe.api.getColumn @@ -18,6 +20,7 @@ import org.jetbrains.kotlinx.dataframe.api.isColumnGroup import org.jetbrains.kotlinx.dataframe.api.pathOf import org.jetbrains.kotlinx.dataframe.api.remove import org.jetbrains.kotlinx.dataframe.api.rename +import org.jetbrains.kotlinx.dataframe.api.take import org.jetbrains.kotlinx.dataframe.columns.FrameColumn import org.jetbrains.kotlinx.dataframe.impl.aggregation.AggregatableInternal import org.jetbrains.kotlinx.dataframe.impl.aggregation.GroupByReceiverImpl @@ -27,8 +30,10 @@ import org.jetbrains.kotlinx.dataframe.impl.api.GroupedDataRowImpl import org.jetbrains.kotlinx.dataframe.impl.api.insertImpl import org.jetbrains.kotlinx.dataframe.impl.api.removeImpl import org.jetbrains.kotlinx.dataframe.impl.columns.toColumnSet +import org.jetbrains.kotlinx.dataframe.impl.schema.createEmptyDataFrame import org.jetbrains.kotlinx.dataframe.ncol import org.jetbrains.kotlinx.dataframe.nrow +import org.jetbrains.kotlinx.dataframe.size import org.jetbrains.kotlinx.dataframe.values /** @@ -74,14 +79,25 @@ internal fun aggregateGroupBy( body: AggregateGroupedBody, ): DataFrame { val defaultAggregateName = "aggregated" - + val groupedDfIsEmpty = df.size().nrow == 0 val column = df.getColumn(selector) - val removed = df.removeImpl(columns = selector) - val hasKeyColumns = removed.df.ncol > 0 - val groupedFrame = column.values.map { + val groups = + if (groupedDfIsEmpty) { + // if the grouped dataframe is empty, make sure the provided AggregateGroupedBody is called at least once + // to create aggregated columns. We empty them below. + listOf( + column.asFrameColumn().schema.value + .createEmptyDataFrame() + .cast(), + ) + } else { + column.values + } + + val groupedFrame = groups.map { if (it == null) { null } else { @@ -101,6 +117,11 @@ internal fun aggregateGroupBy( builder.compute() } }.concat() + .let { + // empty the aggregated columns that were created by calling the provided AggregateGroupedBody once + // if the grouped dataframe is empty + if (groupedDfIsEmpty) it.take(0) else it + } val removedNode = removed.removedColumns.single() val insertPath = removedNode.pathFromRoot().dropLast(1) diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/testSets/person/DataFrameTests.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/testSets/person/DataFrameTests.kt index 25f157eb0b..dbe69fd5fc 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/testSets/person/DataFrameTests.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/testSets/person/DataFrameTests.kt @@ -62,6 +62,7 @@ import org.jetbrains.kotlinx.dataframe.api.fill import org.jetbrains.kotlinx.dataframe.api.fillNulls import org.jetbrains.kotlinx.dataframe.api.filter import org.jetbrains.kotlinx.dataframe.api.first +import org.jetbrains.kotlinx.dataframe.api.firstOrNull import org.jetbrains.kotlinx.dataframe.api.forEach import org.jetbrains.kotlinx.dataframe.api.forEachIndexed import org.jetbrains.kotlinx.dataframe.api.frameColumn @@ -94,14 +95,17 @@ import org.jetbrains.kotlinx.dataframe.api.match import org.jetbrains.kotlinx.dataframe.api.matches import org.jetbrains.kotlinx.dataframe.api.max import org.jetbrains.kotlinx.dataframe.api.maxBy +import org.jetbrains.kotlinx.dataframe.api.maxByOrNull import org.jetbrains.kotlinx.dataframe.api.mean import org.jetbrains.kotlinx.dataframe.api.meanFor import org.jetbrains.kotlinx.dataframe.api.meanOf import org.jetbrains.kotlinx.dataframe.api.median +import org.jetbrains.kotlinx.dataframe.api.medianOrNull import org.jetbrains.kotlinx.dataframe.api.merge import org.jetbrains.kotlinx.dataframe.api.min import org.jetbrains.kotlinx.dataframe.api.minBy import org.jetbrains.kotlinx.dataframe.api.minOf +import org.jetbrains.kotlinx.dataframe.api.minOrNull import org.jetbrains.kotlinx.dataframe.api.minus import org.jetbrains.kotlinx.dataframe.api.move import org.jetbrains.kotlinx.dataframe.api.moveTo @@ -710,6 +714,37 @@ class DataFrameTests : BaseTest() { res.size() shouldBe 2 } + // Issue #1531 + @Test + fun `groupBy empty df should generate empty aggregation cols`() { + val empty = typed.take(0) + val resDf = empty.groupBy { name }.aggregate { + count() into "n" + count { age > 25 } into "old count" + medianOrNull { age } into "median age" + minOrNull { age } into "min age" + all { weight != null } into "all with weights" + maxByOrNull { age }?.city into "oldest origin" + sortBy { age }.firstOrNull()?.city into "youngest origin" + pivot { city.map { "from $it" } }.count() + age.toList() into "ages" + } + + resDf.columnNames() shouldBe listOf( + "name", + "n", + "old count", + "median age", + "min age", + "all with weights", + "oldest origin", + "youngest origin", + "ages", + ) + + resDf.alsoDebug() + } + @Test fun `groupBy`() { fun AnyFrame.check() { From 7a2de0a2552a620c126953f8305d33d97ce37866 Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Fri, 31 Oct 2025 14:45:05 +0100 Subject: [PATCH 2/2] Removed removeColumns argument from aggregateGroupBy, as it is always `true` --- .../org/jetbrains/kotlinx/dataframe/api/aggregate.kt | 6 +++++- .../org/jetbrains/kotlinx/dataframe/impl/GroupByImpl.kt | 7 ++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/aggregate.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/aggregate.kt index 0c592a4316..15ab05678c 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/aggregate.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/aggregate.kt @@ -21,4 +21,8 @@ public fun Pivot.aggregate(separate: Boolean = false, body: Selector Grouped.aggregate(body: AggregateGroupedBody): DataFrame = - aggregateGroupBy((this as GroupBy<*, *>).toDataFrame(), { groups.cast() }, removeColumns = true, body).cast() + aggregateGroupBy( + df = (this as GroupBy<*, *>).toDataFrame(), + selector = { groups.cast() }, + body = body, + ).cast() diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/GroupByImpl.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/GroupByImpl.kt index a31e524151..4d0c99c950 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/GroupByImpl.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/GroupByImpl.kt @@ -75,7 +75,6 @@ internal class GroupByImpl( internal fun aggregateGroupBy( df: DataFrame, selector: ColumnSelector?>, - removeColumns: Boolean, body: AggregateGroupedBody, ): DataFrame { val defaultAggregateName = "aggregated" @@ -126,13 +125,11 @@ internal fun aggregateGroupBy( val removedNode = removed.removedColumns.single() val insertPath = removedNode.pathFromRoot().dropLast(1) - if (!removeColumns) removedNode.data.wasRemoved = false - val columnsToInsert = groupedFrame.getColumnsWithPaths { colsAtAnyDepth().filter { !it.isColumnGroup() } }.map { ColumnToInsert(insertPath + it.path, it, removedNode) } - val src = if (removeColumns) removed.df else df - return src.insertImpl(columnsToInsert) + + return removed.df.insertImpl(columnsToInsert) }