diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/first.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/first.kt index fe18afea75..71940b4453 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/first.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/first.kt @@ -18,6 +18,8 @@ import org.jetbrains.kotlinx.dataframe.columns.values import org.jetbrains.kotlinx.dataframe.documentation.DslGrammarTemplateColumnsSelectionDsl.DslGrammarTemplate import org.jetbrains.kotlinx.dataframe.documentation.Indent import org.jetbrains.kotlinx.dataframe.documentation.LineBreak +import org.jetbrains.kotlinx.dataframe.documentation.RowFilterDescription +import org.jetbrains.kotlinx.dataframe.documentation.SelectingColumns import org.jetbrains.kotlinx.dataframe.impl.columns.TransformableColumnSet import org.jetbrains.kotlinx.dataframe.impl.columns.singleOrNullWithTransformerImpl import org.jetbrains.kotlinx.dataframe.impl.columns.transform @@ -27,18 +29,73 @@ import kotlin.reflect.KProperty // region DataColumn +/** + * Returns the first value in this [DataColumn]. + * + * @param T The type of the values in the [DataColumn]. + * + * @throws [IndexOutOfBoundsException] if the [DataColumn] is empty. + */ public fun DataColumn.first(): T = get(0) +/** + * Returns the first value in this [DataColumn]. If the [DataColumn] is empty, returns `null`. + * + * @param T The type of the values in the [DataColumn]. + */ public fun DataColumn.firstOrNull(): T? = if (size > 0) first() else null +/** + * Returns the first value in this [DataColumn] that matches the given [predicate]. + * + * ### Example + * ```kotlin + * // Select from the column "age" the first value where the age is greater than 17 + * df.age.first { it > 17 } + * ``` + * + * @param T The type of the values in the [DataColumn]. + * @param predicate A lambda expression used to select a value + * that satisfies a condition specified in this expression. 
+ * This predicate takes a value from the [DataColumn] as an input + * and returns `true` if the value satisfies the condition or `false` otherwise. + * + * @throws [NoSuchElementException] if the [DataColumn] contains no element matching the [predicate] + * (including the case when the [DataColumn] is empty). + */ public fun DataColumn.first(predicate: (T) -> Boolean): T = values.first(predicate) +/** + * Returns the first value in this [DataColumn] that matches the given [predicate]. + * Returns `null` if the [DataColumn] contains no element matching the [predicate] + * (including the case when the [DataColumn] is empty). + * + * ### Example + * ```kotlin + * // Select from the column "age" the first value where the age is greater than 17, + * // or null if there is no such value + * df.age.firstOrNull { it > 17 } + * ``` + * + * @param T The type of the values in the [DataColumn]. + * @param predicate A lambda expression used to select a value + * that satisfies a condition specified in this expression. + * This predicate takes a value from the [DataColumn] as an input + * and returns `true` if the value satisfies the condition or `false` otherwise. + */ public fun DataColumn.firstOrNull(predicate: (T) -> Boolean): T? = values.firstOrNull(predicate) // endregion // region DataFrame +/** + * Returns the first row in this [DataFrame]. + * + * @param T The type of the [DataFrame]. + * + * @throws NoSuchElementException if the [DataFrame] contains no rows. + */ public fun DataFrame.first(): DataRow { if (nrow == 0) { throw NoSuchElementException("DataFrame has no rows. Use `firstOrNull`.") @@ -46,13 +103,67 @@ public fun DataFrame.first(): DataRow { return get(0) } +/** + * Returns the first row in this [DataFrame]. If the [DataFrame] does not contain any rows, returns `null`. + * + * @param T The type of the [DataFrame]. + */ public fun DataFrame.firstOrNull(): DataRow? 
= if (nrow > 0) first() else null +/** + * Returns the first row in this [DataFrame] that satisfies the given [predicate]. + * + * {@include [RowFilterDescription]} + * + * @include [SelectingColumns.ColumnGroupsAndNestedColumnsMention] + * + * ### Example + * ```kotlin + * // Select the first row where the value in the "age" column is greater than 17 + * // and the "name/firstName" column starts with 'A' + * df.first { age > 17 && name.firstName.startsWith("A") } + * ``` + * + * @param T The type of the [DataFrame]. + * @param predicate A lambda expression used to select a row + * that satisfies a condition specified in this expression. + * This predicate takes a row of the [DataFrame] as an input + * and returns `true` if the row satisfies the condition or `false` otherwise. + * + * @return A [DataRow] containing the first row that matches the given [predicate]. + * + * @throws [NoSuchElementException] if the [DataFrame] contains no rows matching the [predicate]. + */ public inline fun DataFrame.first(predicate: RowFilter): DataRow = rows().first { predicate(it, it) } +/** + * Returns the first row in this [DataFrame] that satisfies the given [predicate]. + * Returns `null` if the [DataFrame] contains no rows matching the [predicate] + * (including the case when the [DataFrame] is empty). + * + * {@include [RowFilterDescription]} + * + * @include [SelectingColumns.ColumnGroupsAndNestedColumnsMention] + * + * ### Example + * ```kotlin + * // Select the first row where the value in the "age" column is greater than 17 + * // and the "name/firstName" column starts with 'A' + * df.firstOrNull { age > 17 && name.firstName.startsWith("A") } + * ``` + * + * @param T The type of the [DataFrame]. + * @param predicate A lambda expression used to select a row + * that satisfies a condition specified in this expression. + * This predicate takes a row of the [DataFrame] as an input + * and returns `true` if the row satisfies the condition or `false` otherwise. 
+ * + * @return A [DataRow] containing the first row that matches the given [predicate], + * or `null` if the [DataFrame] contains no rows matching the [predicate] + */ public inline fun DataFrame.firstOrNull(predicate: RowFilter): DataRow? = rows().firstOrNull { predicate(it, it) @@ -62,9 +173,53 @@ public inline fun DataFrame.firstOrNull(predicate: RowFilter): DataRow // region GroupBy +/** + * Selects the first row from each group of the given [GroupBy] + * and returns a [ReducedGroupBy] containing these rows + * (one row per group, each row is the first row in its group). + * + * ### Example + * ```kotlin + * // Select the first employee from each group formed by the job title + * employees.groupBy { jobTitle }.first() + * ``` + * + * @param T The type of the values in the [GroupBy]. + * @param G The type of the groups in the [GroupBy]. + * + * @return A [ReducedGroupBy] containing the first row from each group. + */ @Interpretable("GroupByReducePredicate") public fun GroupBy.first(): ReducedGroupBy = reduce { firstOrNull() } +/** + * Selects from each group of the given [GroupBy] the first row satisfying the given [predicate], + * and returns a [ReducedGroupBy] containing these rows (one row per group, + * each row is the first row in its group that satisfies the [predicate]). + * + * If the group in [GroupBy] contains no matching rows, + * the corresponding row in [ReducedGroupBy] will contain `null` values for all columns in the group. + * + * {@include [RowFilterDescription]} + * + * @include [SelectingColumns.ColumnGroupsAndNestedColumnsMention] + * + * ### Example + * ```kotlin + * // Select the first employee older than 25 from each group formed by the job title + * employees.groupBy { jobTitle }.first { age > 25 } + * ``` + * + * @param T The type of the values in the [GroupBy]. + * @param G The type of the groups in the [GroupBy]. 
+ * @param predicate A lambda expression used to select a row + * that satisfies a condition specified in this expression. + * This predicate takes a row from a group of the [GroupBy] as an input + * and returns `true` if the row satisfies the condition or `false` otherwise. + * + * @return A [ReducedGroupBy] containing the first row matching the [predicate] + * (or a row with `null` values, except values in the column with the grouping key), from each group. + */ @Interpretable("GroupByReducePredicate") public fun GroupBy.first(predicate: RowFilter): ReducedGroupBy = reduce { firstOrNull(predicate) } @@ -72,16 +227,155 @@ public fun GroupBy.first(predicate: RowFilter): ReducedGroupBy Pivot.first(): ReducedPivot = reduce { firstOrNull() } +/** + * Reduces this [Pivot] by selecting from each group the first row satisfying the given [predicate]. + * + * Returns a [ReducedPivot] where: + * - each column corresponds to a [pivot] group — if multiple pivot keys were used, + * the result will contain column groups for each pivot key, with columns inside + * corresponding to the values of that key; + * - each value contains the first row from that group that satisfies the [predicate], + * or a row with `null` values if no rows in this group match the [predicate]. + * + * The original [Pivot] column structure is preserved. + * If the [Pivot] was created using multiple or nested keys + * (e.g., via [and][PivotDsl.and] or [then][PivotDsl.then]), + * the structure remains unchanged — only the contents of each group + * are replaced with the first row from that group that satisfies the [predicate]. + * + * Equivalent to `reduce { firstOrNull(predicate) }`. + * + * See also: + * - [pivot]; + * - common [reduce][Pivot.reduce]. + * + * {@include [RowFilterDescription]} + * + * @include [SelectingColumns.ColumnGroupsAndNestedColumnsMention] + * + * ### Example + * ```kotlin + * // Select the first row for each city where the population is greater than 100 000. 
+ * df.pivot { city }.first { population > 100000 } + * ``` + * + * @param predicate A lambda expression used to select a row + * that satisfies a condition specified in this expression. + * + * @return A [ReducedPivot] containing in each column the first row + * that satisfies the [predicate], from the corresponding group (or a row with `null` values). + */ public fun Pivot.first(predicate: RowFilter): ReducedPivot = reduce { firstOrNull(predicate) } // endregion // region PivotGroupBy +/** + * Reduces this [PivotGroupBy] by selecting the first row from each combined [pivot] + [groupBy] group. + * + * Returns a [ReducedPivotGroupBy] containing the following matrix: + * - one row per [groupBy] key (or set of keys); + * - one column group per [pivot] key, where each inner column corresponds to a value of that key; + * - each combination of a [groupBy] key and a [pivot] key contains either the first row of the corresponding + * dataframe formed by this pivot–group pair, or a row with `null` values if this dataframe is empty. + * + * The original [PivotGroupBy] column structure is preserved. + * If the [PivotGroupBy] was created using multiple or nested keys + * (e.g., via [and][PivotDsl.and] or [then][PivotDsl.then]), + * the result will contain nested column groups reflecting that key structure, + * with each group containing columns for the values of the corresponding key. + * + * Equivalent to `reduce { firstOrNull() }`. + * + * See also: + * - [pivot], [Pivot.groupBy] and [GroupBy.pivot]; + * - common [reduce][PivotGroupBy.reduce]. + * + * ### Example + * ```kotlin + * // Select the first student from each combination of faculty and enrollment year. 
+ * students.pivot { faculty }.groupBy { enrollmentYear }.first() + * ``` + * + * @return A [ReducedPivotGroupBy] containing in each combination of a [groupBy] key and a [pivot] key either + * the first row of the corresponding dataframe formed by this pivot–group pair, + * or a row with `null` values if this dataframe is empty. + */ public fun PivotGroupBy.first(): ReducedPivotGroupBy = reduce { firstOrNull() } +/** + * Reduces this [PivotGroupBy] by selecting from each combined [pivot] + [groupBy] group + * the first row satisfying the given [predicate]. + * + * Returns a [ReducedPivotGroupBy] containing the following matrix: + * - one row per [groupBy] key (or set of keys); + * - one column group per [pivot] key, where each inner column corresponds to a value of that key; + * - each combination of a [groupBy] key and a [pivot] key contains either the first row matching the [predicate] + * in the corresponding dataframe formed by this pivot–group pair, + * or a row with `null` values if this dataframe does not contain any rows matching the [predicate]. + * + * The original [PivotGroupBy] column structure is preserved. + * If the [PivotGroupBy] was created using multiple or nested keys + * (e.g., via [and][PivotDsl.and] or [then][PivotDsl.then]), + * the result will contain nested column groups reflecting that key structure, + * with each group containing columns for the values of the corresponding key. + * + * Equivalent to `reduce { firstOrNull(predicate) }`. + * + * See also: + * - [pivot], [Pivot.groupBy] and [GroupBy.pivot]; + * - common [reduce][PivotGroupBy.reduce]. + * + * {@include [RowFilterDescription]} + * + * @include [SelectingColumns.ColumnGroupsAndNestedColumnsMention] + * + * ### Example + * ```kotlin + * // From each combination of faculty and enrollment year select the first student older than 21. 
+ * students.pivot { faculty }.groupBy { enrollmentYear }.first { age > 21 } + * ``` + * + * @param predicate A lambda expression used to select a row + * that satisfies a condition specified in this expression. + * + * @return A [ReducedPivotGroupBy] containing in each combination of a [groupBy] key and a [pivot] key either + * the first row matching the [predicate] in the corresponding dataframe formed by this pivot–group pair, + * or a row with `null` values if this dataframe does not contain any rows matching the [predicate]. + */ public fun PivotGroupBy.first(predicate: RowFilter): ReducedPivotGroupBy = reduce { firstOrNull(predicate) } diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/first.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/first.kt index c52ceab216..84f9342a4a 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/first.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/first.kt @@ -1,8 +1,14 @@ package org.jetbrains.kotlinx.dataframe.api import io.kotest.assertions.throwables.shouldThrow +import io.kotest.matchers.shouldBe +import org.jetbrains.kotlinx.dataframe.nrow +import org.jetbrains.kotlinx.dataframe.samples.api.age import org.jetbrains.kotlinx.dataframe.samples.api.firstName +import org.jetbrains.kotlinx.dataframe.samples.api.isHappy +import org.jetbrains.kotlinx.dataframe.samples.api.lastName import org.jetbrains.kotlinx.dataframe.samples.api.name +import org.jetbrains.kotlinx.dataframe.samples.api.weight import org.junit.Test class FirstTests : ColumnsSelectionDslTests() { @@ -36,8 +42,227 @@ class FirstTests : ColumnsSelectionDslTests() { df.select { "name".firstCol { col -> col.any { it == "Alice" } } }, df.select { Person::name.firstCol { col -> col.any { it == "Alice" } } }, df.select { NonDataSchemaPerson::name.firstCol { col -> col.any { it == "Alice" } } }, + df.select { pathOf("name").firstCol() }, df.select { pathOf("name").firstCol { col -> col.any { it == 
"Alice" } } }, df.select { it["name"].asColumnGroup().firstCol { col -> col.any { it == "Alice" } } }, ).shouldAllBeEqual() } + + @Test + fun `first on DataColumn`() { + df.name.lastName.first() shouldBe "Cooper" + df.age.first { it in 18..<40 } shouldBe 20 + + shouldThrow { + df.drop(df.nrow).isHappy.first() + } + } + + @Test + fun `firstOrNull on DataColumn`() { + df.name.lastName.firstOrNull() shouldBe "Cooper" + df.drop(2).weight.firstOrNull() shouldBe null + df.drop(df.nrow).age.firstOrNull() shouldBe null + + df.age.firstOrNull { it in 21..30 } shouldBe 30 + df.age.firstOrNull { it > 50 } shouldBe null + } + + @Test + fun `first on DataFrame`() { + df.first().name.lastName shouldBe "Cooper" + df.first { !isHappy }.name.lastName shouldBe "Daniels" + + shouldThrow { + df.drop(df.nrow).first() + } + shouldThrow { + df.first { age > 50 } + } + shouldThrow { + df.drop(df.nrow).first { isHappy } + } + } + + @Test + fun `firstOrNull on DataFrame`() { + df.firstOrNull()?.name?.lastName shouldBe "Cooper" + df.drop(df.nrow).firstOrNull() shouldBe null + + df.firstOrNull { !isHappy }?.name?.lastName shouldBe "Daniels" + df.firstOrNull { age > 50 } shouldBe null + df.drop(df.nrow).firstOrNull { isHappy } shouldBe null + } + + @Test + fun `first on GroupBy`() { + val grouped = df.groupBy { isHappy } + val reducedGrouped = grouped.first() + val firstHappy = reducedGrouped.values()[0] + val firstUnhappy = reducedGrouped.values()[1] + + firstHappy shouldBe dataFrameOf( + "isHappy" to columnOf(true), + "name" to columnOf( + "firstName" to columnOf("Alice"), + "lastName" to columnOf("Cooper"), + ), + "age" to columnOf(15), + "city" to columnOf("London"), + "weight" to columnOf(54), + )[0] + + firstUnhappy shouldBe dataFrameOf( + "isHappy" to columnOf(false), + "name" to columnOf( + "firstName" to columnOf("Charlie"), + "lastName" to columnOf("Daniels") + ), + "age" to columnOf(20), + "city" to columnOf("Moscow"), + "weight" to columnOf(null), + )[0] + } + + @Test + fun `first 
on GroupBy with predicate`() { + val grouped = df.groupBy { isHappy } + val reducedGrouped = grouped.first{ it["age"] as Int > 17 && it["city"] != "Moscow" } + val firstHappy = reducedGrouped.values()[0] + val firstUnhappy = reducedGrouped.values()[1] + + firstHappy shouldBe dataFrameOf( + "isHappy" to columnOf(true), + "name" to columnOf( + "firstName" to columnOf("Bob"), + "lastName" to columnOf("Dylan"), + ), + "age" to columnOf(45), + "city" to columnOf("Dubai"), + "weight" to columnOf(87), + )[0] + + firstUnhappy shouldBe dataFrameOf( + "isHappy" to columnOf(false), + "name" to columnOf( + "firstName" to columnOf("Alice"), + "lastName" to columnOf("Wolf") + ), + "age" to columnOf(20), + "city" to columnOf(null), + "weight" to columnOf(55), + )[0] + } + + @Test + fun `first on Pivot`() { + val pivot = df.pivot { isHappy } + val reducedPivot = pivot.first() + val firstHappy = reducedPivot.values()[0] + val firstUnhappy = reducedPivot.values()[1] + firstHappy shouldBe dataFrameOf( + "name" to columnOf( + "firstName" to columnOf("Alice"), + "lastName" to columnOf("Cooper") + ), + "age" to columnOf(15), + "city" to columnOf("London"), + "weight" to columnOf(54), + )[0] + + firstUnhappy shouldBe dataFrameOf( + "name" to columnOf( + "firstName" to columnOf("Charlie"), + "lastName" to columnOf("Daniels") + ), + "age" to columnOf(20), + "city" to columnOf("Moscow"), + "weight" to columnOf(null), + )[0] + } + + @Test + fun `first on Pivot with predicate`() { + val pivot = df.pivot { isHappy } + val reducedPivotAdults = pivot.first { age > 17 } + val firstHappyAdult = reducedPivotAdults.values()[0] + val firstUnhappyAdult = reducedPivotAdults.values()[1] + + firstHappyAdult shouldBe dataFrameOf( + "name" to columnOf( + "firstName" to columnOf("Bob"), + "lastName" to columnOf("Dylan") + ), + "age" to columnOf(45), + "city" to columnOf("Dubai"), + "weight" to columnOf(87), + )[0] + + firstUnhappyAdult shouldBe dataFrameOf( + "name" to columnOf( + "firstName" to 
columnOf("Charlie"), + "lastName" to columnOf("Daniels") + ), + "age" to columnOf(20), + "city" to columnOf("Moscow"), + "weight" to columnOf(null), + )[0] + } + + @Test + fun `first on PivotGroupBy`() { + val students = dataFrameOf( + "name" to columnOf("Alice", "Alice", "Alice", "Alice", "Bob", "Bob", "Bob", "Bob"), + "age" to columnOf(15, 15, 20, 20, 15, 15, 20, 20), + "group" to columnOf(1, 2, 1, 2, 1, 2, 1, 2) + ) + val studentsPivotGrouped = students.pivot("age").groupBy("name") + val studentsPivotGroupedReduced = studentsPivotGrouped.first().values() + val expectedDf = dataFrameOf( + "name" to columnOf("Alice", "Bob"), + "age" to columnOf( + "15" to columnOf(1, 1), + "20" to columnOf(1, 1), + ) + ) + studentsPivotGroupedReduced shouldBe expectedDf + } + + @Test + fun `first on PivotGroupBy with predicate`() { + val students = dataFrameOf( + "name" to columnOf("Alice", "Alice", "Alice", "Alice", "Bob", "Bob", "Bob", "Bob"), + "age" to columnOf(15, 15, 20, 20, 15, 15, 20, 20), + "group" to columnOf(1, 2, 1, 2, 1, 2, 1, 2) + ) + val studentsPivotGrouped = students.pivot("age").groupBy("name") + val studentsPivotGroupedReduced = studentsPivotGrouped.first { it["group"] == 2 }.values() + val expected = dataFrameOf( + "name" to columnOf("Alice", "Bob"), + "age" to columnOf( + "15" to columnOf(2, 2), + "20" to columnOf(2, 2), + ) + ) + studentsPivotGroupedReduced shouldBe expected + } + + @Test + fun `first on PivotGroupBy with predicate without match`() { + val students = dataFrameOf( + "name" to columnOf("Alice", "Alice", "Alice", "Alice", "Bob", "Bob", "Bob", "Bob"), + "age" to columnOf(15, 15, 20, 20, 15, 15, 20, 20), + "group" to columnOf(1, 2, 1, 2, 1, 2, 1, 2) + ) + val studentsPivotGrouped = students.pivot("age").groupBy("name") + val studentsPivotGroupedReduced = studentsPivotGrouped.first { it["group"] == 3 }.values() + val expected = dataFrameOf( + "name" to columnOf("Alice", "Bob"), + "age" to columnOf( + "15" to columnOf(null, null), + "20" to 
columnOf(null, null), + ) + ) + studentsPivotGroupedReduced shouldBe expected + } }