Skip to content

Commit f1bc489

Browse files
committed
Merge branch 'master' into df-schema-matches
2 parents 8c69c93 + 2e2abad commit f1bc489

File tree

57 files changed

+2983
-1246
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

57 files changed

+2983
-1246
lines changed

core/api/core.api

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4462,7 +4462,9 @@ public final class org/jetbrains/kotlinx/dataframe/api/TakeKt {
44624462
}
44634463

44644464
public final class org/jetbrains/kotlinx/dataframe/api/ToDataFrameKt {
4465+
public static final fun toDataFrame (Ljava/util/List;Ljava/util/List;Z)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
44654466
public static final fun toDataFrame (Ljava/util/Map;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
4467+
public static synthetic fun toDataFrame$default (Ljava/util/List;Ljava/util/List;ZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
44664468
public static final fun toDataFrameAnyColumn (Ljava/lang/Iterable;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
44674469
public static final fun toDataFrameColumnPathAnyNullable (Ljava/lang/Iterable;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
44684470
public static final fun toDataFrameColumnPathAnyNullable (Ljava/util/Map;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
@@ -6811,11 +6813,15 @@ public final class org/jetbrains/kotlinx/dataframe/schema/ComparisonMode : java/
68116813
}
68126814

68136815
public abstract interface class org/jetbrains/kotlinx/dataframe/schema/DataFrameSchema {
6816+
public static final field Companion Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema$Companion;
68146817
public abstract fun compare (Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema;Lorg/jetbrains/kotlinx/dataframe/schema/ComparisonMode;)Lorg/jetbrains/kotlinx/dataframe/schema/CompareResult;
68156818
public static synthetic fun compare$default (Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema;Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema;Lorg/jetbrains/kotlinx/dataframe/schema/ComparisonMode;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/schema/CompareResult;
68166819
public abstract fun getColumns ()Ljava/util/Map;
68176820
}
68186821

6822+
public final class org/jetbrains/kotlinx/dataframe/schema/DataFrameSchema$Companion {
6823+
}
6824+
68196825
public final class org/jetbrains/kotlinx/dataframe/util/DeprecationMessagesKt {
68206826
public static final field DF_READ_EXCEL Ljava/lang/String;
68216827
}

core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2678,4 +2678,4 @@ public fun <T, C> Convert<T, List<List<C>>>.toDataFrames(containsColumns: Boolea
26782678
* @return A new [DataColumn] with the values converted to [DataFrame].
26792679
*/
26802680
public fun <T> DataColumn<List<List<T>>>.toDataFrames(containsColumns: Boolean = false): DataColumn<AnyFrame> =
2681-
map { it.toDataFrame(containsColumns) }
2681+
map { it.toDataFrame(containsColumns = containsColumns) }

core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/toDataFrame.kt

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,3 +256,49 @@ public fun Map<ColumnPath, Iterable<Any?>>.toDataFrame(): AnyFrame =
256256
}.toDataFrameFromPairs<Unit>()
257257

258258
// endregion
259+
260+
/**
261+
* Converts a list of lists into a [DataFrame].
262+
*
263+
* By default, treats each inner list as a row. If [header] is `null`, the first inner list becomes the header (column names), and the remaining lists are treated as data.
264+
*
265+
* With [containsColumns] = `true`, interprets each inner list as a column.
266+
* If [header] is `null`, the first element of each inner list is used as that column's name, and the remaining elements as its values.
267+
*
268+
* @param T The type of elements contained in the nested lists.
269+
* @param containsColumns If `true`, treats each nested list as a column.
270+
* Otherwise, each nested list is a row.
271+
* Defaults to `false`.
272+
* @param header Column names to use. When non-null, it overrides extraction of column names from the lists - all values are treated as data instead. Pass `null` to take the column names from the lists.
273+
* @return A [DataFrame] containing the data from the nested list structure.
274+
* Returns an empty [DataFrame] if the input is empty or invalid.
275+
*/
276+
@Refine
277+
@Interpretable("ValuesListsToDataFrame")
278+
public fun <T> List<List<T>>.toDataFrame(header: List<String>?, containsColumns: Boolean = false): AnyFrame =
279+
when {
280+
containsColumns -> {
281+
mapIndexedNotNull { index, list ->
282+
if (list.isEmpty()) return@mapIndexedNotNull null
283+
val name = header?.get(index) ?: list[0].toString()
284+
val values = if (header == null) list.drop(1) else list
285+
createColumnGuessingType(name, values)
286+
}.toDataFrame()
287+
}
288+
289+
isEmpty() -> DataFrame.Empty
290+
291+
else -> {
292+
val data = if (header == null) drop(1) else this
293+
(header ?: get(0).map { it.toString() }).mapIndexed { colIndex, name ->
294+
val values = data.map { row ->
295+
if (row.size <= colIndex) {
296+
null
297+
} else {
298+
row[colIndex]
299+
}
300+
}
301+
createColumnGuessingType(name, values)
302+
}.toDataFrame()
303+
}
304+
}

core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/common.kt

Lines changed: 7 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,10 @@ import org.apache.commons.io.input.BOMInputStream
44
import org.jetbrains.kotlinx.dataframe.AnyFrame
55
import org.jetbrains.kotlinx.dataframe.DataFrame
66
import org.jetbrains.kotlinx.dataframe.api.toDataFrame
7-
import org.jetbrains.kotlinx.dataframe.impl.columns.createColumnGuessingType
87
import org.jetbrains.kotlinx.dataframe.util.IS_URL
98
import org.jetbrains.kotlinx.dataframe.util.IS_URL_IMPORT
109
import org.jetbrains.kotlinx.dataframe.util.IS_URL_REPLACE
10+
import org.jetbrains.kotlinx.dataframe.util.LISTS_TO_DATAFRAME_MIGRATION
1111
import java.io.File
1212
import java.io.InputStream
1313
import java.net.HttpURLConnection
@@ -45,48 +45,13 @@ public fun catchHttpResponse(url: URL, body: (InputStream) -> AnyFrame): AnyFram
4545
}
4646
}
4747

48-
/**
49-
* Converts a list of lists into a [DataFrame].
50-
*
51-
* By default, treats the first inner list as a header (column names), and the remaining lists as rows.
52-
* If [containsColumns] is `true`, interprets each inner list as a column,
53-
* where the first element is used as the column name, and the remaining elements as values.
54-
*
55-
* @param T The type of elements contained in the nested lists.
56-
* @param containsColumns If `true`, treats each nested list as a column with its first element as the column name.
57-
* Otherwise, the first list is treated as the header.
58-
* Defaults to `false`.
59-
* @return A [DataFrame] containing the data from the nested list structure.
60-
* Returns an empty [DataFrame] if the input is empty or invalid.
61-
*/
48+
@Deprecated(
49+
LISTS_TO_DATAFRAME_MIGRATION,
50+
ReplaceWith("this.toDataFrame(header = null, containsColumns)", "org.jetbrains.kotlinx.dataframe.api.toDataFrame"),
51+
level = DeprecationLevel.WARNING,
52+
)
6253
public fun <T> List<List<T>>.toDataFrame(containsColumns: Boolean = false): AnyFrame =
63-
when {
64-
containsColumns -> {
65-
mapNotNull {
66-
if (it.isEmpty()) return@mapNotNull null
67-
val name = it[0].toString()
68-
val values = it.drop(1)
69-
createColumnGuessingType(name, values)
70-
}.toDataFrame()
71-
}
72-
73-
isEmpty() -> DataFrame.Empty
74-
75-
else -> {
76-
val header = get(0).map { it.toString() }
77-
val data = drop(1)
78-
header.mapIndexed { colIndex, name ->
79-
val values = data.map { row ->
80-
if (row.size <= colIndex) {
81-
null
82-
} else {
83-
row[colIndex]
84-
}
85-
}
86-
createColumnGuessingType(name, values)
87-
}.toDataFrame()
88-
}
89-
}
54+
toDataFrame(header = null, containsColumns)
9055

9156
@Deprecated(
9257
message = IS_URL,

core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/KotlinNotebookPluginUtils.kt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -236,13 +236,13 @@ public object KotlinNotebookPluginUtils {
236236

237237
is FormattedFrame<*> -> dataframeLike.df
238238

239-
is AnyCol -> dataFrameOf(dataframeLike)
239+
is AnyFrame -> dataframeLike
240240

241241
is AnyRow -> dataframeLike.toDataFrame()
242242

243243
is GroupBy<*, *> -> dataframeLike.toDataFrame()
244244

245-
is AnyFrame -> dataframeLike
245+
is AnyCol -> dataFrameOf(dataframeLike)
246246

247247
is DisableRowsLimitWrapper -> dataframeLike.value
248248

core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/schema/DataFrameSchema.kt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package org.jetbrains.kotlinx.dataframe.schema
22

33
public interface DataFrameSchema {
4+
public companion object;
45

56
public val columns: Map<String, ColumnSchema>
67

core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/util/deprecationMessages.kt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,9 @@ internal const val GET_ROWS_RANGE_REPLACE = "df().getRows(indices)"
257257
internal const val GET_ROW_OR_NULL_REPLACE = "df().getRowOrNull(index)"
258258
internal const val COPY_REPLACE = "columns().toDataFrame().cast()"
259259

260+
internal const val LISTS_TO_DATAFRAME_MIGRATION =
261+
"Function moved from io to api package, and a new `header` parameter is introduced. $MESSAGE_1_1"
262+
260263
// endregion
261264

262265
// region keep across releases

core/generated-sources/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/toDataFrame.kt

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -700,4 +700,92 @@ class CreateDataFrameTests {
700700
val df = list.toDataFrame(maxDepth = 2)
701701
df["map"].type() shouldBe typeOf<Map<String, Int>>()
702702
}
703+
704+
@Test
705+
fun `parsing row-major lines into structured dataframe`() {
706+
// I think finding data in such format will be rare, so we need an optional header parameter.
707+
val lines = buildList {
708+
addAll(listOf("stamp", "header", "data"))
709+
repeat(33) { row ->
710+
add("stamp $row")
711+
add("header $row")
712+
add("data $row")
713+
}
714+
}
715+
716+
val df = lines.chunked(3).toDataFrame(header = null)
717+
718+
df.columnNames() shouldBe listOf("stamp", "header", "data")
719+
df.columnTypes() shouldBe listOf(typeOf<String>(), typeOf<String>(), typeOf<String>())
720+
df.rowsCount() shouldBe 33
721+
df[0].values() shouldBe listOf("stamp 0", "header 0", "data 0")
722+
}
723+
724+
@Test
725+
fun `parsing srt lines into structured dataframe`() {
726+
// *.srt subtitle file format
727+
val lines = buildList {
728+
repeat(33) { row ->
729+
add("stamp $row")
730+
add("header $row")
731+
add("data $row")
732+
add("\n")
733+
}
734+
}
735+
736+
val df = lines.chunked(4).map { it.dropLast(1) }.toDataFrame(header = listOf("stamp", "header", "data"))
737+
738+
df.columnNames() shouldBe listOf("stamp", "header", "data")
739+
df.columnTypes() shouldBe listOf(typeOf<String>(), typeOf<String>(), typeOf<String>())
740+
df.rowsCount() shouldBe 33
741+
df[0].values() shouldBe listOf("stamp 0", "header 0", "data 0")
742+
743+
// Different approach. I think the dropLast one is better
744+
lines.chunked(4)
745+
.toDataFrame(header = listOf("stamp", "header", "data", "whitespace"))
746+
.remove("whitespace") shouldBe df
747+
}
748+
749+
@Test
750+
fun `parsing column-major lines into structured dataframe`() {
751+
val lines = buildList {
752+
repeat(4) { col ->
753+
repeat(5) { row ->
754+
add("data$col $row")
755+
}
756+
add("\n")
757+
}
758+
}
759+
760+
val header = List(4) { "col $it" }
761+
val df = lines
762+
.chunked(6)
763+
.map { it.dropLast(1) }
764+
.toDataFrame(header = header, containsColumns = true)
765+
df.columnNames() shouldBe header
766+
df.columnTypes() shouldBe List(4) { typeOf<String>() }
767+
df["col 0"].values() shouldBe listOf("data0 0", "data0 1", "data0 2", "data0 3", "data0 4")
768+
}
769+
770+
@Test
771+
fun `parsing column-major lines with header into structured dataframe`() {
772+
val lines = buildList {
773+
repeat(4) { col ->
774+
add("col $col")
775+
repeat(5) { row ->
776+
add("data$col $row")
777+
}
778+
add("\n")
779+
}
780+
}
781+
782+
val header = List(4) { "col $it" }
783+
val df = lines
784+
.chunked(7)
785+
.map { it.dropLast(1) }
786+
.toDataFrame(header = null, containsColumns = true)
787+
df.columnNames() shouldBe header
788+
df.columnTypes() shouldBe List(4) { typeOf<String>() }
789+
df["col 0"].values() shouldBe listOf("data0 0", "data0 1", "data0 2", "data0 3", "data0 4")
790+
}
703791
}

core/generated-sources/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/Create.kt

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -425,4 +425,22 @@ class Create : TestBase() {
425425
val df = files.toDataFrame(columnName = "data")
426426
// SampleEnd
427427
}
428+
429+
@Test
430+
@TransformDataFrameExpressions
431+
fun toDataFrameLists() {
432+
// SampleStart
433+
val lines = """
434+
1
435+
00:00:05,000 --> 00:00:07,500
436+
This is the first subtitle.
437+
438+
2
439+
00:00:08,000 --> 00:00:10,250
440+
This is the second subtitle.
441+
""".trimIndent().lines()
442+
443+
lines.chunked(4) { it.take(3) }.toDataFrame(header = listOf("n", "timestamp", "text"))
444+
// SampleEnd
445+
}
428446
}

core/generated-sources/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/Modify.kt

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -536,9 +536,7 @@ class Modify : TestBase() {
536536
@TransformDataFrameExpressions
537537
fun splitRegex1() {
538538
// SampleStart
539-
val name by column<String>()
540-
541-
merged.split { name }
539+
merged.split { "name"<String>() }
542540
.match("""(.*) \((.*)\)""")
543541
.inward("firstName", "lastName")
544542
// SampleEnd
@@ -557,11 +555,12 @@ class Modify : TestBase() {
557555
7, 8,
558556
9, 10,
559557
)
560-
val group by columnOf(df1, df2)
561-
val id by columnOf("x", "y")
562-
val df = dataFrameOf(id, group)
558+
val df = dataFrameOf(
559+
"id" to columnOf("x", "y"),
560+
"group" to columnOf(df1, df2)
561+
)
563562

564-
df.split { group }.intoColumns()
563+
df.split { "group"<AnyFrame>() }.intoColumns()
565564
// SampleEnd
566565
}
567566

0 commit comments

Comments
 (0)