Skip to content

Commit f3c62ce

Browse files
committed
Improve list.chunked() + List<List<T>>.toDataFrame use case
1 parent e760bf2 commit f3c62ce

File tree

4 files changed

+110
-16
lines changed

4 files changed

+110
-16
lines changed

core/api/core.api

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6099,8 +6099,8 @@ public final class org/jetbrains/kotlinx/dataframe/io/CommonKt {
60996099
public static final fun isURL (Ljava/lang/String;)Z
61006100
public static final fun isUrl (Ljava/lang/String;)Z
61016101
public static final fun skippingBomCharacters (Ljava/io/InputStream;)Ljava/io/InputStream;
6102-
public static final fun toDataFrame (Ljava/util/List;Z)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
6103-
public static synthetic fun toDataFrame$default (Ljava/util/List;ZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
6102+
public static final fun toDataFrame (Ljava/util/List;Ljava/util/List;Z)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
6103+
public static synthetic fun toDataFrame$default (Ljava/util/List;Ljava/util/List;ZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
61046104
public static final fun urlAsFile (Ljava/net/URL;)Ljava/io/File;
61056105
}
61066106

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2525,4 +2525,4 @@ public fun <T, C> Convert<T, List<List<C>>>.toDataFrames(containsColumns: Boolea
25252525
* @return A new [DataColumn] with the values converted to [DataFrame].
25262526
*/
25272527
public fun <T> DataColumn<List<List<T>>>.toDataFrames(containsColumns: Boolean = false): DataColumn<AnyFrame> =
2528-
map { it.toDataFrame(containsColumns) }
2528+
map { it.toDataFrame(containsColumns = containsColumns) }

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/common.kt

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ package org.jetbrains.kotlinx.dataframe.io
33
import org.apache.commons.io.input.BOMInputStream
44
import org.jetbrains.kotlinx.dataframe.AnyFrame
55
import org.jetbrains.kotlinx.dataframe.DataFrame
6+
import org.jetbrains.kotlinx.dataframe.annotations.Interpretable
7+
import org.jetbrains.kotlinx.dataframe.annotations.Refine
68
import org.jetbrains.kotlinx.dataframe.api.toDataFrame
79
import org.jetbrains.kotlinx.dataframe.impl.columns.createColumnGuessingType
810
import org.jetbrains.kotlinx.dataframe.util.IS_URL
@@ -48,34 +50,37 @@ public fun catchHttpResponse(url: URL, body: (InputStream) -> AnyFrame): AnyFram
4850
/**
4951
* Converts a list of lists into a [DataFrame].
5052
*
51-
* By default, treats the first inner list as a header (column names), and the remaining lists as rows.
52-
* If [containsColumns] is `true`, interprets each inner list as a column,
53-
* where the first element is used as the column name, and the remaining elements as values.
53+
* By default, treats each inner list as a row. If [header] is not provided, the first inner list is used as the header (column names), and the remaining lists are treated as data.
54+
*
55+
* With [containsColumns] = `true`, interprets each inner list as a column.
56+
* If [header] is not provided, the first element will be used as the column name, and the remaining elements as values.
5457
*
5558
* @param T The type of elements contained in the nested lists.
56-
* @param containsColumns If `true`, treats each nested list as a column with its first element as the column name.
57-
* Otherwise, the first list is treated as the header.
59+
* @param containsColumns If `true`, treats each nested list as a column.
60+
* Otherwise, each nested list is a row.
5861
* Defaults to `false`.
62+
* @param header Column names to use. When provided, no names are extracted from the lists and all values are treated as data. Defaults to `null`.
5963
* @return A [DataFrame] containing the data from the nested list structure.
6064
* Returns an empty [DataFrame] if the input is empty or invalid.
6165
*/
62-
public fun <T> List<List<T>>.toDataFrame(containsColumns: Boolean = false): AnyFrame =
66+
@Refine
67+
@Interpretable("ValuesListsToDataFrame")
68+
public fun <T> List<List<T>>.toDataFrame(header: List<String>? = null, containsColumns: Boolean = false): AnyFrame =
6369
when {
6470
containsColumns -> {
65-
mapNotNull {
66-
if (it.isEmpty()) return@mapNotNull null
67-
val name = it[0].toString()
68-
val values = it.drop(1)
71+
mapIndexedNotNull { index, list ->
72+
if (list.isEmpty()) return@mapIndexedNotNull null
73+
val name = header?.get(index) ?: list[0].toString()
74+
val values = if (header == null) list.drop(1) else list
6975
createColumnGuessingType(name, values)
7076
}.toDataFrame()
7177
}
7278

7379
isEmpty() -> DataFrame.Empty
7480

7581
else -> {
76-
val header = get(0).map { it.toString() }
77-
val data = drop(1)
78-
header.mapIndexed { colIndex, name ->
82+
val data = if (header == null) drop(1) else this
83+
(header ?: get(0).map { it.toString() }).mapIndexed { colIndex, name ->
7984
val values = data.map { row ->
8085
if (row.size <= colIndex) {
8186
null

core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/toDataFrame.kt

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import org.jetbrains.kotlinx.dataframe.DataRow
1616
import org.jetbrains.kotlinx.dataframe.annotations.ColumnName
1717
import org.jetbrains.kotlinx.dataframe.annotations.DataSchema
1818
import org.jetbrains.kotlinx.dataframe.columns.ColumnKind
19+
import org.jetbrains.kotlinx.dataframe.io.toDataFrame
1920
import org.jetbrains.kotlinx.dataframe.kind
2021
import org.jetbrains.kotlinx.dataframe.type
2122
import org.junit.Test
@@ -700,4 +701,92 @@ class CreateDataFrameTests {
700701
val df = list.toDataFrame(maxDepth = 2)
701702
df["map"].type() shouldBe typeOf<Map<String, Int>>()
702703
}
704+
705+
@Test
706+
fun `parsing row-major lines into structured dataframe`() {
707+
// Data in this format (column names in the first chunk) is probably rare in practice, which is why `toDataFrame` also accepts an optional `header` parameter.
708+
val lines = buildList {
709+
addAll(listOf("stamp", "header", "data"))
710+
repeat(33) { row ->
711+
add("stamp $row")
712+
add("header $row")
713+
add("data $row")
714+
}
715+
}
716+
717+
val df = lines.chunked(3).toDataFrame()
718+
719+
df.columnNames() shouldBe listOf("stamp", "header", "data")
720+
df.columnTypes() shouldBe listOf(typeOf<String>(), typeOf<String>(), typeOf<String>())
721+
df.rowsCount() shouldBe 33
722+
df[0].values() shouldBe listOf("stamp 0", "header 0", "data 0")
723+
}
724+
725+
@Test
726+
fun `parsing srt lines into structured dataframe`() {
727+
// *.srt subtitle file format
728+
val lines = buildList {
729+
repeat(33) { row ->
730+
add("stamp $row")
731+
add("header $row")
732+
add("data $row")
733+
add("\n")
734+
}
735+
}
736+
737+
val df = lines.chunked(4).map { it.dropLast(1) }.toDataFrame(header = listOf("stamp", "header", "data"))
738+
739+
df.columnNames() shouldBe listOf("stamp", "header", "data")
740+
df.columnTypes() shouldBe listOf(typeOf<String>(), typeOf<String>(), typeOf<String>())
741+
df.rowsCount() shouldBe 33
742+
df[0].values() shouldBe listOf("stamp 0", "header 0", "data 0")
743+
744+
// Alternative approach: keep the separator line as a fourth column and remove that column afterwards. The `dropLast` variant above is preferable.
745+
lines.chunked(4)
746+
.toDataFrame(header = listOf("stamp", "header", "data", "whitespace"))
747+
.remove("whitespace") shouldBe df
748+
}
749+
750+
@Test
751+
fun `parsing column-major lines into structured dataframe`() {
752+
val lines = buildList {
753+
repeat(4) { col ->
754+
repeat(5) { row ->
755+
add("data$col $row")
756+
}
757+
add("\n")
758+
}
759+
}
760+
761+
val header = List(4) { "col $it" }
762+
val df = lines
763+
.chunked(6)
764+
.map { it.dropLast(1) }
765+
.toDataFrame(header = header, containsColumns = true)
766+
df.columnNames() shouldBe header
767+
df.columnTypes() shouldBe List(4) { typeOf<String>() }
768+
df["col 0"].values() shouldBe listOf("data0 0", "data0 1", "data0 2", "data0 3", "data0 4")
769+
}
770+
771+
@Test
772+
fun `parsing column-major lines with header into structured dataframe`() {
773+
val lines = buildList {
774+
repeat(4) { col ->
775+
add("col $col")
776+
repeat(5) { row ->
777+
add("data$col $row")
778+
}
779+
add("\n")
780+
}
781+
}
782+
783+
val header = List(4) { "col $it" }
784+
val df = lines
785+
.chunked(7)
786+
.map { it.dropLast(1) }
787+
.toDataFrame(containsColumns = true)
788+
df.columnNames() shouldBe header
789+
df.columnTypes() shouldBe List(4) { typeOf<String>() }
790+
df["col 0"].values() shouldBe listOf("data0 0", "data0 1", "data0 2", "data0 3", "data0 4")
791+
}
703792
}

0 commit comments

Comments
 (0)