
Commit 7e75b40

Author: Ubuntu (committed)
scan simple operator state
1 parent 8cab074 commit 7e75b40

File tree: 5 files changed (+427, -21 lines)

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/state/StateDataSource.scala

Lines changed: 51 additions & 17 deletions
@@ -66,28 +66,38 @@ class StateDataSource extends TableProvider with DataSourceRegister with Logging
     val sourceOptions = StateSourceOptions.modifySourceOptions(hadoopConf,
       StateSourceOptions.apply(session, hadoopConf, properties))
     val stateConf = buildStateStoreConf(sourceOptions.resolvedCpLocation, sourceOptions.batchId)
-    val stateStoreReaderInfo: StateStoreReaderInfo = getStoreMetadataAndRunChecks(
-      sourceOptions)
-
-    // The key state encoder spec should be available for all operators except stream-stream joins
-    val keyStateEncoderSpec = if (stateStoreReaderInfo.keyStateEncoderSpecOpt.isDefined) {
-      stateStoreReaderInfo.keyStateEncoderSpecOpt.get
+    if (sourceOptions.readAllColumnFamilies) {
+      // For readAllColumnFamilies mode, we don't need specific metadata
+      val keyStateEncoderSpec = NoPrefixKeyStateEncoderSpec(new StructType())
+      new StateTable(session, schema, sourceOptions, stateConf, keyStateEncoderSpec,
+        None, None, None, None)
     } else {
-      val keySchema = SchemaUtil.getSchemaAsDataType(schema, "key").asInstanceOf[StructType]
-      NoPrefixKeyStateEncoderSpec(keySchema)
-    }
+      val stateStoreReaderInfo: StateStoreReaderInfo = getStoreMetadataAndRunChecks(
+        sourceOptions)
 
-    new StateTable(session, schema, sourceOptions, stateConf, keyStateEncoderSpec,
-      stateStoreReaderInfo.transformWithStateVariableInfoOpt,
-      stateStoreReaderInfo.stateStoreColFamilySchemaOpt,
-      stateStoreReaderInfo.stateSchemaProviderOpt,
-      stateStoreReaderInfo.joinColFamilyOpt)
+      // The key state encoder spec should be available for all operators except stream-stream joins
+      val keyStateEncoderSpec = if (stateStoreReaderInfo.keyStateEncoderSpecOpt.isDefined) {
+        stateStoreReaderInfo.keyStateEncoderSpecOpt.get
+      } else {
+        val keySchema = SchemaUtil.getSchemaAsDataType(schema, "key").asInstanceOf[StructType]
+        NoPrefixKeyStateEncoderSpec(keySchema)
+      }
+      new StateTable(session, schema, sourceOptions, stateConf, keyStateEncoderSpec,
+        stateStoreReaderInfo.transformWithStateVariableInfoOpt,
+        stateStoreReaderInfo.stateStoreColFamilySchemaOpt,
+        stateStoreReaderInfo.stateSchemaProviderOpt,
+        stateStoreReaderInfo.joinColFamilyOpt)
+    }
   }
 
   override def inferSchema(options: CaseInsensitiveStringMap): StructType = {
     val sourceOptions = StateSourceOptions.modifySourceOptions(hadoopConf,
       StateSourceOptions.apply(session, hadoopConf, options))
-
+    if (sourceOptions.readAllColumnFamilies) {
+      // For readAllColumnFamilies mode, return the binary schema directly
+      return SchemaUtil.getSourceSchema(
+        sourceOptions, new StructType(), new StructType(), None, None)
+    }
     val stateStoreReaderInfo: StateStoreReaderInfo = getStoreMetadataAndRunChecks(
       sourceOptions)
     val oldSchemaFilePaths = StateDataSource.getOldSchemaFilePaths(sourceOptions, hadoopConf)
@@ -372,6 +382,7 @@ case class StateSourceOptions(
     stateVarName: Option[String],
     readRegisteredTimers: Boolean,
     flattenCollectionTypes: Boolean,
+    readAllColumnFamilies: Boolean,
     startOperatorStateUniqueIds: Option[Array[Array[String]]] = None,
     endOperatorStateUniqueIds: Option[Array[Array[String]]] = None) {
   def stateCheckpointLocation: Path = new Path(resolvedCpLocation, DIR_NAME_STATE)
@@ -380,7 +391,8 @@ case class StateSourceOptions(
     var desc = s"StateSourceOptions(checkpointLocation=$resolvedCpLocation, batchId=$batchId, " +
       s"operatorId=$operatorId, storeName=$storeName, joinSide=$joinSide, " +
       s"stateVarName=${stateVarName.getOrElse("None")}, +" +
-      s"flattenCollectionTypes=$flattenCollectionTypes"
+      s"flattenCollectionTypes=$flattenCollectionTypes" +
+      s"readAllColumnFamilies=$readAllColumnFamilies"
     if (fromSnapshotOptions.isDefined) {
       desc += s", snapshotStartBatchId=${fromSnapshotOptions.get.snapshotStartBatchId}"
       desc += s", snapshotPartitionId=${fromSnapshotOptions.get.snapshotPartitionId}"
@@ -407,6 +419,7 @@ object StateSourceOptions extends DataSourceOptions {
   val STATE_VAR_NAME = newOption("stateVarName")
   val READ_REGISTERED_TIMERS = newOption("readRegisteredTimers")
   val FLATTEN_COLLECTION_TYPES = newOption("flattenCollectionTypes")
+  val READ_ALL_COLUMN_FAMILIES = newOption("readAllColumnFamilies")
 
   object JoinSideValues extends Enumeration {
     type JoinSideValues = Value
@@ -492,6 +505,27 @@ object StateSourceOptions extends DataSourceOptions {
 
     val readChangeFeed = Option(options.get(READ_CHANGE_FEED)).exists(_.toBoolean)
 
+    val readAllColumnFamilies = try {
+      Option(options.get(READ_ALL_COLUMN_FAMILIES))
+        .map(_.toBoolean).getOrElse(false)
+    } catch {
+      case _: IllegalArgumentException =>
+        throw StateDataSourceErrors.invalidOptionValue(READ_ALL_COLUMN_FAMILIES,
+          "Boolean value is expected")
+    }
+
+    if (readAllColumnFamilies && stateVarName.isDefined) {
+      throw StateDataSourceErrors.conflictOptions(Seq(READ_ALL_COLUMN_FAMILIES, STATE_VAR_NAME))
+    }
+
+    if (readAllColumnFamilies && joinSide != JoinSideValues.none) {
+      throw StateDataSourceErrors.conflictOptions(Seq(READ_ALL_COLUMN_FAMILIES, JOIN_SIDE))
+    }
+
+    if (readAllColumnFamilies && readChangeFeed) {
+      throw StateDataSourceErrors.conflictOptions(Seq(READ_ALL_COLUMN_FAMILIES, READ_CHANGE_FEED))
+    }
+
     val changeStartBatchId = Option(options.get(CHANGE_START_BATCH_ID)).map(_.toLong)
     var changeEndBatchId = Option(options.get(CHANGE_END_BATCH_ID)).map(_.toLong)
 
@@ -616,7 +650,7 @@ object StateSourceOptions extends DataSourceOptions {
       resolvedCpLocation, batchId.get, operatorId, storeName, joinSide,
       readChangeFeed, fromSnapshotOptions, readChangeFeedOptions,
       stateVarName, readRegisteredTimers, flattenCollectionTypes,
-      startOperatorStateUniqueIds, endOperatorStateUniqueIds)
+      readAllColumnFamilies, startOperatorStateUniqueIds, endOperatorStateUniqueIds)
   }
 
   private def getLastCommittedBatch(session: SparkSession, checkpointLocation: String): Long = {
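
Usage note (not part of this diff): a minimal sketch of how the new option is expected to be passed through the state data source reader. The checkpoint location and batch id below are hypothetical, and per the checks above, combining readAllColumnFamilies with stateVarName, joinSide, or readChangeFeed raises a conflictOptions error.

    // Hypothetical usage sketch of the readAllColumnFamilies option added in this commit.
    val rawStateDf = spark.read
      .format("statestore")                      // state data source short name
      .option("batchId", 5)                      // hypothetical batch to read
      .option("readAllColumnFamilies", "true")   // new option
      .load("/tmp/checkpoints/stateful-query")   // hypothetical checkpoint location

    // In this mode the inferred schema is the binary one built by SchemaUtil.getSourceSchema:
    // partition_id, key_bytes, value_bytes, column_family_name.
    rawStateDf.printSchema()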

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/state/StatePartitionReader.scala

Lines changed: 89 additions & 2 deletions
@@ -20,12 +20,13 @@ import org.apache.spark.internal.Logging
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeRow}
 import org.apache.spark.sql.connector.read.{InputPartition, PartitionReader, PartitionReaderFactory}
+import org.apache.spark.sql.execution.datasources.v2.state.metadata.StateMetadataPartitionReader
 import org.apache.spark.sql.execution.datasources.v2.state.utils.SchemaUtil
 import org.apache.spark.sql.execution.streaming.operators.stateful.join.SymmetricHashJoinStateManager
 import org.apache.spark.sql.execution.streaming.operators.stateful.transformwithstate.{StateVariableType, TransformWithStateVariableInfo}
 import org.apache.spark.sql.execution.streaming.state._
 import org.apache.spark.sql.execution.streaming.state.RecordType.{getRecordTypeAsString, RecordType}
-import org.apache.spark.sql.types.{NullType, StructField, StructType}
+import org.apache.spark.sql.types.{BinaryType, NullType, StructField, StructType}
 import org.apache.spark.unsafe.types.UTF8String
 import org.apache.spark.util.{NextIterator, SerializableConfiguration}
 
@@ -49,7 +50,10 @@ class StatePartitionReaderFactory(
 
   override def createReader(partition: InputPartition): PartitionReader[InternalRow] = {
     val stateStoreInputPartition = partition.asInstanceOf[StateStoreInputPartition]
-    if (stateStoreInputPartition.sourceOptions.readChangeFeed) {
+    if (stateStoreInputPartition.sourceOptions.readAllColumnFamilies) {
+      new StatePartitionReaderAllColumnFamilies(storeConf, hadoopConf,
+        stateStoreInputPartition, schema)
+    } else if (stateStoreInputPartition.sourceOptions.readChangeFeed) {
       new StateStoreChangeDataPartitionReader(storeConf, hadoopConf,
         stateStoreInputPartition, schema, keyStateEncoderSpec, stateVariableInfoOpt,
         stateStoreColFamilySchemaOpt, stateSchemaProviderOpt, joinColFamilyOpt)
@@ -84,13 +88,17 @@ abstract class StatePartitionReaderBase(
   protected val keySchema = {
     if (SchemaUtil.checkVariableType(stateVariableInfoOpt, StateVariableType.MapState)) {
       SchemaUtil.getCompositeKeySchema(schema, partition.sourceOptions)
+    } else if (partition.sourceOptions.readAllColumnFamilies) {
+      new StructType().add("keyBytes", BinaryType, nullable = false)
     } else {
       SchemaUtil.getSchemaAsDataType(schema, "key").asInstanceOf[StructType]
     }
   }
 
   protected val valueSchema = if (stateVariableInfoOpt.isDefined) {
     schemaForValueRow
+  } else if (partition.sourceOptions.readAllColumnFamilies) {
+    new StructType().add("valueBytes", BinaryType, nullable = false)
   } else {
     SchemaUtil.getSchemaAsDataType(
       schema, "value").asInstanceOf[StructType]
@@ -237,6 +245,85 @@ class StatePartitionReader(
   }
 }
 
+/**
+ * An implementation of [[StatePartitionReaderBase]] for reading all column families
+ * in binary format. This reader returns raw key and value bytes along with column family names.
+ */
+class StatePartitionReaderAllColumnFamilies(
+    storeConf: StateStoreConf,
+    hadoopConf: SerializableConfiguration,
+    partition: StateStoreInputPartition,
+    schema: StructType)
+  extends StatePartitionReaderBase(storeConf, hadoopConf, partition, schema,
+    NoPrefixKeyStateEncoderSpec(new StructType()), None, None, None, None) {
+
+  val allStateStoreMetadata = {
+    new StateMetadataPartitionReader(
+      partition.sourceOptions.resolvedCpLocation,
+      new SerializableConfiguration(hadoopConf.value),
+      partition.sourceOptions.batchId).stateMetadata.toArray
+  }
+
+  private lazy val store: ReadStateStore = {
+    assert(getStartStoreUniqueId == getEndStoreUniqueId,
+      "Start and end store unique IDs must be the same when reading all column families")
+    provider.getReadStore(
+      partition.sourceOptions.batchId + 1,
+      getStartStoreUniqueId
+    )
+  }
+
+  val colFamilyNames: Seq[String] = {
+    // todo: Support operator with multiple column family names in next PR
+    Seq[String]()
+  }
+
+  override protected lazy val provider: StateStoreProvider = {
+    val stateStoreId = StateStoreId(partition.sourceOptions.stateCheckpointLocation.toString,
+      partition.sourceOptions.operatorId, partition.partition, partition.sourceOptions.storeName)
+    val stateStoreProviderId = StateStoreProviderId(stateStoreId, partition.queryId)
+
+    val keyStateEncoderSpec = NoPrefixKeyStateEncoderSpec(keySchema)
+    val provider = StateStoreProvider.createAndInit(
+      stateStoreProviderId, keySchema, valueSchema, keyStateEncoderSpec,
+      useColumnFamilies = colFamilyNames.nonEmpty, storeConf, hadoopConf.value, false, None)
+
+    provider
+  }
+
+  override lazy val iter: Iterator[InternalRow] = {
+    // Single store with column families (join v3, transformWithState, or simple operators)
+    require(store.isInstanceOf[SupportsRawBytesRead],
+      s"State store ${store.getClass.getName} does not support raw bytes reading")
+
+    val rawStore = store.asInstanceOf[SupportsRawBytesRead]
+    if (colFamilyNames.isEmpty) {
+      rawStore
+        .rawIterator()
+        .map { case (keyBytes, valueBytes) =>
+          SchemaUtil.unifyStateRowPairAsRawBytes(
+            partition.partition, keyBytes, valueBytes, StateStore.DEFAULT_COL_FAMILY_NAME)
+        }
+    } else {
+      colFamilyNames.iterator.flatMap { colFamilyName =>
+        rawStore
+          .rawIterator(colFamilyName)
+          .map { case (keyBytes, valueBytes) =>
+            SchemaUtil.unifyStateRowPairAsRawBytes(partition.partition,
+              keyBytes,
+              valueBytes,
+              colFamilyName)
+          }
+      }
+    }
+  }
+
+  override def close(): Unit = {
+    store.release()
+    super.close()
+  }
+}
+
 /**
  * An implementation of [[StatePartitionReaderBase]] for the readChangeFeed mode of State Data
  * Source. It reads the change of state over batches of a particular partition.
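
Reader note (not part of this diff): a sketch of what the rows produced by StatePartitionReaderAllColumnFamilies look like to a consumer. The checkpoint location is hypothetical; the column names come from the readAllColumnFamilies branch of SchemaUtil.getSourceSchema.

    // Illustrative only: inspect the raw rows returned in readAllColumnFamilies mode.
    spark.read
      .format("statestore")
      .option("readAllColumnFamilies", "true")
      .load("/tmp/checkpoints/stateful-query")   // hypothetical checkpoint location
      .collect()
      .foreach { row =>
        val partitionId = row.getAs[Int]("partition_id")
        val keyBytes = row.getAs[Array[Byte]]("key_bytes")
        val valueBytes = row.getAs[Array[Byte]]("value_bytes")
        val colFamily = row.getAs[String]("column_family_name")
        println(s"partition=$partitionId family=$colFamily " +
          s"key=${keyBytes.length}B value=${valueBytes.length}B")
      }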

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/state/utils/SchemaUtil.scala

Lines changed: 32 additions & 2 deletions
@@ -28,7 +28,8 @@ import org.apache.spark.sql.execution.datasources.v2.state.{StateDataSourceError
 import org.apache.spark.sql.execution.streaming.operators.stateful.transformwithstate.{StateVariableType, TransformWithStateVariableInfo}
 import org.apache.spark.sql.execution.streaming.operators.stateful.transformwithstate.StateVariableType._
 import org.apache.spark.sql.execution.streaming.state.{ReadStateStore, StateStoreColFamilySchema, UnsafeRowPair}
-import org.apache.spark.sql.types.{ArrayType, DataType, IntegerType, LongType, MapType, StringType, StructType}
+import org.apache.spark.sql.types.{ArrayType, BinaryType, DataType, IntegerType, LongType, MapType, StringType, StructType}
+import org.apache.spark.unsafe.types.UTF8String
 import org.apache.spark.util.ArrayImplicits._
 
 object SchemaUtil {
@@ -60,6 +61,12 @@ object SchemaUtil {
         .add("key", keySchema)
         .add("value", valueSchema)
         .add("partition_id", IntegerType)
+    } else if (sourceOptions.readAllColumnFamilies) {
+      new StructType()
+        .add("partition_id", IntegerType)
+        .add("key_bytes", BinaryType)
+        .add("value_bytes", BinaryType)
+        .add("column_family_name", StringType)
     } else {
       new StructType()
         .add("key", keySchema)
@@ -76,6 +83,24 @@
     row
   }
 
+  /**
+   * Creates a unified row from raw key and value bytes.
+   * This is an alias for unifyStateRowPairAsBytes that takes individual byte arrays
+   * instead of a tuple for better readability.
+   */
+  def unifyStateRowPairAsRawBytes(
+      partition: Int,
+      keyBytes: Array[Byte],
+      valueBytes: Array[Byte],
+      colFamilyName: String): InternalRow = {
+    val row = new GenericInternalRow(4)
+    row.update(0, partition)
+    row.update(1, keyBytes)
+    row.update(2, valueBytes)
+    row.update(3, UTF8String.fromString(colFamilyName))
+    row
+  }
+
   def unifyStateRowPairWithMultipleValues(
       pair: (UnsafeRow, GenericArrayData),
       partition: Int): InternalRow = {
@@ -231,7 +256,10 @@
     "user_map_key" -> classOf[StructType],
     "user_map_value" -> classOf[StructType],
     "expiration_timestamp_ms" -> classOf[LongType],
-    "partition_id" -> classOf[IntegerType])
+    "partition_id" -> classOf[IntegerType],
+    "key_bytes"->classOf[BinaryType],
+    "value_bytes"->classOf[BinaryType],
+    "column_family_name"->classOf[StringType])
 
   val expectedFieldNames = if (transformWithStateVariableInfoOpt.isDefined) {
     val stateVarInfo = transformWithStateVariableInfoOpt.get
@@ -272,6 +300,8 @@
       }
     } else if (sourceOptions.readChangeFeed) {
       Seq("batch_id", "change_type", "key", "value", "partition_id")
+    } else if (sourceOptions.readAllColumnFamilies) {
+      Seq("partition_id", "key_bytes", "value_bytes", "column_family_name")
     } else {
       Seq("key", "value", "partition_id")
     }
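
Consumer note (not part of this diff): the key_bytes and value_bytes columns carry whatever byte layout the underlying store uses. Assuming a store whose values are plain UnsafeRow bytes (an assumption; some encoders add version or prefix bytes), a consumer could reinterpret them roughly like this:

    import org.apache.spark.sql.catalyst.expressions.UnsafeRow

    // Sketch only: reinterpret value_bytes as an UnsafeRow with `numFields` columns,
    // assuming the provider stored plain UnsafeRow bytes (not guaranteed for every encoder).
    def decodeValueBytes(valueBytes: Array[Byte], numFields: Int): UnsafeRow = {
      val row = new UnsafeRow(numFields)
      row.pointTo(valueBytes, valueBytes.length)
      row
    }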

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStore.scala

Lines changed: 14 additions & 0 deletions
@@ -891,6 +891,20 @@ object StateStoreProvider extends Logging {
   }
 }
 
+/**
+ * Trait for state stores that support reading raw bytes without decoding.
+ * This is useful for copying state data during repartitioning
+ */
+trait SupportsRawBytesRead {
+  /**
+   * Returns an iterator of raw key-value bytes for a column family.
+   * @param colFamilyName the name of the column family to iterate over
+   * @return an iterator of (keyBytes, valueBytes) tuples
+   */
+  def rawIterator(colFamilyName: String = StateStore.DEFAULT_COL_FAMILY_NAME):
+    Iterator[(Array[Byte], Array[Byte])]
+}
+
 /**
  * This is an optional trait to be implemented by [[StateStoreProvider]]s that can read the change
  * of state store over batches. This is used by State Data Source with additional options like
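
Implementation note (not part of this diff): a minimal sketch of what satisfying the new trait could look like, using a hypothetical in-memory map keyed by column family name. Real providers (for example the RocksDB-backed store) would serve these pairs from their own storage and byte encoding.

    import org.apache.spark.sql.execution.streaming.state.{StateStore, SupportsRawBytesRead}

    // Hypothetical toy store used only to illustrate the rawIterator contract.
    class InMemoryRawBytesStore(
        data: Map[String, Seq[(Array[Byte], Array[Byte])]]) extends SupportsRawBytesRead {

      // Return the stored (keyBytes, valueBytes) pairs for the requested column family,
      // or an empty iterator if this toy store has no such family.
      override def rawIterator(colFamilyName: String): Iterator[(Array[Byte], Array[Byte])] =
        data.getOrElse(colFamilyName, Seq.empty).iterator
    }

    // Callers going through the trait still get the default column family:
    //   val store: SupportsRawBytesRead = new InMemoryRawBytesStore(Map.empty)
    //   store.rawIterator()   // uses StateStore.DEFAULT_COL_FAMILY_NAME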
