Commit 36db0b6

Author: Ubuntu
Commit message: address comment

1 parent 2b8cef3 commit 36db0b6

File tree: 6 files changed, +108 -222 lines changed

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/state/StateDataSource.scala

Lines changed: 38 additions & 38 deletions
@@ -66,38 +66,35 @@ class StateDataSource extends TableProvider with DataSourceRegister with Logging
     val sourceOptions = StateSourceOptions.modifySourceOptions(hadoopConf,
       StateSourceOptions.apply(session, hadoopConf, properties))
     val stateConf = buildStateStoreConf(sourceOptions.resolvedCpLocation, sourceOptions.batchId)
-    if (sourceOptions.readAllColumnFamilies) {
-      // For readAllColumnFamilies mode, we don't need specific encoder because it returns raw data
-      val keyStateEncoderSpec = NoPrefixKeyStateEncoderSpec(new StructType())
-      new StateTable(session, schema, sourceOptions, stateConf, keyStateEncoderSpec,
-        None, None, None, None)
-    } else {
-      val stateStoreReaderInfo: StateStoreReaderInfo = getStoreMetadataAndRunChecks(
-        sourceOptions)
+    if (sourceOptions.internalOnlyReadAllColumnFamilies
+      && !stateConf.providerClass.contains("RocksDB")) {
+      throw StateDataSourceErrors.invalidOptionValue(
+        StateSourceOptions.INTERNAL_ONLY_READ_ALL_COLUMN_FAMILIES,
+        "internalOnlyReadAllColumnFamilies is only supported with RocksDBStateStoreProvider. " +
+          s"Current provider: ${stateConf.providerClass}")
+    }
+    val stateStoreReaderInfo: StateStoreReaderInfo = getStoreMetadataAndRunChecks(
+      sourceOptions)
 
-      // The key state encoder spec should be available for all operators except stream-stream joins
-      val keyStateEncoderSpec = if (stateStoreReaderInfo.keyStateEncoderSpecOpt.isDefined) {
-        stateStoreReaderInfo.keyStateEncoderSpecOpt.get
-      } else {
-        val keySchema = SchemaUtil.getSchemaAsDataType(schema, "key").asInstanceOf[StructType]
-        NoPrefixKeyStateEncoderSpec(keySchema)
-      }
-      new StateTable(session, schema, sourceOptions, stateConf, keyStateEncoderSpec,
-        stateStoreReaderInfo.transformWithStateVariableInfoOpt,
-        stateStoreReaderInfo.stateStoreColFamilySchemaOpt,
-        stateStoreReaderInfo.stateSchemaProviderOpt,
-        stateStoreReaderInfo.joinColFamilyOpt)
+    // The key state encoder spec should be available for all operators except stream-stream joins
+    val keyStateEncoderSpec = if (stateStoreReaderInfo.keyStateEncoderSpecOpt.isDefined) {
+      stateStoreReaderInfo.keyStateEncoderSpecOpt.get
+    } else {
+      val keySchema = SchemaUtil.getSchemaAsDataType(schema, "key").asInstanceOf[StructType]
+      NoPrefixKeyStateEncoderSpec(keySchema)
     }
+
+    new StateTable(session, schema, sourceOptions, stateConf, keyStateEncoderSpec,
+      stateStoreReaderInfo.transformWithStateVariableInfoOpt,
+      stateStoreReaderInfo.stateStoreColFamilySchemaOpt,
+      stateStoreReaderInfo.stateSchemaProviderOpt,
+      stateStoreReaderInfo.joinColFamilyOpt)
   }
 
   override def inferSchema(options: CaseInsensitiveStringMap): StructType = {
     val sourceOptions = StateSourceOptions.modifySourceOptions(hadoopConf,
       StateSourceOptions.apply(session, hadoopConf, options))
-    if (sourceOptions.readAllColumnFamilies) {
-      // For readAllColumnFamilies mode, return the binary schema directly
-      return SchemaUtil.getSourceSchema(
-        sourceOptions, new StructType(), new StructType(), None, None)
-    }
+
     val stateStoreReaderInfo: StateStoreReaderInfo = getStoreMetadataAndRunChecks(
       sourceOptions)
     val oldSchemaFilePaths = StateDataSource.getOldSchemaFilePaths(sourceOptions, hadoopConf)

@@ -382,7 +379,7 @@ case class StateSourceOptions(
     stateVarName: Option[String],
     readRegisteredTimers: Boolean,
     flattenCollectionTypes: Boolean,
-    readAllColumnFamilies: Boolean,
+    internalOnlyReadAllColumnFamilies: Boolean,
     startOperatorStateUniqueIds: Option[Array[Array[String]]] = None,
     endOperatorStateUniqueIds: Option[Array[Array[String]]] = None) {
   def stateCheckpointLocation: Path = new Path(resolvedCpLocation, DIR_NAME_STATE)

@@ -392,7 +389,7 @@ case class StateSourceOptions(
       s"operatorId=$operatorId, storeName=$storeName, joinSide=$joinSide, " +
       s"stateVarName=${stateVarName.getOrElse("None")}, +" +
       s"flattenCollectionTypes=$flattenCollectionTypes" +
-      s"readAllColumnFamilies=$readAllColumnFamilies"
+      s"internalOnlyReadAllColumnFamilies=$internalOnlyReadAllColumnFamilies"
     if (fromSnapshotOptions.isDefined) {
       desc += s", snapshotStartBatchId=${fromSnapshotOptions.get.snapshotStartBatchId}"
       desc += s", snapshotPartitionId=${fromSnapshotOptions.get.snapshotPartitionId}"

@@ -419,7 +416,7 @@ object StateSourceOptions extends DataSourceOptions {
   val STATE_VAR_NAME = newOption("stateVarName")
   val READ_REGISTERED_TIMERS = newOption("readRegisteredTimers")
   val FLATTEN_COLLECTION_TYPES = newOption("flattenCollectionTypes")
-  val READ_ALL_COLUMN_FAMILIES = newOption("readAllColumnFamilies")
+  val INTERNAL_ONLY_READ_ALL_COLUMN_FAMILIES = newOption("internalOnlyReadAllColumnFamilies")
 
   object JoinSideValues extends Enumeration {
     type JoinSideValues = Value

@@ -505,25 +502,28 @@ object StateSourceOptions extends DataSourceOptions {
 
     val readChangeFeed = Option(options.get(READ_CHANGE_FEED)).exists(_.toBoolean)
 
-    val readAllColumnFamilies = try {
-      Option(options.get(READ_ALL_COLUMN_FAMILIES))
+    val internalOnlyReadAllColumnFamilies = try {
+      Option(options.get(INTERNAL_ONLY_READ_ALL_COLUMN_FAMILIES))
        .map(_.toBoolean).getOrElse(false)
    } catch {
      case _: IllegalArgumentException =>
-        throw StateDataSourceErrors.invalidOptionValue(READ_ALL_COLUMN_FAMILIES,
+        throw StateDataSourceErrors.invalidOptionValue(INTERNAL_ONLY_READ_ALL_COLUMN_FAMILIES,
          "Boolean value is expected")
    }
 
-    if (readAllColumnFamilies && stateVarName.isDefined) {
-      throw StateDataSourceErrors.conflictOptions(Seq(READ_ALL_COLUMN_FAMILIES, STATE_VAR_NAME))
+    if (internalOnlyReadAllColumnFamilies && stateVarName.isDefined) {
+      throw StateDataSourceErrors.conflictOptions(
+        Seq(INTERNAL_ONLY_READ_ALL_COLUMN_FAMILIES, STATE_VAR_NAME))
     }
 
-    if (readAllColumnFamilies && joinSide != JoinSideValues.none) {
-      throw StateDataSourceErrors.conflictOptions(Seq(READ_ALL_COLUMN_FAMILIES, JOIN_SIDE))
+    if (internalOnlyReadAllColumnFamilies && joinSide != JoinSideValues.none) {
+      throw StateDataSourceErrors.conflictOptions(
+        Seq(INTERNAL_ONLY_READ_ALL_COLUMN_FAMILIES, JOIN_SIDE))
     }
 
-    if (readAllColumnFamilies && readChangeFeed) {
-      throw StateDataSourceErrors.conflictOptions(Seq(READ_ALL_COLUMN_FAMILIES, READ_CHANGE_FEED))
+    if (internalOnlyReadAllColumnFamilies && readChangeFeed) {
+      throw StateDataSourceErrors.conflictOptions(
+        Seq(INTERNAL_ONLY_READ_ALL_COLUMN_FAMILIES, READ_CHANGE_FEED))
     }
 
     val changeStartBatchId = Option(options.get(CHANGE_START_BATCH_ID)).map(_.toLong)

@@ -650,7 +650,7 @@ object StateSourceOptions extends DataSourceOptions {
       resolvedCpLocation, batchId.get, operatorId, storeName, joinSide,
       readChangeFeed, fromSnapshotOptions, readChangeFeedOptions,
       stateVarName, readRegisteredTimers, flattenCollectionTypes,
-      readAllColumnFamilies, startOperatorStateUniqueIds, endOperatorStateUniqueIds)
+      internalOnlyReadAllColumnFamilies, startOperatorStateUniqueIds, endOperatorStateUniqueIds)
   }
 
   private def getLastCommittedBatch(session: SparkSession, checkpointLocation: String): Long = {
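
For orientation, a minimal usage sketch of the renamed option. The checkpoint path and the SparkSession are assumptions, not part of this commit; "statestore" is the data source's registered short name, and per the new guard the option only works with RocksDBStateStoreProvider:

// Sketch only: assumes a SparkSession named spark and an existing streaming checkpoint
// written with RocksDBStateStoreProvider. The path below is a hypothetical location.
val rawState = spark.read
  .format("statestore")
  .option("path", "/tmp/streaming-checkpoint")
  .option("internalOnlyReadAllColumnFamilies", "true")
  .load()

rawState.printSchema()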

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/state/StatePartitionReader.scala

Lines changed: 12 additions & 61 deletions
@@ -20,7 +20,6 @@ import org.apache.spark.internal.Logging
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeRow}
 import org.apache.spark.sql.connector.read.{InputPartition, PartitionReader, PartitionReaderFactory}
-import org.apache.spark.sql.execution.datasources.v2.state.metadata.StateMetadataPartitionReader
 import org.apache.spark.sql.execution.datasources.v2.state.utils.SchemaUtil
 import org.apache.spark.sql.execution.streaming.operators.stateful.join.SymmetricHashJoinStateManager
 import org.apache.spark.sql.execution.streaming.operators.stateful.transformwithstate.{StateVariableType, TransformWithStateVariableInfo}

@@ -50,9 +49,9 @@ class StatePartitionReaderFactory(
 
   override def createReader(partition: InputPartition): PartitionReader[InternalRow] = {
     val stateStoreInputPartition = partition.asInstanceOf[StateStoreInputPartition]
-    if (stateStoreInputPartition.sourceOptions.readAllColumnFamilies) {
+    if (stateStoreInputPartition.sourceOptions.internalOnlyReadAllColumnFamilies) {
       new StatePartitionReaderAllColumnFamilies(storeConf, hadoopConf,
-        stateStoreInputPartition, schema)
+        stateStoreInputPartition, schema, keyStateEncoderSpec)
     } else if (stateStoreInputPartition.sourceOptions.readChangeFeed) {
       new StateStoreChangeDataPartitionReader(storeConf, hadoopConf,
         stateStoreInputPartition, schema, keyStateEncoderSpec, stateVariableInfoOpt,

@@ -85,15 +84,15 @@ abstract class StatePartitionReaderBase(
   private val schemaForValueRow: StructType =
     StructType(Array(StructField("__dummy__", NullType)))
 
-  protected lazy val keySchema = {
+  protected val keySchema = {
     if (SchemaUtil.checkVariableType(stateVariableInfoOpt, StateVariableType.MapState)) {
       SchemaUtil.getCompositeKeySchema(schema, partition.sourceOptions)
     } else {
       SchemaUtil.getSchemaAsDataType(schema, "key").asInstanceOf[StructType]
     }
   }
 
-  protected lazy val valueSchema = if (stateVariableInfoOpt.isDefined) {
+  protected val valueSchema = if (stateVariableInfoOpt.isDefined) {
     schemaForValueRow
   } else {
     SchemaUtil.getSchemaAsDataType(

@@ -249,16 +248,10 @@ class StatePartitionReaderAllColumnFamilies(
     storeConf: StateStoreConf,
     hadoopConf: SerializableConfiguration,
     partition: StateStoreInputPartition,
-    schema: StructType)
+    schema: StructType,
+    keyStateEncoderSpec: KeyStateEncoderSpec)
   extends StatePartitionReaderBase(storeConf, hadoopConf, partition, schema,
-    NoPrefixKeyStateEncoderSpec(new StructType()), None, None, None, None) {
-
-  val allStateStoreMetadata = {
-    new StateMetadataPartitionReader(
-      partition.sourceOptions.resolvedCpLocation,
-      new SerializableConfiguration(hadoopConf.value),
-      partition.sourceOptions.batchId).stateMetadata.toArray
-  }
+    keyStateEncoderSpec, None, None, None, None) {
 
   private lazy val store: ReadStateStore = {
     assert(getStartStoreUniqueId == getEndStoreUniqueId,

@@ -269,56 +262,14 @@ class StatePartitionReaderAllColumnFamilies(
     )
   }
 
-  val colFamilyNames: Seq[String] = {
-    // todo: Support operator with multiple column family names in next PR
-    Seq[String]()
-  }
-
-  override protected lazy val provider: StateStoreProvider = {
-    val stateStoreId = StateStoreId(partition.sourceOptions.stateCheckpointLocation.toString,
-      partition.sourceOptions.operatorId, partition.partition, partition.sourceOptions.storeName)
-    val stateStoreProviderId = StateStoreProviderId(stateStoreId, partition.queryId)
-
-    // Disable format validation when reading raw bytes.
-    // We use binary schemas (keyBytes/valueBytes) which don't match the actual schema
-    // of the stored data. Validation would fail in HDFSBackedStateStoreProvider when
-    // loading data from disk, so we disable it for raw bytes mode.
-    val modifiedStoreConf = storeConf.withFormatValidationDisabled()
-
-    val keyStateEncoderSpec = NoPrefixKeyStateEncoderSpec(new StructType())
-    // Pass in empty keySchema, valueSchema and dummy encoder because we don't encode any data
-    val provider = StateStoreProvider.createAndInit(
-      stateStoreProviderId, new StructType(), new StructType(), keyStateEncoderSpec,
-      useColumnFamilies = colFamilyNames.nonEmpty, modifiedStoreConf, hadoopConf.value, false, None)
-
-    provider
-  }
-
   override lazy val iter: Iterator[InternalRow] = {
     // Single store with column families (join v3, transformWithState, or simple operators)
-    require(store.isInstanceOf[SupportsRawBytesRead],
-      s"State store ${store.getClass.getName} does not support raw bytes reading")
-
-    val rawStore = store.asInstanceOf[SupportsRawBytesRead]
-    if (colFamilyNames.isEmpty) {
-      rawStore
-        .rawIterator()
-        .map { case (keyBytes, valueBytes) =>
-          SchemaUtil.unifyStateRowPairAsRawBytes(
-            partition.partition, keyBytes, valueBytes, StateStore.DEFAULT_COL_FAMILY_NAME)
-        }
-    } else {
-      colFamilyNames.iterator.flatMap { colFamilyName =>
-        rawStore
-          .rawIterator(colFamilyName)
-          .map { case (keyBytes, valueBytes) =>
-            SchemaUtil.unifyStateRowPairAsRawBytes(partition.partition,
-              keyBytes,
-              valueBytes,
-              colFamilyName)
-          }
+    store
+      .iterator()
+      .map { pair =>
+        SchemaUtil.unifyStateRowPairAsRawBytes(
+          (pair.key, pair.value), StateStore.DEFAULT_COL_FAMILY_NAME)
       }
-    }
   }
 
   override def close(): Unit = {
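
With the SupportsRawBytesRead trait gone, the reader relies on the fact that an UnsafeRow already carries its serialized form. A minimal sketch of that idea (the helper name is illustrative, not part of this commit):

import org.apache.spark.sql.catalyst.expressions.UnsafeRow

// Illustrative helper: given the key/value UnsafeRow pair returned by
// ReadStateStore.iterator(), the raw serialized bytes are recovered with
// UnsafeRow.getBytes, which is what unifyStateRowPairAsRawBytes uses.
def toRawBytes(pair: (UnsafeRow, UnsafeRow)): (Array[Byte], Array[Byte]) =
  (pair._1.getBytes, pair._2.getBytes)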

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/state/utils/SchemaUtil.scala

Lines changed: 16 additions & 11 deletions
@@ -61,12 +61,17 @@ object SchemaUtil {
         .add("key", keySchema)
         .add("value", valueSchema)
         .add("partition_id", IntegerType)
-    } else if (sourceOptions.readAllColumnFamilies) {
+    } else if (sourceOptions.internalOnlyReadAllColumnFamilies) {
       new StructType()
-        .add("partition_id", IntegerType)
+        // todo: change this to some more specific type after we
+        // can extract partition key from keySchema
+        .add("partition_key", keySchema)
         .add("key_bytes", BinaryType)
         .add("value_bytes", BinaryType)
         .add("column_family_name", StringType)
+        // need key and value schema so that state store can encode data
+        .add("value", valueSchema)
+        .add("key", keySchema)
     } else {
       new StructType()
         .add("key", keySchema)

@@ -89,15 +94,14 @@ object SchemaUtil {
    * instead of a tuple for better readability.
    */
   def unifyStateRowPairAsRawBytes(
-      partition: Int,
-      keyBytes: Array[Byte],
-      valueBytes: Array[Byte],
+      pair: (UnsafeRow, UnsafeRow),
       colFamilyName: String): InternalRow = {
-    val row = new GenericInternalRow(4)
-    row.update(0, partition)
-    row.update(1, keyBytes)
-    row.update(2, valueBytes)
+    val row = new GenericInternalRow(6)
+    row.update(0, pair._1)
+    row.update(1, pair._1.getBytes)
+    row.update(2, pair._2.getBytes)
     row.update(3, UTF8String.fromString(colFamilyName))
+    // row.update(4, pair._2)
     row
   }
 

@@ -257,6 +261,7 @@ object SchemaUtil {
     "user_map_value" -> classOf[StructType],
     "expiration_timestamp_ms" -> classOf[LongType],
     "partition_id" -> classOf[IntegerType],
+    "partition_key" -> classOf[StructType],
     "key_bytes"->classOf[BinaryType],
     "value_bytes"->classOf[BinaryType],
     "column_family_name"->classOf[StringType])

@@ -300,8 +305,8 @@ object SchemaUtil {
       }
     } else if (sourceOptions.readChangeFeed) {
       Seq("batch_id", "change_type", "key", "value", "partition_id")
-    } else if (sourceOptions.readAllColumnFamilies) {
-      Seq("partition_id", "key_bytes", "value_bytes", "column_family_name")
+    } else if (sourceOptions.internalOnlyReadAllColumnFamilies) {
+      Seq("partition_key", "key_bytes", "value_bytes", "column_family_name", "value", "key")
     } else {
       Seq("key", "value", "partition_id")
     }
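
For reference, the source schema this mode now exposes, spelled out as a plain StructType sketch. keySchema and valueSchema are placeholders for the operator's actual key and value schemas; the field order follows the Seq above:

import org.apache.spark.sql.types._

// Sketch of the internalOnlyReadAllColumnFamilies source schema as built in getSourceSchema.
def allColumnFamiliesSchema(keySchema: StructType, valueSchema: StructType): StructType =
  new StructType()
    .add("partition_key", keySchema)
    .add("key_bytes", BinaryType)
    .add("value_bytes", BinaryType)
    .add("column_family_name", StringType)
    .add("value", valueSchema)
    .add("key", keySchema)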

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStore.scala

Lines changed: 0 additions & 14 deletions
@@ -891,20 +891,6 @@ object StateStoreProvider extends Logging {
   }
 }
 
-/**
- * Trait for state stores that support reading raw bytes without decoding.
- * This is useful for copying state data during repartitioning
- */
-trait SupportsRawBytesRead {
-  /**
-   * Returns an iterator of raw key-value bytes for a column family.
-   * @param colFamilyName the name of the column family to iterate over
-   * @return an iterator of (keyBytes, valueBytes) tuples
-   */
-  def rawIterator(colFamilyName: String = StateStore.DEFAULT_COL_FAMILY_NAME):
-    Iterator[(Array[Byte], Array[Byte])]
-}
-
 /**
  * This is an optional trait to be implemented by [[StateStoreProvider]]s that can read the change
  * of state store over batches. This is used by State Data Source with additional options like

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreConf.scala

Lines changed: 0 additions & 20 deletions
@@ -163,26 +163,6 @@ class StateStoreConf(
    */
   val sqlConfs: Map[String, String] =
     sqlConf.getAllConfs.filter(_._1.startsWith("spark.sql.streaming.stateStore."))
-
-  /**
-   * Creates a copy of this StateStoreConf with format validation disabled.
-   * This is useful when reading raw bytes where the schema used (binary) doesn't match
-   * the actual stored data schema.
-   */
-  def withFormatValidationDisabled(): StateStoreConf = {
-    val reconstructedSqlConf = {
-      // Reconstruct a SQLConf with the all settings preserved because sqlConf is transient
-      val conf = new SQLConf()
-      // Restore all state store related settings
-      sqlConfs.foreach { case (key, value) =>
-        conf.setConfString(key, value)
-      }
-      conf
-    }
-    new StateStoreConf(reconstructedSqlConf, extraOptions) {
-      override val formatValidationEnabled: Boolean = false
-    }
-  }
 }
 
 object StateStoreConf {

0 commit comments
