
Commit 42540b1

Author: Ubuntu (committed)
Commit message: address comment
1 parent 69f5b19; commit 42540b1

7 files changed: +60, -222 lines changed


sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/state/StateDataSource.scala

Lines changed: 14 additions & 25 deletions

@@ -65,38 +65,27 @@ class StateDataSource extends TableProvider with DataSourceRegister with Logging
     val sourceOptions = StateSourceOptions.modifySourceOptions(hadoopConf,
       StateSourceOptions.apply(session, hadoopConf, properties))
     val stateConf = buildStateStoreConf(sourceOptions.resolvedCpLocation, sourceOptions.batchId)
-    if (sourceOptions.readAllColumnFamilies) {
-      // For readAllColumnFamilies mode, we don't need specific encoder because it returns raw data
-      val keyStateEncoderSpec = NoPrefixKeyStateEncoderSpec(new StructType())
-      new StateTable(session, schema, sourceOptions, stateConf, keyStateEncoderSpec,
-        None, None, None, None)
-    } else {
-      val stateStoreReaderInfo: StateStoreReaderInfo = getStoreMetadataAndRunChecks(
-        sourceOptions)
+    val stateStoreReaderInfo: StateStoreReaderInfo = getStoreMetadataAndRunChecks(
+      sourceOptions)
 
-      // The key state encoder spec should be available for all operators except stream-stream joins
-      val keyStateEncoderSpec = if (stateStoreReaderInfo.keyStateEncoderSpecOpt.isDefined) {
-        stateStoreReaderInfo.keyStateEncoderSpecOpt.get
-      } else {
-        val keySchema = SchemaUtil.getSchemaAsDataType(schema, "key").asInstanceOf[StructType]
-        NoPrefixKeyStateEncoderSpec(keySchema)
-      }
-      new StateTable(session, schema, sourceOptions, stateConf, keyStateEncoderSpec,
-        stateStoreReaderInfo.transformWithStateVariableInfoOpt,
-        stateStoreReaderInfo.stateStoreColFamilySchemaOpt,
-        stateStoreReaderInfo.stateSchemaProviderOpt,
-        stateStoreReaderInfo.joinColFamilyOpt)
+    // The key state encoder spec should be available for all operators except stream-stream joins
+    val keyStateEncoderSpec = if (stateStoreReaderInfo.keyStateEncoderSpecOpt.isDefined) {
+      stateStoreReaderInfo.keyStateEncoderSpecOpt.get
+    } else {
+      val keySchema = SchemaUtil.getSchemaAsDataType(schema, "key").asInstanceOf[StructType]
+      NoPrefixKeyStateEncoderSpec(keySchema)
     }
+
+    new StateTable(session, schema, sourceOptions, stateConf, keyStateEncoderSpec,
+      stateStoreReaderInfo.transformWithStateVariableInfoOpt,
+      stateStoreReaderInfo.stateStoreColFamilySchemaOpt,
+      stateStoreReaderInfo.stateSchemaProviderOpt,
+      stateStoreReaderInfo.joinColFamilyOpt)
   }
 
   override def inferSchema(options: CaseInsensitiveStringMap): StructType = {
     val sourceOptions = StateSourceOptions.modifySourceOptions(hadoopConf,
       StateSourceOptions.apply(session, hadoopConf, options))
-    if (sourceOptions.readAllColumnFamilies) {
-      // For readAllColumnFamilies mode, return the binary schema directly
-      return SchemaUtil.getSourceSchema(
-        sourceOptions, new StructType(), new StructType(), None, None)
-    }
     val stateStoreReaderInfo: StateStoreReaderInfo = getStoreMetadataAndRunChecks(
       sourceOptions)
     val oldSchemaFilePaths = StateDataSource.getOldSchemaFilePaths(sourceOptions, hadoopConf)
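
For orientation, the sketch below shows roughly how this data source is driven from user code. The "statestore" format name and the path/operatorId options already exist in the State Data Source; the option key used for the new mode (written here as "readAllColumnFamilies") is an assumption based on the flag name in StateSourceOptions and is not confirmed by this diff.

    // Hedged usage sketch: the "readAllColumnFamilies" option key is assumed.
    val stateDf = spark.read
      .format("statestore")
      .option("path", "/tmp/checkpoints/my-query")  // streaming checkpoint location
      .option("operatorId", "0")                    // which stateful operator to read
      .option("readAllColumnFamilies", "true")      // assumed key for the raw-bytes mode
      .load()

    // After this commit the rows carry partition_key, key_bytes, value_bytes,
    // column_family_name and value (see the SchemaUtil change below).
    stateDf.printSchema()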

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/state/StatePartitionReader.scala

Lines changed: 12 additions & 64 deletions

@@ -20,7 +20,6 @@ import org.apache.spark.internal.Logging
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeRow}
 import org.apache.spark.sql.connector.read.{InputPartition, PartitionReader, PartitionReaderFactory}
-import org.apache.spark.sql.execution.datasources.v2.state.metadata.StateMetadataPartitionReader
 import org.apache.spark.sql.execution.datasources.v2.state.utils.SchemaUtil
 import org.apache.spark.sql.execution.streaming.operators.stateful.join.SymmetricHashJoinStateManager
 import org.apache.spark.sql.execution.streaming.operators.stateful.transformwithstate.{StateVariableType, TransformWithStateVariableInfo}
@@ -52,7 +51,7 @@ class StatePartitionReaderFactory(
     val stateStoreInputPartition = partition.asInstanceOf[StateStoreInputPartition]
     if (stateStoreInputPartition.sourceOptions.readAllColumnFamilies) {
       new StatePartitionReaderAllColumnFamilies(storeConf, hadoopConf,
-        stateStoreInputPartition, schema)
+        stateStoreInputPartition, schema, keyStateEncoderSpec)
     } else if (stateStoreInputPartition.sourceOptions.readChangeFeed) {
       new StateStoreChangeDataPartitionReader(storeConf, hadoopConf,
         stateStoreInputPartition, schema, keyStateEncoderSpec, stateVariableInfoOpt,
@@ -88,12 +87,14 @@ abstract class StatePartitionReaderBase(
   protected lazy val keySchema = {
     if (SchemaUtil.checkVariableType(stateVariableInfoOpt, StateVariableType.MapState)) {
       SchemaUtil.getCompositeKeySchema(schema, partition.sourceOptions)
+    } else if (partition.sourceOptions.readAllColumnFamilies) {
+      SchemaUtil.getSchemaAsDataType(schema, "partition_key").asInstanceOf[StructType]
     } else {
       SchemaUtil.getSchemaAsDataType(schema, "key").asInstanceOf[StructType]
     }
   }
 
-  protected lazy val valueSchema = if (stateVariableInfoOpt.isDefined) {
+  protected val valueSchema = if (stateVariableInfoOpt.isDefined) {
     schemaForValueRow
   } else {
     SchemaUtil.getSchemaAsDataType(
@@ -249,16 +250,10 @@ class StatePartitionReaderAllColumnFamilies(
     storeConf: StateStoreConf,
     hadoopConf: SerializableConfiguration,
     partition: StateStoreInputPartition,
-    schema: StructType)
+    schema: StructType,
+    keyStateEncoderSpec: KeyStateEncoderSpec)
   extends StatePartitionReaderBase(storeConf, hadoopConf, partition, schema,
-    NoPrefixKeyStateEncoderSpec(new StructType()), None, None, None, None) {
-
-  val allStateStoreMetadata = {
-    new StateMetadataPartitionReader(
-      partition.sourceOptions.resolvedCpLocation,
-      new SerializableConfiguration(hadoopConf.value),
-      partition.sourceOptions.batchId).stateMetadata.toArray
-  }
+    keyStateEncoderSpec, None, None, None, None) {
 
   private lazy val store: ReadStateStore = {
     assert(getStartStoreUniqueId == getEndStoreUniqueId,
@@ -269,61 +264,14 @@ class StatePartitionReaderAllColumnFamilies(
     )
   }
 
-  val colFamilyNames: Seq[String] = {
-    // todo: Support operator with multiple column family names in next PR
-    Seq[String]()
-  }
-
-  override protected lazy val provider: StateStoreProvider = {
-    val stateStoreId = StateStoreId(partition.sourceOptions.stateCheckpointLocation.toString,
-      partition.sourceOptions.operatorId, partition.partition, partition.sourceOptions.storeName)
-    val stateStoreProviderId = StateStoreProviderId(stateStoreId, partition.queryId)
-
-    // Disable format validation when reading raw bytes.
-    // We use binary schemas (keyBytes/valueBytes) which don't match the actual schema
-    // of the stored data. Validation would fail in HDFSBackedStateStoreProvider when
-    // loading data from disk, so we disable it for raw bytes mode.
-    val modifiedStoreConf = storeConf.withFormatValidationDisabled()
-
-    val keyStateEncoderSpec = NoPrefixKeyStateEncoderSpec(new StructType())
-    // Pass in empty keySchema, valueSchema and dummy encoder because we don't encode any data
-    val provider = StateStoreProvider.createAndInit(
-      stateStoreProviderId, new StructType(), new StructType(), keyStateEncoderSpec,
-      useColumnFamilies = colFamilyNames.nonEmpty, modifiedStoreConf, hadoopConf.value, false, None)
-
-    provider
-  }
-
   override lazy val iter: Iterator[InternalRow] = {
     // Single store with column families (join v3, transformWithState, or simple operators)
-    require(store.isInstanceOf[SupportsRawBytesRead],
-      s"State store ${store.getClass.getName} does not support raw bytes reading")
-
-    val rawStore = store.asInstanceOf[SupportsRawBytesRead]
-    if (colFamilyNames.isEmpty) {
-      rawStore
-        .rawIterator()
-        .map { case (keyBytes, valueBytes) =>
-          SchemaUtil.unifyStateRowPairAsRawBytes(
-            partition.partition, keyBytes, valueBytes, StateStore.DEFAULT_COL_FAMILY_NAME)
-        }
-    } else {
-      colFamilyNames.iterator.flatMap { colFamilyName =>
-        rawStore
-          .rawIterator(colFamilyName)
-          .map { case (keyBytes, valueBytes) =>
-            SchemaUtil.unifyStateRowPairAsRawBytes(partition.partition,
-              keyBytes,
-              valueBytes,
-              colFamilyName)
-          }
+    store
+      .iterator()
+      .map { pair =>
+        SchemaUtil.unifyStateRowPairAsRawBytes(
+          (pair.key, pair.value), StateStore.DEFAULT_COL_FAMILY_NAME)
       }
-    }
-  }
-
-  override def close(): Unit = {
-    store.release()
-    super.close()
   }
 }
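
Stripped of the partition and provider plumbing, the reader's new per-partition loop amounts to the sketch below. The types and the SchemaUtil helper are taken from this diff; the standalone method is illustrative rather than a drop-in replacement for the class above, and like the new code it only covers the default column family.

    import org.apache.spark.sql.catalyst.InternalRow
    import org.apache.spark.sql.execution.datasources.v2.state.utils.SchemaUtil
    import org.apache.spark.sql.execution.streaming.state.{ReadStateStore, StateStore}

    // Walk the store's regular iterator and re-encode every UnsafeRowPair into the
    // unified raw-bytes row for the default column family.
    def readDefaultColumnFamily(store: ReadStateStore): Iterator[InternalRow] = {
      store.iterator().map { pair =>
        SchemaUtil.unifyStateRowPairAsRawBytes(
          (pair.key, pair.value), StateStore.DEFAULT_COL_FAMILY_NAME)
      }
    }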

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/state/utils/SchemaUtil.scala

Lines changed: 10 additions & 9 deletions

@@ -63,10 +63,11 @@ object SchemaUtil {
         .add("partition_id", IntegerType)
     } else if (sourceOptions.readAllColumnFamilies) {
       new StructType()
-        .add("partition_id", IntegerType)
+        .add("partition_key", keySchema)
         .add("key_bytes", BinaryType)
         .add("value_bytes", BinaryType)
         .add("column_family_name", StringType)
+        .add("value", valueSchema)
     } else {
       new StructType()
         .add("key", keySchema)
@@ -89,15 +90,14 @@ object SchemaUtil {
    * instead of a tuple for better readability.
    */
   def unifyStateRowPairAsRawBytes(
-      partition: Int,
-      keyBytes: Array[Byte],
-      valueBytes: Array[Byte],
+      pair: (UnsafeRow, UnsafeRow),
       colFamilyName: String): InternalRow = {
-    val row = new GenericInternalRow(4)
-    row.update(0, partition)
-    row.update(1, keyBytes)
-    row.update(2, valueBytes)
+    val row = new GenericInternalRow(5)
+    row.update(0, pair._1)
+    row.update(1, pair._1.getBytes)
+    row.update(2, pair._2.getBytes)
     row.update(3, UTF8String.fromString(colFamilyName))
+    row.update(4, pair._2)
     row
   }
 
@@ -257,6 +257,7 @@ object SchemaUtil {
     "user_map_value" -> classOf[StructType],
     "expiration_timestamp_ms" -> classOf[LongType],
     "partition_id" -> classOf[IntegerType],
+    "partition_key" -> classOf[StructType],
     "key_bytes"->classOf[BinaryType],
     "value_bytes"->classOf[BinaryType],
     "column_family_name"->classOf[StringType])
@@ -301,7 +302,7 @@ object SchemaUtil {
     } else if (sourceOptions.readChangeFeed) {
       Seq("batch_id", "change_type", "key", "value", "partition_id")
     } else if (sourceOptions.readAllColumnFamilies) {
-      Seq("partition_id", "key_bytes", "value_bytes", "column_family_name")
+      Seq("partition_key", "key_bytes", "value_bytes", "column_family_name", "value")
     } else {
       Seq("key", "value", "partition_id")
     }
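
Pulling the two changed pieces together, here is a condensed sketch of the readAllColumnFamilies schema and the row builder it pairs with. Field names and ordinals are exactly those in the diff; keySchema and valueSchema stand in for the operator's real key and value types, and the helper names are illustrative.

    import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeRow}
    import org.apache.spark.sql.types._
    import org.apache.spark.unsafe.types.UTF8String

    // Output schema for readAllColumnFamilies after this commit.
    def rawBytesSchema(keySchema: StructType, valueSchema: StructType): StructType =
      new StructType()
        .add("partition_key", keySchema)
        .add("key_bytes", BinaryType)
        .add("value_bytes", BinaryType)
        .add("column_family_name", StringType)
        .add("value", valueSchema)

    // Mirror of unifyStateRowPairAsRawBytes: ordinal i of the row is field i above.
    def toRawBytesRow(pair: (UnsafeRow, UnsafeRow), colFamilyName: String): GenericInternalRow = {
      val row = new GenericInternalRow(5)
      row.update(0, pair._1)                               // partition_key (the key row itself)
      row.update(1, pair._1.getBytes)                      // key_bytes
      row.update(2, pair._2.getBytes)                      // value_bytes
      row.update(3, UTF8String.fromString(colFamilyName))  // column_family_name
      row.update(4, pair._2)                               // value (the value row itself)
      row
    }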

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/HDFSBackedStateStoreProvider.scala

Lines changed: 2 additions & 20 deletions

@@ -75,7 +75,7 @@ private[sql] class HDFSBackedStateStoreProvider extends StateStoreProvider with
   private val providerName = "HDFSBackedStateStoreProvider"
 
   class HDFSBackedReadStateStore(val version: Long, map: HDFSBackedStateStoreMap)
-    extends ReadStateStore with SupportsRawBytesRead {
+    extends ReadStateStore {
 
     override def id: StateStoreId = HDFSBackedStateStoreProvider.this.stateStoreId
 
@@ -104,22 +104,13 @@ private[sql] class HDFSBackedStateStoreProvider extends StateStoreProvider with
     override def valuesIterator(key: UnsafeRow, colFamilyName: String): Iterator[UnsafeRow] = {
       throw StateStoreErrors.unsupportedOperationException("multipleValuesPerKey", "HDFSStateStore")
     }
-
-    override def rawIterator(colFamilyName: String): Iterator[(Array[Byte], Array[Byte])] = {
-      // For HDFS, we get UnsafeRows and convert them to bytes
-      // The bytes will be properly aligned since they come from valid UnsafeRows
-      map.iterator().map { pair =>
-        (pair.key.getBytes(), pair.value.getBytes())
-      }
-    }
   }
 
   /** Implementation of [[StateStore]] API which is backed by an HDFS-compatible file system */
   class HDFSBackedStateStore(
       val version: Long,
       private val mapToUpdate: HDFSBackedStateStoreMap,
-      shouldForceSnapshot: Boolean = false)
-    extends StateStore with SupportsRawBytesRead {
+      shouldForceSnapshot: Boolean = false) extends StateStore {
 
     /** Trait and classes representing the internal state of the store */
     trait STATE
@@ -247,15 +238,6 @@ private[sql] class HDFSBackedStateStoreProvider extends StateStoreProvider with
       new StateStoreIterator(iter)
     }
 
-    override def rawIterator(colFamilyName: String): Iterator[(Array[Byte], Array[Byte])] = {
-      assertUseOfDefaultColFamily(colFamilyName)
-      // For HDFS, we get UnsafeRows and convert them to bytes
-      // The bytes will be properly aligned since they come from valid UnsafeRows
-      mapToUpdate.iterator().map { pair =>
-        (pair.key.getBytes(), pair.value.getBytes())
-      }
-    }
-
     override def prefixScan(
         prefixKey: UnsafeRow,
         colFamilyName: String): StateStoreIterator[UnsafeRowPair] = {
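
The deleted rawIterator overrides were thin wrappers: they walked the same map iterator and called getBytes on each UnsafeRow. A caller that still needs raw bytes from an HDFS-backed store can do the equivalent conversion on the public iterator, roughly as sketched here (an illustration, not an API this commit adds):

    import org.apache.spark.sql.execution.streaming.state.ReadStateStore

    // Equivalent of the removed rawIterator, expressed against the public read API.
    def rawBytesIterator(store: ReadStateStore): Iterator[(Array[Byte], Array[Byte])] =
      store.iterator().map { pair =>
        (pair.key.getBytes(), pair.value.getBytes())
      }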

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/RocksDBStateStoreProvider.scala

Lines changed: 1 addition & 17 deletions

@@ -47,8 +47,7 @@ private[sql] class RocksDBStateStoreProvider
       lastVersion: Long,
       private[RocksDBStateStoreProvider] val stamp: Long,
       private[RocksDBStateStoreProvider] var readOnly: Boolean,
-      private[RocksDBStateStoreProvider] var forceSnapshotOnCommit: Boolean) extends StateStore
-    with SupportsRawBytesRead {
+      private[RocksDBStateStoreProvider] var forceSnapshotOnCommit: Boolean) extends StateStore {
 
     private sealed trait OPERATION
     private case object UPDATE extends OPERATION
@@ -420,21 +419,6 @@ private[sql] class RocksDBStateStoreProvider
       }
     }
 
-    override def rawIterator(colFamilyName: String): Iterator[(Array[Byte], Array[Byte])] = {
-      validateAndTransitionState(UPDATE)
-      verifyColFamilyOperations("rawIterator", colFamilyName)
-
-      if (useColumnFamilies) {
-        rocksDB.iterator(colFamilyName).map { pair =>
-          (pair.key, pair.value)
-        }
-      } else {
-        rocksDB.iterator().map { pair =>
-          (pair.key, pair.value)
-        }
-      }
-    }
-
     override def prefixScan(
         prefixKey: UnsafeRow,
         colFamilyName: String): StateStoreIterator[UnsafeRowPair] = {

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStore.scala

Lines changed: 0 additions & 14 deletions

@@ -891,20 +891,6 @@ object StateStoreProvider extends Logging {
   }
 }
 
-/**
- * Trait for state stores that support reading raw bytes without decoding.
- * This is useful for copying state data during repartitioning
- */
-trait SupportsRawBytesRead {
-  /**
-   * Returns an iterator of raw key-value bytes for a column family.
-   * @param colFamilyName the name of the column family to iterate over
-   * @return an iterator of (keyBytes, valueBytes) tuples
-   */
-  def rawIterator(colFamilyName: String = StateStore.DEFAULT_COL_FAMILY_NAME):
-    Iterator[(Array[Byte], Array[Byte])]
-}
-
 /**
  * This is an optional trait to be implemented by [[StateStoreProvider]]s that can read the change
  * of state store over batches. This is used by State Data Source with additional options like