Commit 0a878e9

Author: Ubuntu (committed)

address comment

1 parent 69f5b19 commit 0a878e9

File tree: 8 files changed (+111, -258 lines)


sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/state/StateDataSource.scala

Lines changed: 38 additions & 38 deletions
@@ -65,38 +65,35 @@ class StateDataSource extends TableProvider with DataSourceRegister with Logging
     val sourceOptions = StateSourceOptions.modifySourceOptions(hadoopConf,
       StateSourceOptions.apply(session, hadoopConf, properties))
     val stateConf = buildStateStoreConf(sourceOptions.resolvedCpLocation, sourceOptions.batchId)
-    if (sourceOptions.readAllColumnFamilies) {
-      // For readAllColumnFamilies mode, we don't need specific encoder because it returns raw data
-      val keyStateEncoderSpec = NoPrefixKeyStateEncoderSpec(new StructType())
-      new StateTable(session, schema, sourceOptions, stateConf, keyStateEncoderSpec,
-        None, None, None, None)
-    } else {
-      val stateStoreReaderInfo: StateStoreReaderInfo = getStoreMetadataAndRunChecks(
-        sourceOptions)
+    if (sourceOptions.internalOnlyReadAllColumnFamilies
+      && !stateConf.providerClass.contains("RocksDB")) {
+      throw StateDataSourceErrors.invalidOptionValue(
+        StateSourceOptions.INTERNAL_ONLY_READ_ALL_COLUMN_FAMILIES,
+        "internalOnlyReadAllColumnFamilies is only supported with RocksDBStateStoreProvider. " +
+          s"Current provider: ${stateConf.providerClass}")
+    }
+    val stateStoreReaderInfo: StateStoreReaderInfo = getStoreMetadataAndRunChecks(
+      sourceOptions)
 
-      // The key state encoder spec should be available for all operators except stream-stream joins
-      val keyStateEncoderSpec = if (stateStoreReaderInfo.keyStateEncoderSpecOpt.isDefined) {
-        stateStoreReaderInfo.keyStateEncoderSpecOpt.get
-      } else {
-        val keySchema = SchemaUtil.getSchemaAsDataType(schema, "key").asInstanceOf[StructType]
-        NoPrefixKeyStateEncoderSpec(keySchema)
-      }
-      new StateTable(session, schema, sourceOptions, stateConf, keyStateEncoderSpec,
-        stateStoreReaderInfo.transformWithStateVariableInfoOpt,
-        stateStoreReaderInfo.stateStoreColFamilySchemaOpt,
-        stateStoreReaderInfo.stateSchemaProviderOpt,
-        stateStoreReaderInfo.joinColFamilyOpt)
+    // The key state encoder spec should be available for all operators except stream-stream joins
+    val keyStateEncoderSpec = if (stateStoreReaderInfo.keyStateEncoderSpecOpt.isDefined) {
+      stateStoreReaderInfo.keyStateEncoderSpecOpt.get
+    } else {
+      val keySchema = SchemaUtil.getSchemaAsDataType(schema, "key").asInstanceOf[StructType]
+      NoPrefixKeyStateEncoderSpec(keySchema)
     }
+
+    new StateTable(session, schema, sourceOptions, stateConf, keyStateEncoderSpec,
+      stateStoreReaderInfo.transformWithStateVariableInfoOpt,
+      stateStoreReaderInfo.stateStoreColFamilySchemaOpt,
+      stateStoreReaderInfo.stateSchemaProviderOpt,
+      stateStoreReaderInfo.joinColFamilyOpt)
   }
 
   override def inferSchema(options: CaseInsensitiveStringMap): StructType = {
     val sourceOptions = StateSourceOptions.modifySourceOptions(hadoopConf,
       StateSourceOptions.apply(session, hadoopConf, options))
-    if (sourceOptions.readAllColumnFamilies) {
-      // For readAllColumnFamilies mode, return the binary schema directly
-      return SchemaUtil.getSourceSchema(
-        sourceOptions, new StructType(), new StructType(), None, None)
-    }
+
     val stateStoreReaderInfo: StateStoreReaderInfo = getStoreMetadataAndRunChecks(
       sourceOptions)
     val oldSchemaFilePaths = StateDataSource.getOldSchemaFilePaths(sourceOptions, hadoopConf)

@@ -381,7 +378,7 @@ case class StateSourceOptions(
     stateVarName: Option[String],
     readRegisteredTimers: Boolean,
     flattenCollectionTypes: Boolean,
-    readAllColumnFamilies: Boolean,
+    internalOnlyReadAllColumnFamilies: Boolean,
     startOperatorStateUniqueIds: Option[Array[Array[String]]] = None,
     endOperatorStateUniqueIds: Option[Array[Array[String]]] = None) {
   def stateCheckpointLocation: Path = new Path(resolvedCpLocation, DIR_NAME_STATE)

@@ -391,7 +388,7 @@ case class StateSourceOptions(
       s"operatorId=$operatorId, storeName=$storeName, joinSide=$joinSide, " +
       s"stateVarName=${stateVarName.getOrElse("None")}, +" +
       s"flattenCollectionTypes=$flattenCollectionTypes" +
-      s"readAllColumnFamilies=$readAllColumnFamilies"
+      s"internalOnlyReadAllColumnFamilies=$internalOnlyReadAllColumnFamilies"
     if (fromSnapshotOptions.isDefined) {
       desc += s", snapshotStartBatchId=${fromSnapshotOptions.get.snapshotStartBatchId}"
       desc += s", snapshotPartitionId=${fromSnapshotOptions.get.snapshotPartitionId}"

@@ -418,7 +415,7 @@ object StateSourceOptions extends DataSourceOptions {
   val STATE_VAR_NAME = newOption("stateVarName")
   val READ_REGISTERED_TIMERS = newOption("readRegisteredTimers")
   val FLATTEN_COLLECTION_TYPES = newOption("flattenCollectionTypes")
-  val READ_ALL_COLUMN_FAMILIES = newOption("readAllColumnFamilies")
+  val INTERNAL_ONLY_READ_ALL_COLUMN_FAMILIES = newOption("internalOnlyReadAllColumnFamilies")
 
   object JoinSideValues extends Enumeration {
     type JoinSideValues = Value

@@ -503,25 +500,28 @@ object StateSourceOptions extends DataSourceOptions {
 
     val readChangeFeed = Option(options.get(READ_CHANGE_FEED)).exists(_.toBoolean)
 
-    val readAllColumnFamilies = try {
-      Option(options.get(READ_ALL_COLUMN_FAMILIES))
+    val internalOnlyReadAllColumnFamilies = try {
+      Option(options.get(INTERNAL_ONLY_READ_ALL_COLUMN_FAMILIES))
         .map(_.toBoolean).getOrElse(false)
     } catch {
       case _: IllegalArgumentException =>
-        throw StateDataSourceErrors.invalidOptionValue(READ_ALL_COLUMN_FAMILIES,
+        throw StateDataSourceErrors.invalidOptionValue(INTERNAL_ONLY_READ_ALL_COLUMN_FAMILIES,
          "Boolean value is expected")
     }
 
-    if (readAllColumnFamilies && stateVarName.isDefined) {
-      throw StateDataSourceErrors.conflictOptions(Seq(READ_ALL_COLUMN_FAMILIES, STATE_VAR_NAME))
+    if (internalOnlyReadAllColumnFamilies && stateVarName.isDefined) {
+      throw StateDataSourceErrors.conflictOptions(
+        Seq(INTERNAL_ONLY_READ_ALL_COLUMN_FAMILIES, STATE_VAR_NAME))
     }
 
-    if (readAllColumnFamilies && joinSide != JoinSideValues.none) {
-      throw StateDataSourceErrors.conflictOptions(Seq(READ_ALL_COLUMN_FAMILIES, JOIN_SIDE))
+    if (internalOnlyReadAllColumnFamilies && joinSide != JoinSideValues.none) {
+      throw StateDataSourceErrors.conflictOptions(
+        Seq(INTERNAL_ONLY_READ_ALL_COLUMN_FAMILIES, JOIN_SIDE))
     }
 
-    if (readAllColumnFamilies && readChangeFeed) {
-      throw StateDataSourceErrors.conflictOptions(Seq(READ_ALL_COLUMN_FAMILIES, READ_CHANGE_FEED))
+    if (internalOnlyReadAllColumnFamilies && readChangeFeed) {
+      throw StateDataSourceErrors.conflictOptions(
+        Seq(INTERNAL_ONLY_READ_ALL_COLUMN_FAMILIES, READ_CHANGE_FEED))
     }
 
     val changeStartBatchId = Option(options.get(CHANGE_START_BATCH_ID)).map(_.toLong)

@@ -648,7 +648,7 @@ object StateSourceOptions extends DataSourceOptions {
       resolvedCpLocation, batchId.get, operatorId, storeName, joinSide,
       readChangeFeed, fromSnapshotOptions, readChangeFeedOptions,
       stateVarName, readRegisteredTimers, flattenCollectionTypes,
-      readAllColumnFamilies, startOperatorStateUniqueIds, endOperatorStateUniqueIds)
+      internalOnlyReadAllColumnFamilies, startOperatorStateUniqueIds, endOperatorStateUniqueIds)
   }
 
   private def resolvedCheckpointLocation(
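
As context for the rename, here is a minimal usage sketch of how the option would be passed to the state data source reader, assuming an active SparkSession named spark. This is not part of the commit; the checkpoint path, operator id, and batch id are placeholders. Per the check added above, the option is only accepted with RocksDBStateStoreProvider and cannot be combined with stateVarName, joinSide, or readChangeFeed.

// Hypothetical example: read raw key/value bytes for every column family of one operator.
// Assumes spark.sql.streaming.stateStore.providerClass is set to RocksDBStateStoreProvider.
val rawStateDf = spark.read
  .format("statestore")
  .option("operatorId", "0")
  .option("batchId", "5")
  .option("internalOnlyReadAllColumnFamilies", "true")
  .load("/path/to/checkpoint")

rawStateDf.select("partition_key", "key_bytes", "value_bytes", "column_family_name").show()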

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/state/StatePartitionReader.scala

Lines changed: 12 additions & 61 deletions
@@ -20,7 +20,6 @@ import org.apache.spark.internal.Logging
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeRow}
 import org.apache.spark.sql.connector.read.{InputPartition, PartitionReader, PartitionReaderFactory}
-import org.apache.spark.sql.execution.datasources.v2.state.metadata.StateMetadataPartitionReader
 import org.apache.spark.sql.execution.datasources.v2.state.utils.SchemaUtil
 import org.apache.spark.sql.execution.streaming.operators.stateful.join.SymmetricHashJoinStateManager
 import org.apache.spark.sql.execution.streaming.operators.stateful.transformwithstate.{StateVariableType, TransformWithStateVariableInfo}

@@ -50,9 +49,9 @@ class StatePartitionReaderFactory(
 
   override def createReader(partition: InputPartition): PartitionReader[InternalRow] = {
     val stateStoreInputPartition = partition.asInstanceOf[StateStoreInputPartition]
-    if (stateStoreInputPartition.sourceOptions.readAllColumnFamilies) {
+    if (stateStoreInputPartition.sourceOptions.internalOnlyReadAllColumnFamilies) {
       new StatePartitionReaderAllColumnFamilies(storeConf, hadoopConf,
-        stateStoreInputPartition, schema)
+        stateStoreInputPartition, schema, keyStateEncoderSpec)
     } else if (stateStoreInputPartition.sourceOptions.readChangeFeed) {
       new StateStoreChangeDataPartitionReader(storeConf, hadoopConf,
         stateStoreInputPartition, schema, keyStateEncoderSpec, stateVariableInfoOpt,

@@ -85,15 +84,15 @@ abstract class StatePartitionReaderBase(
   private val schemaForValueRow: StructType =
     StructType(Array(StructField("__dummy__", NullType)))
 
-  protected lazy val keySchema = {
+  protected val keySchema = {
     if (SchemaUtil.checkVariableType(stateVariableInfoOpt, StateVariableType.MapState)) {
       SchemaUtil.getCompositeKeySchema(schema, partition.sourceOptions)
     } else {
       SchemaUtil.getSchemaAsDataType(schema, "key").asInstanceOf[StructType]
     }
   }
 
-  protected lazy val valueSchema = if (stateVariableInfoOpt.isDefined) {
+  protected val valueSchema = if (stateVariableInfoOpt.isDefined) {
     schemaForValueRow
   } else {
     SchemaUtil.getSchemaAsDataType(

@@ -249,16 +248,10 @@ class StatePartitionReaderAllColumnFamilies(
    storeConf: StateStoreConf,
    hadoopConf: SerializableConfiguration,
    partition: StateStoreInputPartition,
-    schema: StructType)
+    schema: StructType,
+    keyStateEncoderSpec: KeyStateEncoderSpec)
   extends StatePartitionReaderBase(storeConf, hadoopConf, partition, schema,
-    NoPrefixKeyStateEncoderSpec(new StructType()), None, None, None, None) {
-
-  val allStateStoreMetadata = {
-    new StateMetadataPartitionReader(
-      partition.sourceOptions.resolvedCpLocation,
-      new SerializableConfiguration(hadoopConf.value),
-      partition.sourceOptions.batchId).stateMetadata.toArray
-  }
+    keyStateEncoderSpec, None, None, None, None) {
 
   private lazy val store: ReadStateStore = {
     assert(getStartStoreUniqueId == getEndStoreUniqueId,

@@ -269,56 +262,14 @@ class StatePartitionReaderAllColumnFamilies(
     )
   }
 
-  val colFamilyNames: Seq[String] = {
-    // todo: Support operator with multiple column family names in next PR
-    Seq[String]()
-  }
-
-  override protected lazy val provider: StateStoreProvider = {
-    val stateStoreId = StateStoreId(partition.sourceOptions.stateCheckpointLocation.toString,
-      partition.sourceOptions.operatorId, partition.partition, partition.sourceOptions.storeName)
-    val stateStoreProviderId = StateStoreProviderId(stateStoreId, partition.queryId)
-
-    // Disable format validation when reading raw bytes.
-    // We use binary schemas (keyBytes/valueBytes) which don't match the actual schema
-    // of the stored data. Validation would fail in HDFSBackedStateStoreProvider when
-    // loading data from disk, so we disable it for raw bytes mode.
-    val modifiedStoreConf = storeConf.withFormatValidationDisabled()
-
-    val keyStateEncoderSpec = NoPrefixKeyStateEncoderSpec(new StructType())
-    // Pass in empty keySchema, valueSchema and dummy encoder because we don't encode any data
-    val provider = StateStoreProvider.createAndInit(
-      stateStoreProviderId, new StructType(), new StructType(), keyStateEncoderSpec,
-      useColumnFamilies = colFamilyNames.nonEmpty, modifiedStoreConf, hadoopConf.value, false, None)
-
-    provider
-  }
-
   override lazy val iter: Iterator[InternalRow] = {
     // Single store with column families (join v3, transformWithState, or simple operators)
-    require(store.isInstanceOf[SupportsRawBytesRead],
-      s"State store ${store.getClass.getName} does not support raw bytes reading")
-
-    val rawStore = store.asInstanceOf[SupportsRawBytesRead]
-    if (colFamilyNames.isEmpty) {
-      rawStore
-        .rawIterator()
-        .map { case (keyBytes, valueBytes) =>
-          SchemaUtil.unifyStateRowPairAsRawBytes(
-            partition.partition, keyBytes, valueBytes, StateStore.DEFAULT_COL_FAMILY_NAME)
-        }
-    } else {
-      colFamilyNames.iterator.flatMap { colFamilyName =>
-        rawStore
-          .rawIterator(colFamilyName)
-          .map { case (keyBytes, valueBytes) =>
-            SchemaUtil.unifyStateRowPairAsRawBytes(partition.partition,
-              keyBytes,
-              valueBytes,
-              colFamilyName)
-          }
+    store
+      .iterator()
+      .map { pair =>
+        SchemaUtil.unifyStateRowPairAsRawBytes(
+          (pair.key, pair.value), StateStore.DEFAULT_COL_FAMILY_NAME)
       }
-    }
   }
 
   override def close(): Unit = {

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/state/utils/SchemaUtil.scala

Lines changed: 16 additions & 11 deletions
@@ -61,12 +61,17 @@ object SchemaUtil {
       .add("key", keySchema)
       .add("value", valueSchema)
       .add("partition_id", IntegerType)
-    } else if (sourceOptions.readAllColumnFamilies) {
+    } else if (sourceOptions.internalOnlyReadAllColumnFamilies) {
       new StructType()
-        .add("partition_id", IntegerType)
+        // todo: change this to some more specific type after we
+        // can extract partition key from keySchema
+        .add("partition_key", keySchema)
         .add("key_bytes", BinaryType)
         .add("value_bytes", BinaryType)
         .add("column_family_name", StringType)
+        // need key and value schema so that state store can encode data
+        .add("value", valueSchema)
+        .add("key", keySchema)
     } else {
       new StructType()
         .add("key", keySchema)

@@ -89,15 +94,14 @@
    * instead of a tuple for better readability.
    */
   def unifyStateRowPairAsRawBytes(
-      partition: Int,
-      keyBytes: Array[Byte],
-      valueBytes: Array[Byte],
+      pair: (UnsafeRow, UnsafeRow),
       colFamilyName: String): InternalRow = {
-    val row = new GenericInternalRow(4)
-    row.update(0, partition)
-    row.update(1, keyBytes)
-    row.update(2, valueBytes)
+    val row = new GenericInternalRow(6)
+    row.update(0, pair._1)
+    row.update(1, pair._1.getBytes)
+    row.update(2, pair._2.getBytes)
     row.update(3, UTF8String.fromString(colFamilyName))
+    // row.update(4, pair._2)
     row
   }
 

@@ -257,6 +261,7 @@ object SchemaUtil {
       "user_map_value" -> classOf[StructType],
       "expiration_timestamp_ms" -> classOf[LongType],
       "partition_id" -> classOf[IntegerType],
+      "partition_key" -> classOf[StructType],
       "key_bytes"->classOf[BinaryType],
       "value_bytes"->classOf[BinaryType],
       "column_family_name"->classOf[StringType])

@@ -300,8 +305,8 @@ object SchemaUtil {
       }
     } else if (sourceOptions.readChangeFeed) {
       Seq("batch_id", "change_type", "key", "value", "partition_id")
-    } else if (sourceOptions.readAllColumnFamilies) {
-      Seq("partition_id", "key_bytes", "value_bytes", "column_family_name")
+    } else if (sourceOptions.internalOnlyReadAllColumnFamilies) {
+      Seq("partition_key", "key_bytes", "value_bytes", "column_family_name", "value", "key")
     } else {
       Seq("key", "value", "partition_id")
     }
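
Because the raw rows above carry bytes produced by UnsafeRow.getBytes, together with the key and value schemas now included in the output, a consumer of this output could rebuild typed rows from the key_bytes and value_bytes columns. A minimal sketch of such a decoder follows; it is not part of this commit and the helper name is only illustrative.

import org.apache.spark.sql.catalyst.expressions.UnsafeRow
import org.apache.spark.sql.types.StructType

// Hypothetical helper: reconstruct an UnsafeRow from bytes obtained via UnsafeRow.getBytes
// (as in unifyStateRowPairAsRawBytes). The field count must come from the matching
// key or value schema exposed in the new output schema.
def decodeUnsafeRow(bytes: Array[Byte], schema: StructType): UnsafeRow = {
  val row = new UnsafeRow(schema.fields.length)
  row.pointTo(bytes, bytes.length)
  row
}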

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/HDFSBackedStateStoreProvider.scala

Lines changed: 2 additions & 19 deletions
@@ -75,7 +75,7 @@ private[sql] class HDFSBackedStateStoreProvider extends StateStoreProvider with
   private val providerName = "HDFSBackedStateStoreProvider"
 
   class HDFSBackedReadStateStore(val version: Long, map: HDFSBackedStateStoreMap)
-    extends ReadStateStore with SupportsRawBytesRead {
+    extends ReadStateStore {
 
     override def id: StateStoreId = HDFSBackedStateStoreProvider.this.stateStoreId
 

@@ -104,22 +104,14 @@ private[sql] class HDFSBackedStateStoreProvider extends StateStoreProvider with
     override def valuesIterator(key: UnsafeRow, colFamilyName: String): Iterator[UnsafeRow] = {
       throw StateStoreErrors.unsupportedOperationException("multipleValuesPerKey", "HDFSStateStore")
     }
-
-    override def rawIterator(colFamilyName: String): Iterator[(Array[Byte], Array[Byte])] = {
-      // For HDFS, we get UnsafeRows and convert them to bytes
-      // The bytes will be properly aligned since they come from valid UnsafeRows
-      map.iterator().map { pair =>
-        (pair.key.getBytes(), pair.value.getBytes())
-      }
-    }
   }
 
   /** Implementation of [[StateStore]] API which is backed by an HDFS-compatible file system */
   class HDFSBackedStateStore(
       val version: Long,
       private val mapToUpdate: HDFSBackedStateStoreMap,
       shouldForceSnapshot: Boolean = false)
-    extends StateStore with SupportsRawBytesRead {
+    extends StateStore {
 
     /** Trait and classes representing the internal state of the store */
     trait STATE

@@ -247,15 +239,6 @@ private[sql] class HDFSBackedStateStoreProvider extends StateStoreProvider with
       new StateStoreIterator(iter)
     }
 
-    override def rawIterator(colFamilyName: String): Iterator[(Array[Byte], Array[Byte])] = {
-      assertUseOfDefaultColFamily(colFamilyName)
-      // For HDFS, we get UnsafeRows and convert them to bytes
-      // The bytes will be properly aligned since they come from valid UnsafeRows
-      mapToUpdate.iterator().map { pair =>
-        (pair.key.getBytes(), pair.value.getBytes())
-      }
-    }
-
     override def prefixScan(
         prefixKey: UnsafeRow,
         colFamilyName: String): StateStoreIterator[UnsafeRowPair] = {

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/RocksDBStateStoreProvider.scala

Lines changed: 1 addition & 17 deletions
@@ -47,8 +47,7 @@ private[sql] class RocksDBStateStoreProvider
       lastVersion: Long,
       private[RocksDBStateStoreProvider] val stamp: Long,
       private[RocksDBStateStoreProvider] var readOnly: Boolean,
-      private[RocksDBStateStoreProvider] var forceSnapshotOnCommit: Boolean) extends StateStore
-    with SupportsRawBytesRead {
+      private[RocksDBStateStoreProvider] var forceSnapshotOnCommit: Boolean) extends StateStore {
 
     private sealed trait OPERATION
     private case object UPDATE extends OPERATION

@@ -420,21 +419,6 @@ private[sql] class RocksDBStateStoreProvider
       }
     }
 
-    override def rawIterator(colFamilyName: String): Iterator[(Array[Byte], Array[Byte])] = {
-      validateAndTransitionState(UPDATE)
-      verifyColFamilyOperations("rawIterator", colFamilyName)
-
-      if (useColumnFamilies) {
-        rocksDB.iterator(colFamilyName).map { pair =>
-          (pair.key, pair.value)
-        }
-      } else {
-        rocksDB.iterator().map { pair =>
-          (pair.key, pair.value)
-        }
-      }
-    }
-
     override def prefixScan(
         prefixKey: UnsafeRow,
         colFamilyName: String): StateStoreIterator[UnsafeRowPair] = {
