@@ -12,12 +12,34 @@ use crate::expressions::{column_name, ColumnName, Expression, ExpressionRef, Pre
1212use crate :: kernel_predicates:: { DefaultKernelPredicateEvaluator , KernelPredicateEvaluator as _} ;
1313use crate :: log_replay:: { ActionsBatch , FileActionDeduplicator , FileActionKey , LogReplayProcessor } ;
1414use crate :: scan:: Scalar ;
15- use crate :: schema:: ToSchema as _;
15+ use crate :: schema:: { SchemaRef , ToSchema as _} ;
1616use crate :: schema:: { ColumnNamesAndTypes , DataType , MapType , StructField , StructType } ;
17- use crate :: transforms:: { get_transform_expr, parse_partition_values} ;
17+ use crate :: table_features:: ColumnMappingMode ;
18+ use crate :: transforms:: { get_transform_expr, parse_partition_values, TransformSpec } ;
1819use crate :: utils:: require;
1920use crate :: { DeltaResult , Engine , Error , ExpressionEvaluator } ;
2021
22+ /// Internal serializable state (schemas, transform spec, column mapping, etc.)
23+ /// This is opaque to the engine - just passed through as a blob.
24+ #[ derive( serde:: Serialize , serde:: Deserialize , Clone ) ]
25+ struct InternalState {
26+ logical_schema : StructType ,
27+ physical_schema : StructType ,
28+ transform_spec : Option < TransformSpec > ,
29+ column_mapping_mode : ColumnMappingMode ,
30+ }
31+
32+ /// Public-facing serialized processor state for distribution to executors.
33+ /// The engine passes everything needed as Arc references, plus an opaque internal state blob.
34+ #[ derive( Clone ) ]
35+ pub struct SerializedScanState {
36+ /// Optional predicate for data skipping (if provided)
37+ pub predicate : Option < PredicateRef > ,
38+ /// Opaque internal state blob (JSON for now)
39+ pub internal_state_blob : Vec < u8 > ,
40+ }
41+
42+
2143/// [`ScanLogReplayProcessor`] performs log replay (processes actions) specifically for doing a table scan.
2244///
2345/// During a table scan, the processor reads batches of log actions (in reverse chronological order)
@@ -44,11 +66,11 @@ pub(crate) struct ScanLogReplayProcessor {
4466 partition_filter : Option < PredicateRef > ,
4567 data_skipping_filter : Option < DataSkippingFilter > ,
4668 add_transform : Arc < dyn ExpressionEvaluator > ,
47- state_info : Arc < StateInfo > ,
69+ pub ( crate ) state_info : Arc < StateInfo > ,
4870 /// A set of (data file path, dv_unique_id) pairs that have been seen thus
4971 /// far in the log. This is used to filter out files with Remove actions as
5072 /// well as duplicate entries in the log.
51- seen_file_keys : HashSet < FileActionKey > ,
73+ pub ( crate ) seen_file_keys : HashSet < FileActionKey > ,
5274}
5375
5476impl ScanLogReplayProcessor {
@@ -84,6 +106,91 @@ impl ScanLogReplayProcessor {
84106 state_info,
85107 } )
86108 }
109+
110+ /// Serialize the processor state for distribution to executors.
111+ ///
112+ /// Consumes the processor and returns:
113+ /// - `SerializedScanState`: Public-facing state with predicate and internal blob
114+ /// - `HashSet<FileActionKey>`: The deduplication set (moved for independent use on executors)
115+ ///
116+ /// Executors can use `from_serialized` to reconstruct the processor with this state.
117+ pub ( crate ) fn serialize ( self ) -> DeltaResult < ( SerializedScanState , HashSet < FileActionKey > ) > {
118+ // Serialize internal state to JSON blob (schemas, transform spec, and column mapping mode)
119+ let internal_state = InternalState {
120+ logical_schema : ( * self . state_info . logical_schema ) . clone ( ) ,
121+ physical_schema : ( * self . state_info . physical_schema ) . clone ( ) ,
122+ transform_spec : self . state_info . transform_spec . as_ref ( ) . map ( |ts| ( * * ts) . clone ( ) ) ,
123+ column_mapping_mode : self . state_info . column_mapping_mode ,
124+ } ;
125+ let internal_state_blob = serde_json:: to_vec ( & internal_state)
126+ . map_err ( |e| Error :: generic ( format ! ( "Failed to serialize internal state: {}" , e) ) ) ?;
127+
128+ // Extract predicate from PhysicalPredicate
129+ let predicate = match & self . state_info . physical_predicate {
130+ PhysicalPredicate :: Some ( pred, _schema) => Some ( pred. clone ( ) ) ,
131+ _ => None ,
132+ } ;
133+
134+ let state = SerializedScanState {
135+ predicate,
136+ internal_state_blob,
137+ } ;
138+
139+ Ok ( ( state, self . seen_file_keys ) )
140+ }
141+
142+ /// Reconstruct a processor from serialized state.
143+ ///
144+ /// Creates a new processor with the provided state and seen_file_keys.
145+ /// All other fields (partition_filter, data_skipping_filter, add_transform) are
146+ /// reconstructed from the state and engine.
147+ ///
148+ /// # Parameters
149+ /// - `engine`: Engine for creating evaluators and filters
150+ /// - `state`: The serialized state from serialization
151+ /// - `seen_file_keys`: The deduplication set from serialization
152+ pub ( crate ) fn from_serialized (
153+ engine : & dyn Engine ,
154+ state : SerializedScanState ,
155+ seen_file_keys : HashSet < FileActionKey > ,
156+ ) -> DeltaResult < Self > {
157+ // Deserialize internal state
158+ let internal_state: InternalState = serde_json:: from_slice ( & state. internal_state_blob )
159+ . map_err ( |e| Error :: generic ( format ! ( "Failed to deserialize internal state: {}" , e) ) ) ?;
160+
161+ // Convert schemas to Arc
162+ let logical_schema = Arc :: new ( internal_state. logical_schema ) ;
163+ let physical_schema = Arc :: new ( internal_state. physical_schema ) ;
164+
165+ // Reconstruct PhysicalPredicate from predicate and schema
166+ let physical_predicate = match state. predicate {
167+ Some ( pred) => PhysicalPredicate :: Some ( pred, physical_schema. clone ( ) ) ,
168+ None => PhysicalPredicate :: None ,
169+ } ;
170+
171+ // Reconstruct StateInfo
172+ let state_info = Arc :: new ( StateInfo {
173+ logical_schema,
174+ physical_schema,
175+ physical_predicate,
176+ transform_spec : internal_state. transform_spec . map ( Arc :: new) ,
177+ column_mapping_mode : internal_state. column_mapping_mode ,
178+ } ) ;
179+
180+ // Create processor and set seen_file_keys
181+ let mut processor = Self :: new ( engine, state_info) ?;
182+ processor. seen_file_keys = seen_file_keys;
183+ Ok ( processor)
184+ }
185+
186+ /// Get the projected schema needed to read checkpoint/sidecar files.
187+ ///
188+ /// Returns the schema that should be used when reading leaf checkpoint files
189+ /// or sidecars during the executor phase.
190+ pub ( crate ) fn get_projected_schema ( & self ) -> DeltaResult < crate :: schema:: SchemaRef > {
191+ use crate :: actions:: ADD_NAME ;
192+ get_log_add_schema ( ) . project ( & [ ADD_NAME ] )
193+ }
87194}
88195
89196/// A visitor that deduplicates a stream of add and remove actions into a stream of valid adds. Log
0 commit comments