1818use std:: collections:: HashSet ;
1919use std:: sync:: Arc ;
2020
21+ use futures:: SinkExt ;
2122use futures:: future:: try_join_all;
22- use futures:: { Sink , SinkExt } ;
2323use once_cell:: sync:: Lazy ;
2424
2525use crate :: delete_file_index:: DeleteFileIndex ;
2626use crate :: error:: Result ;
2727use crate :: scan:: DeleteFileContext ;
28- use crate :: spec:: { DataFile , ManifestContentType , ManifestFile , Operation , SnapshotRef } ;
28+ use crate :: spec:: {
29+ DataContentType , DataFile , FormatVersion , INITIAL_SEQUENCE_NUMBER , ManifestContentType ,
30+ ManifestFile , Operation , SnapshotRef ,
31+ } ;
2932use crate :: table:: Table ;
3033use crate :: util:: snapshot:: ancestors_between;
3134use crate :: { Error , ErrorKind } ;
@@ -39,7 +42,7 @@ pub(crate) trait SnapshotValidator {
3942 // snapshot: parent snapshot
4043 // usually snapshot is the latest snapshot of base table, unless it's non-main branch
4144 // but we don't support writing to branches as of now
42- fn validate ( & self , _table : & Table , _snapshot : Option < & SnapshotRef > ) -> Result < ( ) > {
45+ async fn validate ( & self , _base : & Table , _parent_snapshot_id : Option < i64 > ) -> Result < ( ) > {
4346 // todo: add default implementation
4447 Ok ( ( ) )
4548 }
@@ -48,8 +51,8 @@ pub(crate) trait SnapshotValidator {
4851 async fn validation_history (
4952 & self ,
5053 base : & Table ,
51- to_snapshot : SnapshotRef , // todo maybe the naming/variable order can be better, or just snapshot id is better? this is parent
5254 from_snapshot_id : Option < i64 > ,
55+ to_snapshot_id : i64 ,
5356 matching_operations : & HashSet < Operation > ,
5457 manifest_content_type : ManifestContentType ,
5558 ) -> Result < ( Vec < ManifestFile > , HashSet < i64 > ) > {
@@ -59,7 +62,7 @@ pub(crate) trait SnapshotValidator {
5962
6063 let snapshots = ancestors_between (
6164 & Arc :: new ( base. metadata ( ) . clone ( ) ) ,
62- to_snapshot . snapshot_id ( ) ,
65+ to_snapshot_id ,
6366 from_snapshot_id. clone ( ) ,
6467 ) ;
6568
@@ -101,49 +104,95 @@ pub(crate) trait SnapshotValidator {
101104 Ok ( ( manifests, new_snapshots) )
102105 }
103106
104- #[ allow( dead_code) ]
105107 async fn validate_no_new_delete_files_for_data_files (
106108 & self ,
107109 base : & Table ,
108110 from_snapshot_id : Option < i64 > ,
109- _data_files : & [ DataFile ] ,
110- to_snapshot : SnapshotRef ,
111+ to_snapshot_id : Option < i64 > ,
112+ data_files : & [ DataFile ] ,
113+ ignore_equality_deletes : bool ,
111114 ) -> Result < ( ) > {
115+ // If there is no current table state, no files have been added
116+ if to_snapshot_id. is_none ( ) || base. metadata ( ) . format_version ( ) != FormatVersion :: V1 {
117+ return Ok ( ( ) ) ;
118+ }
119+ let to_snapshot_id = to_snapshot_id. unwrap ( ) ;
120+
112121 // Get matching delete files have been added since the from_snapshot_id
113- let ( delete_manifests, snapshot_ids ) = self
122+ let ( delete_manifests, _ ) = self
114123 . validation_history (
115124 base,
116- to_snapshot,
117125 from_snapshot_id,
126+ to_snapshot_id,
118127 & VALIDATE_ADDED_DELETE_FILES_OPERATIONS ,
119128 ManifestContentType :: Deletes ,
120129 )
121130 . await ?;
122131
123- // Building delete file index
124- let ( _delete_file_index , mut delete_file_tx) = DeleteFileIndex :: new ( ) ;
132+ // Build delete file index
133+ let ( delete_file_index , mut delete_file_tx) = DeleteFileIndex :: new ( ) ;
125134 let manifests = try_join_all (
126135 delete_manifests
127136 . iter ( )
128137 . map ( |f| f. load_manifest ( base. file_io ( ) ) )
129138 . collect :: < Vec < _ > > ( ) ,
130139 )
131140 . await ?;
132-
133- let delete_files_ctx = manifests
134- . iter ( )
135- . flat_map ( |manifest| manifest. entries ( ) )
136- . map ( |entry| DeleteFileContext {
141+ let manifest_entries = manifests. iter ( ) . flat_map ( |manifest| manifest. entries ( ) ) ;
142+ for entry in manifest_entries {
143+ let delete_file_ctx = DeleteFileContext {
137144 manifest_entry : entry. clone ( ) ,
138145 partition_spec_id : entry. data_file ( ) . partition_spec_id ,
139- } )
140- . collect :: < Vec < _ > > ( ) ;
141-
142- for ctx in delete_files_ctx {
143- delete_file_tx. send ( ctx) . await ?
146+ } ;
147+ delete_file_tx. send ( delete_file_ctx) . await ?;
144148 }
145149
146- // todo validate if there are deletes
150+ // Get starting seq num from starting snapshot if available
151+ let starting_sequence_number = if from_snapshot_id. is_some ( )
152+ && base
153+ . metadata ( )
154+ . snapshots
155+ . get ( & from_snapshot_id. unwrap ( ) )
156+ . is_some ( )
157+ {
158+ base. metadata ( )
159+ . snapshots
160+ . get ( & from_snapshot_id. unwrap ( ) )
161+ . unwrap ( )
162+ . sequence_number ( )
163+ } else {
164+ INITIAL_SEQUENCE_NUMBER
165+ } ;
166+
167+ // Validate if there are deletes using delete file index
168+ for data_file in data_files {
169+ let delete_files = delete_file_index
170+ . get_deletes_for_data_file ( data_file, Some ( starting_sequence_number) )
171+ . await ;
172+
173+ if ignore_equality_deletes {
174+ if delete_files
175+ . iter ( )
176+ . any ( |delete_file| delete_file. file_type == DataContentType :: PositionDeletes )
177+ {
178+ return Err ( Error :: new (
179+ ErrorKind :: DataInvalid ,
180+ format ! (
181+ "Cannot commit, found new positional delete for added data file: {}" ,
182+ data_file. file_path
183+ ) ,
184+ ) ) ;
185+ }
186+ } else if !delete_files. is_empty ( ) {
187+ return Err ( Error :: new (
188+ ErrorKind :: DataInvalid ,
189+ format ! (
190+ "Cannot commit, found new delete for added data file: {}" ,
191+ data_file. file_path
192+ ) ,
193+ ) ) ;
194+ }
195+ }
147196
148197 Ok ( ( ) )
149198 }
0 commit comments