|
16 | 16 | // under the License. |
17 | 17 |
|
18 | 18 | use std::collections::HashSet; |
| 19 | +use std::sync::Arc; |
19 | 20 |
|
| 21 | +use futures::{Sink, SinkExt}; |
| 22 | +use futures::future::try_join_all; |
| 23 | +use once_cell::sync::Lazy; |
| 24 | + |
| 25 | +use crate::delete_file_index::DeleteFileIndex; |
20 | 26 | use crate::error::Result; |
21 | | -use crate::spec::{ManifestContentType, ManifestFile, Operation, SnapshotRef}; |
| 27 | +use crate::scan::DeleteFileContext; |
| 28 | +use crate::spec::{ |
| 29 | + DataFile, ManifestContentType, ManifestFile, Operation, SnapshotRef, |
| 30 | +}; |
22 | 31 | use crate::table::Table; |
| 32 | +use crate::util::snapshot::ancestors_between; |
| 33 | +use crate::{Error, ErrorKind}; |
| 34 | + |
| 35 | +static VALIDATE_ADDED_DELETE_FILES_OPERATIONS: Lazy<HashSet<Operation>> = |
| 36 | + Lazy::new(|| HashSet::from([Operation::Overwrite, Operation::Delete])); |
23 | 37 |
|
24 | 38 | pub(crate) trait SnapshotValidator { |
| 39 | + // todo doc |
| 40 | + // table: base table |
| 41 | + // snapshot: parent snapshot |
| 42 | + // usually snapshot is the latest snapshot of base table, unless it's non-main branch |
| 43 | + // but we don't support writing to branches as of now |
25 | 44 | fn validate(&self, _table: &Table, _snapshot: Option<&SnapshotRef>) -> Result<()> { |
26 | 45 | // todo: add default implementation |
27 | 46 | Ok(()) |
28 | 47 | } |
29 | 48 |
|
30 | | - #[allow(dead_code)] |
| 49 | + // todo doc |
31 | 50 | async fn validation_history( |
32 | 51 | &self, |
33 | | - _base: &Table, |
34 | | - _from_snapshot: Option<&SnapshotRef>, |
35 | | - _to_snapshot: &SnapshotRef, |
36 | | - _matching_operations: HashSet<Operation>, |
37 | | - _manifest_content_type: ManifestContentType, |
| 52 | + base: &Table, |
| 53 | + to_snapshot: SnapshotRef, // todo maybe the naming/variable order can be better, or just snapshot id is better? this is parent |
| 54 | + from_snapshot_id: Option<i64>, |
| 55 | + matching_operations: &HashSet<Operation>, |
| 56 | + manifest_content_type: ManifestContentType, |
38 | 57 | ) -> Result<(Vec<ManifestFile>, HashSet<i64>)> { |
39 | | - // todo: add default implementation |
40 | | - Ok((vec![], HashSet::new())) |
| 58 | + let mut manifests: Vec<ManifestFile> = vec![]; |
| 59 | + let mut new_snapshots = HashSet::new(); |
| 60 | + let mut last_snapshot: Option<SnapshotRef> = None; |
| 61 | + |
| 62 | + let snapshots = ancestors_between( |
| 63 | + &Arc::new(base.metadata().clone()), |
| 64 | + to_snapshot.snapshot_id(), |
| 65 | + from_snapshot_id.clone(), |
| 66 | + ); |
| 67 | + |
| 68 | + for current_snapshot in snapshots { |
| 69 | + last_snapshot = Some(current_snapshot.clone()); |
| 70 | + |
| 71 | + // Find all snapshots with the matching operations |
| 72 | + // and their manifest files with the matching content type |
| 73 | + if matching_operations.contains(¤t_snapshot.summary().operation) { |
| 74 | + new_snapshots.insert(current_snapshot.snapshot_id()); |
| 75 | + current_snapshot |
| 76 | + .load_manifest_list(base.file_io(), base.metadata()) |
| 77 | + .await? |
| 78 | + .entries() |
| 79 | + .iter() |
| 80 | + .for_each(|manifest| { |
| 81 | + if manifest.content == manifest_content_type |
| 82 | + && manifest.added_snapshot_id == current_snapshot.snapshot_id() |
| 83 | + { |
| 84 | + manifests.push(manifest.clone()); |
| 85 | + } |
| 86 | + }); |
| 87 | + } |
| 88 | + } |
| 89 | + |
| 90 | + if last_snapshot.is_some() |
| 91 | + && last_snapshot.clone().unwrap().parent_snapshot_id() != from_snapshot_id |
| 92 | + { |
| 93 | + return Err(Error::new( |
| 94 | + ErrorKind::DataInvalid, |
| 95 | + format!( |
| 96 | + "Cannot determine history between starting snapshot {} and the last known ancestor {}", |
| 97 | + from_snapshot_id.unwrap_or(-1), |
| 98 | + last_snapshot.unwrap().snapshot_id() |
| 99 | + ), |
| 100 | + )); |
| 101 | + } |
| 102 | + |
| 103 | + Ok((manifests, new_snapshots)) |
| 104 | + } |
| 105 | + |
| 106 | + #[allow(dead_code)] |
| 107 | + async fn validate_no_new_delete_files_for_data_files( |
| 108 | + &self, |
| 109 | + base: &Table, |
| 110 | + from_snapshot_id: Option<i64>, |
| 111 | + _data_files: &[DataFile], |
| 112 | + to_snapshot: SnapshotRef, |
| 113 | + ) -> Result<()> { |
| 114 | + // Get matching delete files have been added since the from_snapshot_id |
| 115 | + let (delete_manifests, snapshot_ids) = self |
| 116 | + .validation_history( |
| 117 | + base, |
| 118 | + to_snapshot, |
| 119 | + from_snapshot_id, |
| 120 | + &VALIDATE_ADDED_DELETE_FILES_OPERATIONS, |
| 121 | + ManifestContentType::Deletes, |
| 122 | + ) |
| 123 | + .await?; |
| 124 | + |
| 125 | + // Building delete file index |
| 126 | + let (_delete_file_index, mut delete_file_tx) = DeleteFileIndex::new(); |
| 127 | + let manifests = try_join_all( |
| 128 | + delete_manifests |
| 129 | + .iter() |
| 130 | + .map(|f| f.load_manifest(base.file_io())) |
| 131 | + .collect::<Vec<_>>(), |
| 132 | + ) |
| 133 | + .await?; |
| 134 | + |
| 135 | + let delete_files_ctx = manifests |
| 136 | + .iter() |
| 137 | + .flat_map(|manifest| manifest.entries()) |
| 138 | + .map(|entry| DeleteFileContext { |
| 139 | + manifest_entry: entry.clone(), |
| 140 | + partition_spec_id: entry.data_file().partition_spec_id |
| 141 | + }).collect::<Vec<_>>(); |
| 142 | + |
| 143 | + for ctx in delete_files_ctx { |
| 144 | + delete_file_tx.send(ctx).await? |
| 145 | + } |
| 146 | + |
| 147 | + // todo validate if there are deletes |
| 148 | + |
| 149 | + Ok(()) |
41 | 150 | } |
42 | 151 | } |
0 commit comments