@@ -289,6 +289,129 @@ def __repr__(self) -> str:
289289 doc = "ID representing sort order for this file" ,
290290 ),
291291 ),
292+ 3 : StructType (
293+ NestedField (
294+ field_id = 134 ,
295+ name = "content" ,
296+ field_type = IntegerType (),
297+ required = True ,
298+ doc = "File format name: avro, orc, or parquet" ,
299+ initial_default = DataFileContent .DATA ,
300+ ),
301+ NestedField (field_id = 100 , name = "file_path" , field_type = StringType (), required = True , doc = "Location URI with FS scheme" ),
302+ NestedField (
303+ field_id = 101 ,
304+ name = "file_format" ,
305+ field_type = StringType (),
306+ required = True ,
307+ doc = "File format name: avro, orc, or parquet" ,
308+ ),
309+ NestedField (
310+ field_id = 102 ,
311+ name = "partition" ,
312+ field_type = StructType (),
313+ required = True ,
314+ doc = "Partition data tuple, schema based on the partition spec" ,
315+ ),
316+ NestedField (field_id = 103 , name = "record_count" , field_type = LongType (), required = True , doc = "Number of records in the file" ),
317+ NestedField (
318+ field_id = 104 , name = "file_size_in_bytes" , field_type = LongType (), required = True , doc = "Total file size in bytes"
319+ ),
320+ NestedField (
321+ field_id = 108 ,
322+ name = "column_sizes" ,
323+ field_type = MapType (key_id = 117 , key_type = IntegerType (), value_id = 118 , value_type = LongType ()),
324+ required = False ,
325+ doc = "Map of column id to total size on disk" ,
326+ ),
327+ NestedField (
328+ field_id = 109 ,
329+ name = "value_counts" ,
330+ field_type = MapType (key_id = 119 , key_type = IntegerType (), value_id = 120 , value_type = LongType ()),
331+ required = False ,
332+ doc = "Map of column id to total count, including null and NaN" ,
333+ ),
334+ NestedField (
335+ field_id = 110 ,
336+ name = "null_value_counts" ,
337+ field_type = MapType (key_id = 121 , key_type = IntegerType (), value_id = 122 , value_type = LongType ()),
338+ required = False ,
339+ doc = "Map of column id to null value count" ,
340+ ),
341+ NestedField (
342+ field_id = 137 ,
343+ name = "nan_value_counts" ,
344+ field_type = MapType (key_id = 138 , key_type = IntegerType (), value_id = 139 , value_type = LongType ()),
345+ required = False ,
346+ doc = "Map of column id to number of NaN values in the column" ,
347+ ),
348+ NestedField (
349+ field_id = 125 ,
350+ name = "lower_bounds" ,
351+ field_type = MapType (key_id = 126 , key_type = IntegerType (), value_id = 127 , value_type = BinaryType ()),
352+ required = False ,
353+ doc = "Map of column id to lower bound" ,
354+ ),
355+ NestedField (
356+ field_id = 128 ,
357+ name = "upper_bounds" ,
358+ field_type = MapType (key_id = 129 , key_type = IntegerType (), value_id = 130 , value_type = BinaryType ()),
359+ required = False ,
360+ doc = "Map of column id to upper bound" ,
361+ ),
362+ NestedField (
363+ field_id = 131 , name = "key_metadata" , field_type = BinaryType (), required = False , doc = "Encryption key metadata blob"
364+ ),
365+ NestedField (
366+ field_id = 132 ,
367+ name = "split_offsets" ,
368+ field_type = ListType (element_id = 133 , element_type = LongType (), element_required = True ),
369+ required = False ,
370+ doc = "Splittable offsets" ,
371+ ),
372+ NestedField (
373+ field_id = 135 ,
374+ name = "equality_ids" ,
375+ field_type = ListType (element_id = 136 , element_type = LongType (), element_required = True ),
376+ required = False ,
377+ doc = "Field ids used to determine row equality in equality delete files." ,
378+ ),
379+ NestedField (
380+ field_id = 140 ,
381+ name = "sort_order_id" ,
382+ field_type = IntegerType (),
383+ required = False ,
384+ doc = "ID representing sort order for this file" ,
385+ ),
386+ NestedField (
387+ field_id = 142 ,
388+ name = "first_row_id" ,
389+ field_type = LongType (),
390+ required = False ,
391+ doc = "The _row_id for the first row in the data file." ,
392+ ),
393+ NestedField (
394+ field_id = 143 ,
395+ name = "referenced_data_file" ,
396+ field_type = StringType (),
397+ required = False ,
398+ doc = "Fully qualified location (URI with FS scheme) of a data file that all deletes reference" ,
399+ ),
400+ NestedField (
401+ field_id = 144 ,
402+ name = "content_offset" ,
403+ field_type = LongType (),
404+ required = False ,
405+ doc = "The offset in the file where the content starts." ,
406+ ),
407+ NestedField (
408+ field_id = 145 ,
409+ name = "content_size_in_bytes" ,
410+ field_type = LongType (),
411+ required = False ,
412+ doc = "The length of a referenced content stored in the file; required if content_offset is present" ,
413+ ),
414+ ),
292415}
293416
294417
@@ -434,6 +557,13 @@ def __eq__(self, other: Any) -> bool:
434557 NestedField (4 , "file_sequence_number" , LongType (), required = False ),
435558 NestedField (2 , "data_file" , DATA_FILE_TYPE [2 ], required = True ),
436559 ),
560+ 3 : Schema (
561+ NestedField (0 , "status" , IntegerType (), required = True ),
562+ NestedField (1 , "snapshot_id" , LongType (), required = False ),
563+ NestedField (3 , "sequence_number" , LongType (), required = False ),
564+ NestedField (4 , "file_sequence_number" , LongType (), required = False ),
565+ NestedField (2 , "data_file" , DATA_FILE_TYPE [3 ], required = True ),
566+ ),
437567}
438568
# Struct form (Schema.as_struct()) of every manifest-entry schema, keyed the
# same way as MANIFEST_ENTRY_SCHEMAS.
MANIFEST_ENTRY_SCHEMAS_STRUCT = {
    version: entry_schema.as_struct()
    for version, entry_schema in MANIFEST_ENTRY_SCHEMAS.items()
}
@@ -604,6 +734,24 @@ def construct_partition_summaries(spec: PartitionSpec, schema: Schema, partition
604734 NestedField (507 , "partitions" , ListType (508 , PARTITION_FIELD_SUMMARY_TYPE , element_required = True ), required = False ),
605735 NestedField (519 , "key_metadata" , BinaryType (), required = False ),
606736 ),
737+ 3 : Schema (
738+ NestedField (500 , "manifest_path" , StringType (), required = True , doc = "Location URI with FS scheme" ),
739+ NestedField (501 , "manifest_length" , LongType (), required = True ),
740+ NestedField (502 , "partition_spec_id" , IntegerType (), required = True ),
741+ NestedField (517 , "content" , IntegerType (), required = True , initial_default = ManifestContent .DATA ),
742+ NestedField (515 , "sequence_number" , LongType (), required = True , initial_default = 0 ),
743+ NestedField (516 , "min_sequence_number" , LongType (), required = True , initial_default = 0 ),
744+ NestedField (503 , "added_snapshot_id" , LongType (), required = True ),
745+ NestedField (504 , "added_files_count" , IntegerType (), required = True ),
746+ NestedField (505 , "existing_files_count" , IntegerType (), required = True ),
747+ NestedField (506 , "deleted_files_count" , IntegerType (), required = True ),
748+ NestedField (512 , "added_rows_count" , LongType (), required = True ),
749+ NestedField (513 , "existing_rows_count" , LongType (), required = True ),
750+ NestedField (514 , "deleted_rows_count" , LongType (), required = True ),
751+ NestedField (507 , "partitions" , ListType (508 , PARTITION_FIELD_SUMMARY_TYPE , element_required = True ), required = False ),
752+ NestedField (519 , "key_metadata" , BinaryType (), required = False ),
753+ NestedField (520 , "first_row_id" , LongType (), required = False ),
754+ ),
607755}
608756
# Struct form (Schema.as_struct()) of every manifest-list file schema, keyed
# the same way as MANIFEST_LIST_FILE_SCHEMAS.
MANIFEST_LIST_FILE_STRUCTS = {
    version: list_schema.as_struct()
    for version, list_schema in MANIFEST_LIST_FILE_SCHEMAS.items()
}
0 commit comments