@@ -131,13 +131,15 @@ class JoinDiffer(TableDiffer):
131131 materialize_to_table (DbPath, optional): Path of new table to write diff results to. Disabled if not provided.
132132 materialize_all_rows (bool): Materialize every row, not just those that are different. (default: False)
133133 table_write_limit (int): Maximum number of rows to write when materializing, per thread.
134+ skip_null_keys (bool): Skip rows with NULL primary keys instead of raising an error; a warning is logged if any are found. (default: False)
134135 """
135136
136137 validate_unique_key : bool = True
137138 sample_exclusive_rows : bool = False
138139 materialize_to_table : DbPath = None
139140 materialize_all_rows : bool = False
140141 table_write_limit : int = TABLE_WRITE_LIMIT
142+ skip_null_keys : bool = False
141143
142144 stats : dict = {}
143145
@@ -209,7 +211,11 @@ def _diff_segments(
209211 if is_xa and is_xb :
210212 # Can't both be exclusive, meaning a pk is NULL
211213 # This can happen if the explicit null test didn't finish running yet
212- raise ValueError ("NULL values in one or more primary keys" )
214+ if self .skip_null_keys :
215+ # the warning is logged by the explicit null test
216+ continue
217+ else :
218+ raise ValueError ("NULL values in one or more primary keys" )
213219 # _is_diff, a_row, b_row = _slice_tuple(x, len(is_diff_cols), len(a_cols), len(b_cols))
214220 _is_diff , ab_row = _slice_tuple (x , len (is_diff_cols ), len (a_cols ) + len (b_cols ))
215221 a_row , b_row = ab_row [::2 ], ab_row [1 ::2 ]
@@ -252,7 +258,12 @@ def _test_null_keys(self, table1, table2):
252258 q = t .select (* this [key_columns ]).where (or_ (this [k ] == None for k in key_columns ))
253259 nulls = ts .database .query (q , list )
254260 if nulls :
255- raise ValueError (f"NULL values in one or more primary keys of { ts .table_path } " )
261+ if self .skip_null_keys :
262+ logger .warning (
263+ f"NULL values in one or more primary keys of { ts .table_path } . Skipping rows with NULL keys."
264+ )
265+ else :
266+ raise ValueError (f"NULL values in one or more primary keys of { ts .table_path } " )
256267
257268 def _collect_stats (self , i , table_seg : TableSegment , info_tree : InfoTree ):
258269 logger .debug (f"Collecting stats for table #{ i } " )
0 commit comments