 import csv
-from dictdiffer import diff
 import json
-import hashlib
-
+import xxhash
+from dictdiffer import diff
 
 def load_csv(fp, key=None, dialect=None):
+    """
+    Load a CSV file into a dictionary keyed by the given column or hash.
+    """
     if dialect is None and fp.seekable():
         # Peek at first 1MB to sniff the delimiter and other dialect details
         peek = fp.read(1024 ** 2)
@@ -14,35 +16,35 @@ def load_csv(fp, key=None, dialect=None):
         except csv.Error:
             # Oh well, we tried. Fallback to the default.
             pass
-    fp = csv.reader(fp, dialect=(dialect or "excel"))
-    headings = next(fp)
-    rows = [dict(zip(headings, line)) for line in fp]
+    reader = csv.reader(fp, dialect=(dialect or "excel"))
+    headings = next(reader)
+    rows = [dict(zip(headings, line)) for line in reader]
     if key:
         keyfn = lambda r: r[key]
     else:
-        keyfn = lambda r: hashlib.sha1(
-            json.dumps(r, sort_keys=True).encode("utf8")
-        ).hexdigest()
+        keyfn = lambda r: xxhash.xxh64(json.dumps(r, sort_keys=True).encode("utf8")).hexdigest()
     return {keyfn(r): r for r in rows}
 
-
 def load_json(fp, key=None):
+    """
+    Load a JSON array of objects into a dictionary keyed by the given column or hash.
+    """
     raw_list = json.load(fp)
-    assert isinstance(raw_list, list)
+    if not isinstance(raw_list, list):
+        raise ValueError("JSON file must contain a list of objects.")
     common_keys = set()
     for item in raw_list:
         common_keys.update(item.keys())
     if key:
         keyfn = lambda r: r[key]
     else:
-        keyfn = lambda r: hashlib.sha1(
-            json.dumps(r, sort_keys=True).encode("utf8")
-        ).hexdigest()
+        keyfn = lambda r: xxhash.xxh64(json.dumps(r, sort_keys=True).encode("utf8")).hexdigest()
     return {keyfn(r): _simplify_json_row(r, common_keys) for r in raw_list}
 
-
 def _simplify_json_row(r, common_keys):
-    # Convert list/dict values into JSON serialized strings
+    """
+    Ensure all rows have the same keys and serialize nested structures.
+    """
     for key, value in r.items():
         if isinstance(value, (dict, tuple, list)):
             r[key] = json.dumps(value)
@@ -51,8 +53,10 @@ def _simplify_json_row(r, common_keys):
             r[key] = None
     return r
 
-
 def compare(previous, current, show_unchanged=False, fields=None, ignorefields=None):
+    """
+    Compare two dictionaries of rows and return a diff summary.
+    """
     result = {
         "added": [],
         "removed": [],
@@ -63,14 +67,13 @@ def compare(previous, current, show_unchanged=False, fields=None, ignorefields=N
     # Have the columns changed?
     previous_columns = set(next(iter(previous.values())).keys())
     current_columns = set(next(iter(current.values())).keys())
-    ignore_columns = None
 
     # Apply fields/ignorefields filtering
     if fields:
         compare_columns = set(fields)
     elif ignorefields:
         compare_columns = previous_columns | current_columns
-        compare_columns = compare_columns - set(ignorefields)
+        compare_columns -= set(ignorefields)
     else:
         compare_columns = previous_columns | current_columns
 
@@ -122,20 +125,17 @@ def compare(previous, current, show_unchanged=False, fields=None, ignorefields=N
             result["changed"].append(changes)
     return result
 
-
 def streaming_compare_csv(prev_path, curr_path, key, compare_columns=None, ignorefields=None, encoding='utf-8', dialect='excel'):
     """
-    Compare two sorted CSV files by streaming, returning a diff dict.
+    Efficiently compare two sorted CSV files by streaming, returning a diff dict.
     """
-    import csv
     result = {
         "added": [],
         "removed": [],
         "changed": [],
         "columns_added": [],
         "columns_removed": [],
     }
-
     with open(prev_path, newline='', encoding=encoding) as f1, open(curr_path, newline='', encoding=encoding) as f2:
         reader1 = csv.DictReader(f1, dialect=dialect)
         reader2 = csv.DictReader(f2, dialect=dialect)
@@ -158,10 +158,8 @@ def streaming_compare_csv(prev_path, curr_path, key, compare_columns=None, ignor
 
         while prev_row or curr_row:
             if prev_row and curr_row:
-                if key not in prev_row or key not in curr_row:
-                    raise KeyError(f"Key column '{key}' missing in one of the rows.")
-                prev_key = str(prev_row[key])
-                curr_key = str(curr_row[key])
+                prev_key = prev_row[key]
+                curr_key = curr_row[key]
                 if prev_key == curr_key:
                     # Check for changes
                     changed_fields = {
@@ -192,46 +190,40 @@ def streaming_compare_csv(prev_path, curr_path, key, compare_columns=None, ignor
                 curr_row = next(reader2, None)
     return result
 
-
 def human_text(result, key=None, current=None, extras=None):
+    """
+    Render the diff result as a human-readable string.
+    """
     title = []
     summary = []
-    show_headers = sum(1 for key in result if result[key]) > 1
+    show_headers = sum(1 for k in result if result[k]) > 1
     if result["columns_added"]:
-        fragment = "{} {} added".format(
-            len(result["columns_added"]),
-            "column" if len(result["columns_added"]) == 1 else "columns",
-        )
+        fragment = f"{len(result['columns_added'])} {'column' if len(result['columns_added']) == 1 else 'columns'} added"
         title.append(fragment)
         summary.extend(
             [fragment, ""]
-            + ["  {}".format(c) for c in sorted(result["columns_added"])]
+            + [f"  {c}" for c in sorted(result["columns_added"])]
             + [""]
         )
     if result["columns_removed"]:
-        fragment = "{} {} removed".format(
-            len(result["columns_removed"]),
-            "column" if len(result["columns_removed"]) == 1 else "columns",
-        )
+        fragment = f"{len(result['columns_removed'])} {'column' if len(result['columns_removed']) == 1 else 'columns'} removed"
         title.append(fragment)
         summary.extend(
             [fragment, ""]
-            + ["  {}".format(c) for c in sorted(result["columns_removed"])]
+            + [f"  {c}" for c in sorted(result["columns_removed"])]
             + [""]
         )
     if result["changed"]:
-        fragment = "{} rows changed".format(len(result["changed"]))
+        fragment = f"{len(result['changed'])} rows changed"
        title.append(fragment)
         if show_headers:
             summary.append(fragment + "\n")
         change_blocks = []
         for details in result["changed"]:
             block = []
-            block.append("  {}: {}".format(key, details["key"]))
+            block.append(f"  {key}: {details['key']}")
             for field, (prev_value, current_value) in details["changes"].items():
-                block.append(
-                    '    {}: "{}" => "{}"'.format(field, prev_value, current_value)
-                )
+                block.append(f'    {field}: "{prev_value}" => "{current_value}"')
             if extras:
                 current_item = current[details["key"]]
                 block.append(human_extras(current_item, extras))
@@ -241,12 +233,12 @@ def human_text(result, key=None, current=None, extras=None):
                 block = []
                 block.append("    Unchanged:")
                 for field, value in details["unchanged"].items():
-                    block.append('      {}: "{}"'.format(field, value))
+                    block.append(f'      {field}: "{value}"')
                 block.append("")
                 change_blocks.append("\n".join(block))
         summary.append("\n".join(change_blocks))
     if result["added"]:
-        fragment = "{} rows added".format(len(result["added"]))
+        fragment = f"{len(result['added'])} rows added"
         title.append(fragment)
         if show_headers:
             summary.append(fragment + "\n")
@@ -259,7 +251,7 @@ def human_text(result, key=None, current=None, extras=None):
         summary.append("\n\n".join(rows))
         summary.append("")
     if result["removed"]:
-        fragment = "{} rows removed".format(len(result["removed"]))
+        fragment = f"{len(result['removed'])} rows removed"
         title.append(fragment)
         if show_headers:
             summary.append(fragment + "\n")
@@ -273,17 +265,17 @@ def human_text(result, key=None, current=None, extras=None):
         summary.append("")
     return (", ".join(title) + "\n\n" + ("\n".join(summary))).strip()
 
-
 def human_row(row, prefix=""):
-    bits = []
-    for key, value in row.items():
-        bits.append("{}{}: {}".format(prefix, key, value))
-    return "\n".join(bits)
-
+    """
+    Render a row as a human-readable string.
+    """
+    return "\n".join(f"{prefix}{key}: {value}" for key, value in row.items())
 
 def human_extras(row, extras):
-    bits = []
-    bits.append("    extras:")
+    """
+    Render extra fields for a row.
+    """
+    bits = ["    extras:"]
     for key, fmt in extras:
-        bits.append("      {}: {}".format(key, fmt.format(**row)))
+        bits.append(f"      {key}: {fmt.format(**row)}")
     return "\n".join(bits)
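One note on the keying change above: swapping `hashlib.sha1` for `xxhash.xxh64` trades a cryptographic digest for a much faster non-cryptographic one, which is reasonable here because the digest only serves as a stable row identifier. A standalone sketch of the new keying behaviour (the sample row is made up, not taken from this commit):

```python
import json

import xxhash

# Same derivation as the updated keyfn: canonical JSON of the row, then a 64-bit xxHash digest.
row = {"id": "1", "name": "Cleo", "age": "4"}
key = xxhash.xxh64(json.dumps(row, sort_keys=True).encode("utf8")).hexdigest()
print(key)  # 16 hex characters; identical rows always yield the same key
```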
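To exercise the in-memory path end to end, a minimal sketch follows. It assumes the module is importable as `csv_diff` and that `one.csv` and `two.csv` are small files with an `id` column; none of those names come from this commit.

```python
from csv_diff import load_csv, compare, human_text  # assumed module name

with open("one.csv", newline="") as prev_fp, open("two.csv", newline="") as curr_fp:
    previous = load_csv(prev_fp, key="id")  # rows keyed by the "id" column
    current = load_csv(curr_fp, key="id")

# compare() builds the added/removed/changed/columns_* structure;
# human_text() renders it as the plain-text summary assembled from title and summary.
result = compare(previous, current, show_unchanged=False)
print(human_text(result, key="id", current=current))
```

The streaming variant, `streaming_compare_csv`, skips `load_csv` entirely: it takes the two file paths plus the key column and, per its docstring, expects both inputs to be sorted by that key.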