Skip to content

Commit a80ac5b

Browse files
authored
Updated hashing algorithm, made streaming more efficient (hopefully), and refactored lots of code (hopefully nothing is broken)
1 parent 289bd52 commit a80ac5b

File tree

2 files changed

+131
-102
lines changed

2 files changed

+131
-102
lines changed

db_diff/__init__.py

Lines changed: 48 additions & 56 deletions
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,12 @@
11
import csv
2-
from dictdiffer import diff
32
import json
4-
import hashlib
5-
3+
import xxhash
4+
from dictdiffer import diff
65

76
def load_csv(fp, key=None, dialect=None):
7+
"""
8+
Load a CSV file into a dictionary keyed by the given column or hash.
9+
"""
810
if dialect is None and fp.seekable():
911
# Peek at first 1MB to sniff the delimiter and other dialect details
1012
peek = fp.read(1024**2)
@@ -14,35 +16,35 @@ def load_csv(fp, key=None, dialect=None):
1416
except csv.Error:
1517
# Oh well, we tried. Fallback to the default.
1618
pass
17-
fp = csv.reader(fp, dialect=(dialect or "excel"))
18-
headings = next(fp)
19-
rows = [dict(zip(headings, line)) for line in fp]
19+
reader = csv.reader(fp, dialect=(dialect or "excel"))
20+
headings = next(reader)
21+
rows = [dict(zip(headings, line)) for line in reader]
2022
if key:
2123
keyfn = lambda r: r[key]
2224
else:
23-
keyfn = lambda r: hashlib.sha1(
24-
json.dumps(r, sort_keys=True).encode("utf8")
25-
).hexdigest()
25+
keyfn = lambda r: xxhash.xxh64(json.dumps(r, sort_keys=True).encode("utf8")).hexdigest()
2626
return {keyfn(r): r for r in rows}
2727

28-
2928
def load_json(fp, key=None):
29+
"""
30+
Load a JSON array of objects into a dictionary keyed by the given column or hash.
31+
"""
3032
raw_list = json.load(fp)
31-
assert isinstance(raw_list, list)
33+
if not isinstance(raw_list, list):
34+
raise ValueError("JSON file must contain a list of objects.")
3235
common_keys = set()
3336
for item in raw_list:
3437
common_keys.update(item.keys())
3538
if key:
3639
keyfn = lambda r: r[key]
3740
else:
38-
keyfn = lambda r: hashlib.sha1(
39-
json.dumps(r, sort_keys=True).encode("utf8")
40-
).hexdigest()
41+
keyfn = lambda r: xxhash.xxh64(json.dumps(r, sort_keys=True).encode("utf8")).hexdigest()
4142
return {keyfn(r): _simplify_json_row(r, common_keys) for r in raw_list}
4243

43-
4444
def _simplify_json_row(r, common_keys):
45-
# Convert list/dict values into JSON serialized strings
45+
"""
46+
Ensure all rows have the same keys and serialize nested structures.
47+
"""
4648
for key, value in r.items():
4749
if isinstance(value, (dict, tuple, list)):
4850
r[key] = json.dumps(value)
@@ -51,8 +53,10 @@ def _simplify_json_row(r, common_keys):
5153
r[key] = None
5254
return r
5355

54-
5556
def compare(previous, current, show_unchanged=False, fields=None, ignorefields=None):
57+
"""
58+
Compare two dictionaries of rows and return a diff summary.
59+
"""
5660
result = {
5761
"added": [],
5862
"removed": [],
@@ -63,14 +67,13 @@ def compare(previous, current, show_unchanged=False, fields=None, ignorefields=N
6367
# Have the columns changed?
6468
previous_columns = set(next(iter(previous.values())).keys())
6569
current_columns = set(next(iter(current.values())).keys())
66-
ignore_columns = None
6770

6871
# Apply fields/ignorefields filtering
6972
if fields:
7073
compare_columns = set(fields)
7174
elif ignorefields:
7275
compare_columns = previous_columns | current_columns
73-
compare_columns = compare_columns - set(ignorefields)
76+
compare_columns -= set(ignorefields)
7477
else:
7578
compare_columns = previous_columns | current_columns
7679

@@ -122,20 +125,17 @@ def compare(previous, current, show_unchanged=False, fields=None, ignorefields=N
122125
result["changed"].append(changes)
123126
return result
124127

125-
126128
def streaming_compare_csv(prev_path, curr_path, key, compare_columns=None, ignorefields=None, encoding='utf-8', dialect='excel'):
127129
"""
128-
Compare two sorted CSV files by streaming, returning a diff dict.
130+
Efficiently compare two sorted CSV files by streaming, returning a diff dict.
129131
"""
130-
import csv
131132
result = {
132133
"added": [],
133134
"removed": [],
134135
"changed": [],
135136
"columns_added": [],
136137
"columns_removed": [],
137138
}
138-
139139
with open(prev_path, newline='', encoding=encoding) as f1, open(curr_path, newline='', encoding=encoding) as f2:
140140
reader1 = csv.DictReader(f1, dialect=dialect)
141141
reader2 = csv.DictReader(f2, dialect=dialect)
@@ -158,10 +158,8 @@ def streaming_compare_csv(prev_path, curr_path, key, compare_columns=None, ignor
158158

159159
while prev_row or curr_row:
160160
if prev_row and curr_row:
161-
if key not in prev_row or key not in curr_row:
162-
raise KeyError(f"Key column '{key}' missing in one of the rows.")
163-
prev_key = str(prev_row[key])
164-
curr_key = str(curr_row[key])
161+
prev_key = prev_row[key]
162+
curr_key = curr_row[key]
165163
if prev_key == curr_key:
166164
# Check for changes
167165
changed_fields = {
@@ -192,46 +190,40 @@ def streaming_compare_csv(prev_path, curr_path, key, compare_columns=None, ignor
192190
curr_row = next(reader2, None)
193191
return result
194192

195-
196193
def human_text(result, key=None, current=None, extras=None):
194+
"""
195+
Render the diff result as a human-readable string.
196+
"""
197197
title = []
198198
summary = []
199-
show_headers = sum(1 for key in result if result[key]) > 1
199+
show_headers = sum(1 for k in result if result[k]) > 1
200200
if result["columns_added"]:
201-
fragment = "{} {} added".format(
202-
len(result["columns_added"]),
203-
"column" if len(result["columns_added"]) == 1 else "columns",
204-
)
201+
fragment = f"{len(result['columns_added'])} {'column' if len(result['columns_added']) == 1 else 'columns'} added"
205202
title.append(fragment)
206203
summary.extend(
207204
[fragment, ""]
208-
+ [" {}".format(c) for c in sorted(result["columns_added"])]
205+
+ [f" {c}" for c in sorted(result["columns_added"])]
209206
+ [""]
210207
)
211208
if result["columns_removed"]:
212-
fragment = "{} {} removed".format(
213-
len(result["columns_removed"]),
214-
"column" if len(result["columns_removed"]) == 1 else "columns",
215-
)
209+
fragment = f"{len(result['columns_removed'])} {'column' if len(result['columns_removed']) == 1 else 'columns'} removed"
216210
title.append(fragment)
217211
summary.extend(
218212
[fragment, ""]
219-
+ [" {}".format(c) for c in sorted(result["columns_removed"])]
213+
+ [f" {c}" for c in sorted(result["columns_removed"])]
220214
+ [""]
221215
)
222216
if result["changed"]:
223-
fragment = "{} rows changed".format(len(result["changed"]))
217+
fragment = f"{len(result['changed'])} rows changed"
224218
title.append(fragment)
225219
if show_headers:
226220
summary.append(fragment + "\n")
227221
change_blocks = []
228222
for details in result["changed"]:
229223
block = []
230-
block.append(" {}: {}".format(key, details["key"]))
224+
block.append(f" {key}: {details['key']}")
231225
for field, (prev_value, current_value) in details["changes"].items():
232-
block.append(
233-
' {}: "{}" => "{}"'.format(field, prev_value, current_value)
234-
)
226+
block.append(f' {field}: "{prev_value}" => "{current_value}"')
235227
if extras:
236228
current_item = current[details["key"]]
237229
block.append(human_extras(current_item, extras))
@@ -241,12 +233,12 @@ def human_text(result, key=None, current=None, extras=None):
241233
block = []
242234
block.append(" Unchanged:")
243235
for field, value in details["unchanged"].items():
244-
block.append(' {}: "{}"'.format(field, value))
236+
block.append(f' {field}: "{value}"')
245237
block.append("")
246238
change_blocks.append("\n".join(block))
247239
summary.append("\n".join(change_blocks))
248240
if result["added"]:
249-
fragment = "{} rows added".format(len(result["added"]))
241+
fragment = f"{len(result['added'])} rows added"
250242
title.append(fragment)
251243
if show_headers:
252244
summary.append(fragment + "\n")
@@ -259,7 +251,7 @@ def human_text(result, key=None, current=None, extras=None):
259251
summary.append("\n\n".join(rows))
260252
summary.append("")
261253
if result["removed"]:
262-
fragment = "{} rows removed".format(len(result["removed"]))
254+
fragment = f"{len(result['removed'])} rows removed"
263255
title.append(fragment)
264256
if show_headers:
265257
summary.append(fragment + "\n")
@@ -273,17 +265,17 @@ def human_text(result, key=None, current=None, extras=None):
273265
summary.append("")
274266
return (", ".join(title) + "\n\n" + ("\n".join(summary))).strip()
275267

276-
277268
def human_row(row, prefix=""):
278-
bits = []
279-
for key, value in row.items():
280-
bits.append("{}{}: {}".format(prefix, key, value))
281-
return "\n".join(bits)
282-
269+
"""
270+
Render a row as a human-readable string.
271+
"""
272+
return "\n".join(f"{prefix}{key}: {value}" for key, value in row.items())
283273

284274
def human_extras(row, extras):
285-
bits = []
286-
bits.append(" extras:")
275+
"""
276+
Render extra fields for a row.
277+
"""
278+
bits = [" extras:"]
287279
for key, fmt in extras:
288-
bits.append(" {}: {}".format(key, fmt.format(**row)))
280+
bits.append(f" {key}: {fmt.format(**row)}")
289281
return "\n".join(bits)

0 commit comments

Comments (0)