Rewrote threading to use a thread-pool + priority queue.

erezsh · erezsh · commit af890da13ad5 · 2022-09-20T11:31:31.000+03:00
- KeyboardInterrupt is now handled correctly.
- Resulting iterator is now better behaved (--limit works a lot better)
diff --git a/data_diff/__main__.py b/data_diff/__main__.py
@@ -197,7 +197,6 @@ def _main(
         bisection_threshold=bisection_threshold,
         threaded=threaded,
         max_threadpool_size=threads and threads * 2,
-        debug=debug,
     )
 
     if database1 is None or database2 is None:
diff --git a/data_diff/diff_tables.py b/data_diff/diff_tables.py
@@ -13,6 +13,7 @@
 from runtype import dataclass
 
 from .utils import safezip, run_as_daemon
+from .thread_utils import ThreadedYielder
 from .databases.database_types import IKey, NumericType, PrecisionType, StringType
 from .table_segment import TableSegment
 from .tracking import create_end_event_json, create_start_event_json, send_event_json, is_tracking_enabled
@@ -124,19 +125,22 @@ def diff_tables(self, table1: TableSegment, table2: TableSegment) -> DiffResult:
                 f"size: {table1.approximate_size()}"
             )
 
+            ti = ThreadedYielder(self.max_threadpool_size)
             # Bisect (split) the table into segments, and diff them recursively.
-            yield from self._bisect_and_diff_tables(table1, table2)
+            ti.submit(self._bisect_and_diff_tables, ti, table1, table2)
 
             # Now we check for the second min-max, to diff the portions we "missed".
             min_key2, max_key2 = self._parse_key_range_result(key_type, next(key_ranges))
 
             if min_key2 < min_key1:
                 pre_tables = [t.new(min_key=min_key2, max_key=min_key1) for t in (table1, table2)]
-                yield from self._bisect_and_diff_tables(*pre_tables)
+                ti.submit(self._bisect_and_diff_tables, ti, *pre_tables)
 
             if max_key2 > max_key1:
                 post_tables = [t.new(min_key=max_key1, max_key=max_key2) for t in (table1, table2)]
-                yield from self._bisect_and_diff_tables(*post_tables)
+                ti.submit(self._bisect_and_diff_tables, ti, *post_tables)
+
+            yield from ti
 
         except BaseException as e:  # Catch KeyboardInterrupt too
             error = e
@@ -218,7 +222,7 @@ def _validate_and_adjust_columns(self, table1, table2):
                         "If encoding/formatting differs between databases, it may result in false positives."
                     )
 
-    def _bisect_and_diff_tables(self, table1, table2, level=0, max_rows=None):
+    def _bisect_and_diff_tables(self, ti: ThreadedYielder, table1, table2, level=0, max_rows=None):
         assert table1.is_bounded and table2.is_bounded
 
         if max_rows is None:
@@ -242,8 +246,7 @@ def _bisect_and_diff_tables(self, table1, table2, level=0, max_rows=None):
 
             logger.info(". " * level + f"Diff found {len(diff)} different rows.")
             self.stats["rows_downloaded"] = self.stats.get("rows_downloaded", 0) + max(len(rows1), len(rows2))
-            yield from diff
-            return
+            return diff
 
         # Choose evenly spaced checkpoints (according to min_key and max_key)
         checkpoints = table1.choose_checkpoints(self.bisection_factor - 1)
@@ -253,15 +256,10 @@ def _bisect_and_diff_tables(self, table1, table2, level=0, max_rows=None):
         segmented2 = table2.segment_by_checkpoints(checkpoints)
 
         # Recursively compare each pair of corresponding segments between table1 and table2
-        diff_iters = [
-            self._diff_tables(t1, t2, level + 1, i + 1, len(segmented1))
-            for i, (t1, t2) in enumerate(safezip(segmented1, segmented2))
-        ]
-
-        for res in self._thread_map(list, diff_iters):
-            yield from res
+        for i, (t1, t2) in enumerate(safezip(segmented1, segmented2)):
+            ti.submit(self._diff_tables, ti, t1, t2, level + 1, i + 1, len(segmented1), priority=level)
 
-    def _diff_tables(self, table1, table2, level=0, segment_index=None, segment_count=None):
+    def _diff_tables(self, ti: ThreadedYielder, table1, table2, level=0, segment_index=None, segment_count=None):
         logger.info(
             ". " * level + f"Diffing segment {segment_index}/{segment_count}, "
             f"key-range: {table1.min_key}..{table2.max_key}, "
@@ -275,8 +273,7 @@ def _diff_tables(self, table1, table2, level=0, segment_index=None, segment_coun
         if BENCHMARK:
             max_rows_from_keys = max(table1.max_key - table1.min_key, table2.max_key - table2.min_key)
             if max_rows_from_keys < self.bisection_threshold:
-                yield from self._bisect_and_diff_tables(table1, table2, level=level, max_rows=max_rows_from_keys)
-                return
+                return self._bisect_and_diff_tables(ti, table1, table2, level=level, max_rows=max_rows_from_keys)
 
         (count1, checksum1), (count2, checksum2) = self._threaded_call("count_and_checksum", [table1, table2])
 
@@ -293,7 +290,7 @@ def _diff_tables(self, table1, table2, level=0, segment_index=None, segment_coun
             self.stats["table2_count"] = self.stats.get("table2_count", 0) + count2
 
         if checksum1 != checksum2:
-            yield from self._bisect_and_diff_tables(table1, table2, level=level, max_rows=max(count1, count2))
+            return self._bisect_and_diff_tables(ti, table1, table2, level=level, max_rows=max(count1, count2))
 
     def _thread_map(self, func, iterable):
         if not self.threaded:
diff --git a/data_diff/thread_utils.py b/data_diff/thread_utils.py
@@ -0,0 +1,80 @@
+import itertools
+from concurrent.futures.thread import _WorkItem
+from queue import PriorityQueue
+from collections import deque
+from collections.abc import Iterable
+from concurrent.futures import ThreadPoolExecutor
+from time import sleep
+from typing import Callable, Iterator, Optional
+
+
+class AutoPriorityQueue(PriorityQueue):
+    """Overrides PriorityQueue to automatically get the priority from _WorkItem.kwargs
+
+    We also assign a unique id for each item, to avoid making comparisons on _WorkItem.
+    As a side effect, items with the same priority are returned FIFO.
+    """
+
+    _counter = itertools.count().__next__
+
+    def put(self, item: Optional[_WorkItem], block=True, timeout=None):
+        priority = item.kwargs.pop("priority") if item is not None else 0
+        super().put((-priority, self._counter(), item), block, timeout)
+
+    def get(self, block=True, timeout=None) -> Optional[_WorkItem]:
+        _p, _c, work_item = super().get(block, timeout)
+        return work_item
+
+
+class PriorityThreadPoolExecutor(ThreadPoolExecutor):
+    """Overrides ThreadPoolExecutor to use AutoPriorityQueue
+
+    XXX WARNING: Might break in future versions of Python
+    """
+
+    def __init__(self, *args):
+        super().__init__(*args)
+
+        self._work_queue = AutoPriorityQueue()
+
+
+class ThreadedYielder(Iterable):
+    """Yields results from multiple threads into a single iterator, ordered by priority.
+
+    To add a source iterator, call ``submit()`` with a function that returns an iterator.
+    Priority for the iterator can be provided via the keyword argument 'priority'. (higher runs first)
+    """
+
+    def __init__(self, max_workers: Optional[int] = None):
+        self._pool = PriorityThreadPoolExecutor(max_workers)
+        self._futures = deque()
+        self._yield = deque()
+        self._exception = None
+
+    def _worker(self, fn, *args, **kwargs):
+        try:
+            res = fn(*args, **kwargs)
+            if res is not None:
+                self._yield += res
+        except Exception as e:
+            self._exception = e
+
+    def submit(self, fn: Callable, *args, priority: int = 0, **kwargs):
+        self._futures.append(self._pool.submit(self._worker, fn, *args, priority=priority, **kwargs))
+
+    def __iter__(self) -> Iterator:
+        while True:
+            if self._exception:
+                raise self._exception
+
+            while self._yield:
+                yield self._yield.popleft()
+
+            if not self._futures:
+                # No more tasks
+                return
+
+            if self._futures[0].done():
+                self._futures.popleft()
+            else:
+                sleep(0.001)

Original file line number	Diff line number	Diff line change
`@@ -197,7 +197,6 @@ def _main(`
`197`	`197`	`bisection_threshold=bisection_threshold,`
`198`	`198`	`threaded=threaded,`
`199`	`199`	`max_threadpool_size=threads and threads * 2,`
`200`		`- debug=debug,`
`201`	`200`	`)`
`202`	`201`
`203`	`202`	`if database1 is None or database2 is None:`