Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit 67171cb

Browse files
authored
Merge pull request #243 from datafold/next_master
Merge of #238 and #235
2 parents cb24ac9 + ba5eabf commit 67171cb

File tree

9 files changed

+273
-91
lines changed

9 files changed

+273
-91
lines changed

data_diff/__main__.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,6 @@ def _main(
197197
bisection_threshold=bisection_threshold,
198198
threaded=threaded,
199199
max_threadpool_size=threads and threads * 2,
200-
debug=debug,
201200
)
202201

203202
if database1 is None or database2 is None:

data_diff/databases/base.py

Lines changed: 17 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,10 @@
1616
Float,
1717
ColType_UUID,
1818
Native_UUID,
19-
String_Alphanum,
2019
String_UUID,
20+
String_Alphanum,
21+
String_FixedAlphanum,
22+
String_VaryingAlphanum,
2123
TemporalType,
2224
UnknownColType,
2325
Text,
@@ -79,6 +81,7 @@ class Database(AbstractDatabase):
7981

8082
TYPE_CLASSES: Dict[str, type] = {}
8183
default_schema: str = None
84+
SUPPORTS_ALPHANUMS = True
8285

8386
@property
8487
def name(self):
@@ -229,23 +232,22 @@ def _refine_coltypes(self, table_path: DbPath, col_dict: Dict[str, ColType], whe
229232
col_dict[col_name] = String_UUID()
230233
continue
231234

232-
alphanum_samples = [s for s in samples if s and String_Alphanum.test_value(s)]
233-
if alphanum_samples:
234-
if len(alphanum_samples) != len(samples):
235-
logger.warning(
236-
f"Mixed Alphanum/Non-Alphanum values detected in column {'.'.join(table_path)}.{col_name}, disabling Alphanum support."
237-
)
238-
else:
239-
assert col_name in col_dict
240-
lens = set(map(len, alphanum_samples))
241-
if len(lens) > 1:
235+
if self.SUPPORTS_ALPHANUMS: # Anything but MySQL (so far)
236+
alphanum_samples = [s for s in samples if String_Alphanum.test_value(s)]
237+
if alphanum_samples:
238+
if len(alphanum_samples) != len(samples):
242239
logger.warning(
243-
f"Mixed Alphanum lengths detected in column {'.'.join(table_path)}.{col_name}, disabling Alphanum support."
240+
f"Mixed Alphanum/Non-Alphanum values detected in column {'.'.join(table_path)}.{col_name}. It cannot be used as a key."
244241
)
245242
else:
246-
(length,) = lens
247-
col_dict[col_name] = String_Alphanum(length=length)
248-
continue
243+
assert col_name in col_dict
244+
lens = set(map(len, alphanum_samples))
245+
if len(lens) > 1:
246+
col_dict[col_name] = String_VaryingAlphanum()
247+
else:
248+
(length,) = lens
249+
col_dict[col_name] = String_FixedAlphanum(length=length)
250+
continue
249251

250252
# @lru_cache()
251253
# def get_table_schema(self, path: DbPath) -> Dict[str, ColType]:

data_diff/databases/database_types.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -92,10 +92,7 @@ class String_UUID(StringType, ColType_UUID):
9292
pass
9393

9494

95-
@dataclass
9695
class String_Alphanum(StringType, ColType_Alphanum):
97-
length: int
98-
9996
@staticmethod
10097
def test_value(value: str) -> bool:
10198
try:
@@ -104,6 +101,18 @@ def test_value(value: str) -> bool:
104101
except ValueError:
105102
return False
106103

104+
def make_value(self, value):
105+
return self.python_type(value)
106+
107+
108+
class String_VaryingAlphanum(String_Alphanum):
109+
pass
110+
111+
112+
@dataclass
113+
class String_FixedAlphanum(String_Alphanum):
114+
length: int
115+
107116
def make_value(self, value):
108117
if len(value) != self.length:
109118
raise ValueError(f"Expected alphanumeric value of length {self.length}, but got '{value}'.")

data_diff/databases/mysql.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ class MySQL(ThreadedDatabase):
2828
"binary": Text,
2929
}
3030
ROUNDS_ON_PREC_LOSS = True
31+
SUPPORTS_ALPHANUMS = False
3132

3233
def __init__(self, *, thread_count, **kw):
3334
self._args = kw

data_diff/diff_tables.py

Lines changed: 22 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from runtype import dataclass
1414

1515
from .utils import safezip, run_as_daemon
16+
from .thread_utils import ThreadedYielder
1617
from .databases.database_types import IKey, NumericType, PrecisionType, StringType
1718
from .table_segment import TableSegment
1819
from .tracking import create_end_event_json, create_start_event_json, send_event_json, is_tracking_enabled
@@ -121,22 +122,25 @@ def diff_tables(self, table1: TableSegment, table2: TableSegment) -> DiffResult:
121122
logger.info(
122123
f"Diffing tables | segments: {self.bisection_factor}, bisection threshold: {self.bisection_threshold}. "
123124
f"key-range: {table1.min_key}..{table2.max_key}, "
124-
f"size: {table1.approximate_size()}"
125+
f"size: table1 <= {table1.approximate_size()}, table2 <= {table2.approximate_size()}"
125126
)
126127

128+
ti = ThreadedYielder(self.max_threadpool_size)
127129
# Bisect (split) the table into segments, and diff them recursively.
128-
yield from self._bisect_and_diff_tables(table1, table2)
130+
ti.submit(self._bisect_and_diff_tables, ti, table1, table2)
129131

130132
# Now we check for the second min-max, to diff the portions we "missed".
131133
min_key2, max_key2 = self._parse_key_range_result(key_type, next(key_ranges))
132134

133135
if min_key2 < min_key1:
134136
pre_tables = [t.new(min_key=min_key2, max_key=min_key1) for t in (table1, table2)]
135-
yield from self._bisect_and_diff_tables(*pre_tables)
137+
ti.submit(self._bisect_and_diff_tables, ti, *pre_tables)
136138

137139
if max_key2 > max_key1:
138140
post_tables = [t.new(min_key=max_key1, max_key=max_key2) for t in (table1, table2)]
139-
yield from self._bisect_and_diff_tables(*post_tables)
141+
ti.submit(self._bisect_and_diff_tables, ti, *post_tables)
142+
143+
yield from ti
140144

141145
except BaseException as e: # Catch KeyboardInterrupt too
142146
error = e
@@ -218,12 +222,12 @@ def _validate_and_adjust_columns(self, table1, table2):
218222
"If encoding/formatting differs between databases, it may result in false positives."
219223
)
220224

221-
def _bisect_and_diff_tables(self, table1, table2, level=0, max_rows=None):
225+
def _bisect_and_diff_tables(self, ti: ThreadedYielder, table1: TableSegment, table2: TableSegment, level=0, max_rows=None):
222226
assert table1.is_bounded and table2.is_bounded
223227

224228
if max_rows is None:
225229
# We can be sure that row_count <= max_rows
226-
max_rows = table1.max_key - table1.min_key
230+
max_rows = max(table1.approximate_size(), table2.approximate_size())
227231

228232
# If count is below the threshold, just download and compare the columns locally
229233
# This saves time, as bisection speed is limited by ping and query performance.
@@ -242,8 +246,7 @@ def _bisect_and_diff_tables(self, table1, table2, level=0, max_rows=None):
242246

243247
logger.info(". " * level + f"Diff found {len(diff)} different rows.")
244248
self.stats["rows_downloaded"] = self.stats.get("rows_downloaded", 0) + max(len(rows1), len(rows2))
245-
yield from diff
246-
return
249+
return diff
247250

248251
# Choose evenly spaced checkpoints (according to min_key and max_key)
249252
checkpoints = table1.choose_checkpoints(self.bisection_factor - 1)
@@ -253,38 +256,31 @@ def _bisect_and_diff_tables(self, table1, table2, level=0, max_rows=None):
253256
segmented2 = table2.segment_by_checkpoints(checkpoints)
254257

255258
# Recursively compare each pair of corresponding segments between table1 and table2
256-
diff_iters = [
257-
self._diff_tables(t1, t2, level + 1, i + 1, len(segmented1))
258-
for i, (t1, t2) in enumerate(safezip(segmented1, segmented2))
259-
]
259+
for i, (t1, t2) in enumerate(safezip(segmented1, segmented2)):
260+
ti.submit(self._diff_tables, ti, t1, t2, max_rows, level + 1, i + 1, len(segmented1), priority=level)
260261

261-
for res in self._thread_map(list, diff_iters):
262-
yield from res
263-
264-
def _diff_tables(self, table1, table2, level=0, segment_index=None, segment_count=None):
262+
def _diff_tables(self, ti: ThreadedYielder, table1: TableSegment, table2: TableSegment, max_rows: int, level=0, segment_index=None, segment_count=None):
265263
logger.info(
266264
". " * level + f"Diffing segment {segment_index}/{segment_count}, "
267265
f"key-range: {table1.min_key}..{table2.max_key}, "
268-
f"size: {table2.max_key-table1.min_key}"
266+
f"size <= {max_rows}"
269267
)
270268

271269
# When benchmarking, we want the ability to skip checksumming. This
272270
# allows us to download all rows for comparison in performance. By
273271
# default, data-diff will checksum the section first (when it's below
274272
# the threshold) and _then_ download it.
275273
if BENCHMARK:
276-
max_rows_from_keys = max(table1.max_key - table1.min_key, table2.max_key - table2.min_key)
277-
if max_rows_from_keys < self.bisection_threshold:
278-
yield from self._bisect_and_diff_tables(table1, table2, level=level, max_rows=max_rows_from_keys)
279-
return
274+
if max_rows < self.bisection_threshold:
275+
return self._bisect_and_diff_tables(ti, table1, table2, level=level, max_rows=max_rows)
280276

281277
(count1, checksum1), (count2, checksum2) = self._threaded_call("count_and_checksum", [table1, table2])
282278

283279
if count1 == 0 and count2 == 0:
284-
logger.warning(
285-
"Uneven distribution of keys detected. (big gaps in the key column). "
286-
"For better performance, we recommend to increase the bisection-threshold."
287-
)
280+
# logger.warning(
281+
# f"Uneven distribution of keys detected in segment {table1.min_key}..{table2.max_key}. (big gaps in the key column). "
282+
# "For better performance, we recommend to increase the bisection-threshold."
283+
# )
288284
assert checksum1 is None and checksum2 is None
289285
return
290286

@@ -293,7 +289,7 @@ def _diff_tables(self, table1, table2, level=0, segment_index=None, segment_coun
293289
self.stats["table2_count"] = self.stats.get("table2_count", 0) + count2
294290

295291
if checksum1 != checksum2:
296-
yield from self._bisect_and_diff_tables(table1, table2, level=level, max_rows=max(count1, count2))
292+
return self._bisect_and_diff_tables(ti, table1, table2, level=level, max_rows=max(count1, count2))
297293

298294
def _thread_map(self, func, iterable):
299295
if not self.threaded:

data_diff/table_segment.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
from runtype import dataclass
66

7-
from .utils import ArithString, split_space
7+
from .utils import ArithString, split_space, ArithAlphanumeric
88

99
from .databases.base import Database
1010
from .databases.database_types import DbPath, DbKey, DbTime, Native_UUID, Schema, create_schema
@@ -149,8 +149,9 @@ def choose_checkpoints(self, count: int) -> List[DbKey]:
149149
assert self.is_bounded
150150
if isinstance(self.min_key, ArithString):
151151
assert type(self.min_key) is type(self.max_key)
152-
checkpoints = split_space(self.min_key.int, self.max_key.int, count)
153-
return [self.min_key.new(int=i) for i in checkpoints]
152+
checkpoints = self.min_key.range(self.max_key, count)
153+
assert all(self.min_key <= x <= self.max_key for x in checkpoints)
154+
return checkpoints
154155

155156
return split_space(self.min_key, self.max_key, count)
156157

data_diff/thread_utils.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
import itertools
2+
from concurrent.futures.thread import _WorkItem
3+
from queue import PriorityQueue
4+
from collections import deque
5+
from collections.abc import Iterable
6+
from concurrent.futures import ThreadPoolExecutor
7+
from time import sleep
8+
from typing import Callable, Iterator, Optional
9+
10+
11+
class AutoPriorityQueue(PriorityQueue):
    """A ``PriorityQueue`` that derives each item's priority from ``_WorkItem.kwargs``.

    ``put()`` pops the ``priority`` keyword out of the work item's kwargs (so the
    wrapped callable never receives it) and enqueues higher-priority items first.
    A monotonically increasing counter breaks ties, which both avoids comparing
    ``_WorkItem`` objects directly (they are not orderable) and makes items of
    equal priority come out FIFO.
    """

    # Class-level counter shared by all instances; used only for tie-breaking.
    _counter = itertools.count().__next__

    def put(self, item: Optional[_WorkItem], block=True, timeout=None):
        # ``None`` is the executor's shutdown sentinel; give it neutral priority.
        # Default to 0 when no explicit priority was supplied -- the original
        # ``pop("priority")`` raised KeyError on a plain ``executor.submit(fn)``.
        priority = item.kwargs.pop("priority", 0) if item is not None else 0
        # PriorityQueue pops the smallest tuple first, so negate the priority
        # to make higher values run earlier.
        super().put((-priority, self._counter(), item), block, timeout)

    def get(self, block=True, timeout=None) -> Optional[_WorkItem]:
        _priority, _count, work_item = super().get(block, timeout)
        return work_item
27+
28+
29+
class PriorityThreadPoolExecutor(ThreadPoolExecutor):
    """A ``ThreadPoolExecutor`` whose work queue is an ``AutoPriorityQueue``.

    Tasks submitted with a ``priority`` keyword run higher-priority first
    instead of strictly FIFO.

    XXX WARNING: Relies on the private ``_work_queue`` attribute and therefore
    might break in future versions of Python.
    """

    def __init__(self, *args, **kwargs):
        # Forward keyword arguments as well (max_workers, thread_name_prefix,
        # initializer, initargs) -- the original only forwarded positionals,
        # so keyword-style construction raised a TypeError.
        super().__init__(*args, **kwargs)

        # Swap the default FIFO queue for a priority queue. Safe here because
        # worker threads are only spawned lazily, on the first submit().
        self._work_queue = AutoPriorityQueue()
39+
40+
41+
class ThreadedYielder(Iterable):
    """Yields results from multiple threads into a single iterator, ordered by priority.

    To add a source iterator, call ``submit()`` with a function that returns an iterator.
    Priority for the iterator can be provided via the keyword argument 'priority'. (higher runs first)
    """

    def __init__(self, max_workers: Optional[int] = None):
        # Executor that runs higher-priority submissions first.
        self._pool = PriorityThreadPoolExecutor(max_workers)
        # Futures of every submitted task, in submission order.
        self._futures = deque()
        # Results produced by workers, drained by __iter__. deque append/popleft
        # are used cross-thread here without extra locking -- NOTE(review):
        # presumably relying on their atomicity in CPython; confirm if porting.
        self._yield = deque()
        # First/latest exception raised by a worker; re-raised in __iter__.
        self._exception = None

    def _worker(self, fn, *args, **kwargs):
        """Run ``fn`` on a pool thread and buffer its results.

        If ``fn`` returns a non-None iterable, it is consumed here (on the
        worker thread) into the shared ``_yield`` deque. Any exception is
        captured so the consuming thread can re-raise it from ``__iter__``.
        """
        try:
            res = fn(*args, **kwargs)
            if res is not None:
                self._yield += res
        except Exception as e:
            # If several workers fail, a later failure overwrites an earlier one.
            self._exception = e

    def submit(self, fn: Callable, *args, priority: int = 0, **kwargs):
        """Schedule ``fn(*args, **kwargs)`` on the pool.

        ``priority`` is consumed by the pool's queue (higher runs first) and is
        not passed on to ``fn``. Workers may themselves call ``submit()`` to
        enqueue follow-up tasks (recursive fan-out).
        """
        self._futures.append(self._pool.submit(self._worker, fn, *args, priority=priority, **kwargs))

    def __iter__(self) -> Iterator:
        """Yield buffered results until all submitted tasks have completed.

        Polling loop: re-raise any captured worker exception, drain whatever
        the workers have produced so far, then either retire the oldest
        finished future or sleep briefly and try again.
        """
        while True:
            if self._exception:
                raise self._exception

            while self._yield:
                yield self._yield.popleft()

            if not self._futures:
                # No more tasks
                return

            if self._futures[0].done():
                # Oldest task finished; anything it produced is already in
                # ``_yield`` and gets drained on the next loop iteration.
                self._futures.popleft()
            else:
                # Oldest task still running -- brief sleep avoids a hot spin.
                sleep(0.001)

0 commit comments

Comments
 (0)