
Commit 73a5b1d

sirupsen authored and erezsh committed

benchmark: add graphing and bench script

1 parent fe84d4f · commit 73a5b1d

File tree: 6 files changed, +49 −25 lines

.github/workflows/ci.yml

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ jobs:
       run: pip install poetry
 
     - name: Install package
-      run: "poetry install && poetry run pip install unittest-parallel"
+      run: "poetry install"
 
     - name: Run unit tests
       env:

README.md

Lines changed: 17 additions & 6 deletions
@@ -171,9 +171,9 @@ Users can also install several drivers at once:
 Usage: `data-diff DB1_URI TABLE1_NAME DB2_URI TABLE2_NAME [OPTIONS]`
 
 See the [example command](#example-command-and-output) and the [sample
-connection strings](#supported-databases).
+connection strings](#supported-databases).
 
-Note that for some databases, the arguments that you enter in the command line
+Note that for some databases, the arguments that you enter in the command line
 may be case-sensitive. This is the case for the Snowflake schema and table names.
 
 Options:
@@ -423,11 +423,15 @@ $ docker-compose up -d mysql postgres # run mysql and postgres dbs in background
 
 **3. Run Unit Tests**
 
+There are more than 1000 tests for all the different type and database
+combinations, so we recommend using a parallel runner.
+
 ```shell-session
-$ poetry run python3 -m unittest
+$ poetry run unittest-parallel -j 16       # run all tests
+$ poetry run python -m unittest -k <test>  # run individual test
 ```
 
-**4. Seed the Database(s)**
+**4. Seed the Database(s) (optional)**
 
 First, download the CSVs of seeding data:
 
@@ -451,7 +455,7 @@ $ poetry run preql -f dev/prepare_db.pql mssql://<uri>
 $ poetry run preql -f dev/prepare_db.pql bigquery:///<project>
 ```
 
-**5. Run **data-diff** against seeded database**
+**5. Run **data-diff** against seeded database (optional)**
 
 ```bash
 poetry run python3 -m data_diff postgresql://postgres:Password1@localhost/postgres rating postgresql://postgres:Password1@localhost/postgres rating_del1 --verbose
@@ -460,7 +464,14 @@ poetry run python3 -m data_diff postgresql://postgres:Password1@localhost/postgres rating postgresql://postgres:Password1@localhost/postgres rating_del1 --verbose
 **6. Run benchmarks (optional)**
 
 ```shell-session
-$ dev/benchmark.sh
+$ dev/benchmark.sh                  # runs benchmarks and puts results in benchmark_<sha>.csv
+$ poetry run python3 dev/graph.py   # create graphs from benchmark_*.csv files
+```
+
+You can adjust how many rows we benchmark with by passing `N_SAMPLES` to `dev/benchmark.sh`:
+
+```shell-session
+$ N_SAMPLES=100000000 dev/benchmark.sh  # 100m which is our canonical target
 ```
 
data_diff/utils.py

Lines changed: 13 additions & 0 deletions
@@ -1,3 +1,5 @@
+import math
+
 from typing import Sequence, Optional, Tuple, Union, Dict, Any
 from uuid import UUID
 
@@ -38,3 +40,14 @@ def is_uuid(u):
     except ValueError:
         return False
     return True
+
+
+def number_to_human(n):
+    millnames = ["", "k", "m", "b"]
+    n = float(n)
+    millidx = max(
+        0,
+        min(len(millnames) - 1, int(math.floor(0 if n == 0 else math.log10(abs(n)) / 3))),
+    )
+
+    return "{:.0f}{}".format(n / 10 ** (3 * millidx), millnames[millidx])

pyproject.toml

Lines changed: 4 additions & 0 deletions
@@ -42,6 +42,10 @@ snowflake-connector-python = "*"
 psycopg2 = "*"
 presto-python-client = "*"
 parameterized = "*"
+unittest-parallel = "*"
+pandas = "*" # for generating benchmark graphs
+plotly = "*"
+kaleido = "*"
 
 [tool.poetry.extras]
 # When adding, update also: README + dev deps just above
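The new dev dependencies outline the graphing pipeline. `dev/graph.py` itself is not shown in this diff, so as a rough, hedged illustration of what a graphing step over the benchmark output could look like with these libraries (column names taken from the benchmark result dict further down):

```python
# Illustrative only -- not the actual dev/graph.py. Aggregates the
# benchmark_<sha>.jsonl files written by the test suite and renders a chart.
import glob

import pandas as pd
import plotly.express as px

frames = [pd.read_json(path, lines=True) for path in glob.glob("benchmark_*.jsonl")]
df = pd.concat(frames, ignore_index=True)

# Compare checksum timings per database pair across git revisions.
fig = px.bar(df, x="name_human", y="checksum_sec", color="git_revision", barmode="group")
fig.write_image("benchmark.png")  # static export is rendered by kaleido
```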

tests/common.py

Lines changed: 1 addition & 0 deletions
@@ -18,6 +18,7 @@
 DEFAULT_N_SAMPLES = 50
 N_SAMPLES = int(os.environ.get("N_SAMPLES", DEFAULT_N_SAMPLES))
 BENCHMARK = os.environ.get("BENCHMARK", False)
+N_THREADS = int(os.environ.get("N_THREADS", 1))
 
 
 def get_git_revision_short_hash() -> str:
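These knobs are read from the environment at module import time, so they must be set before the test runner starts. A minimal sketch of a benchmark invocation (the specific values are arbitrary examples):

```python
import os
import subprocess

# Run the suite in parallel with benchmark reporting on; values are examples.
env = dict(os.environ, BENCHMARK="1", N_SAMPLES="1000000", N_THREADS="16")
subprocess.run(["poetry", "run", "unittest-parallel", "-j", "16"], env=env, check=True)
```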

tests/test_database_types.py

Lines changed: 13 additions & 18 deletions
@@ -12,11 +12,12 @@
 from parameterized import parameterized
 
 from data_diff import databases as db
+from data_diff.utils import number_to_human
 from data_diff.diff_tables import TableDiffer, TableSegment, DEFAULT_BISECTION_THRESHOLD
-from .common import CONN_STRINGS, N_SAMPLES, BENCHMARK, GIT_REVISION, random_table_suffix
+from .common import CONN_STRINGS, N_SAMPLES, N_THREADS, BENCHMARK, GIT_REVISION, random_table_suffix
 
 
-CONNS = {k: db.connect_to_uri(v, 1) for k, v in CONN_STRINGS.items()}
+CONNS = {k: db.connect_to_uri(v, N_THREADS) for k, v in CONN_STRINGS.items()}
 
 CONNS[db.MySQL].query("SET @@session.time_zone='+00:00'", None)
 
@@ -258,7 +259,6 @@ def __iter__(self):
     "int": [
         # all 38 digits with 0 precision, don't need to test all
         "int",
-        "integer",
         "bigint",
         # "smallint",
         # "tinyint",
@@ -385,17 +385,6 @@ def sanitize(name):
     return parameterized.to_safe_name(name)
 
 
-def number_to_human(n):
-    millnames = ["", "k", "m", "b"]
-    n = float(n)
-    millidx = max(
-        0,
-        min(len(millnames) - 1, int(math.floor(0 if n == 0 else math.log10(abs(n)) / 3))),
-    )
-
-    return "{:.0f}{}".format(n / 10 ** (3 * millidx), millnames[millidx])
-
-
 # Pass --verbose to test run to get a nice output.
 def expand_params(testcase_func, param_num, param):
     source_db, target_db, source_type, target_type, type_category = param.args
@@ -431,6 +420,10 @@ def _insert_to_table(conn, table, values, type):
     if isinstance(conn, db.Oracle):
         default_insertion_query = f"INSERT INTO {table} (id, col)"
 
+    batch_size = 8000
+    if isinstance(conn, db.BigQuery):
+        batch_size = 1000
+
     insertion_query = default_insertion_query
     selects = []
     for j, sample in values:
@@ -453,7 +446,7 @@ def _insert_to_table(conn, table, values, type):
 
         # Some databases want small batch sizes...
        # Need to also insert on the last row, might not divide cleanly!
-        if j % 8000 == 0 or j == N_SAMPLES:
+        if j % batch_size == 0 or j == N_SAMPLES:
             if isinstance(conn, db.Oracle):
                 insertion_query += " UNION ALL ".join(selects)
             conn.query(insertion_query, None)
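The two hunks above implement flush-on-boundary batching: rows accumulate in `selects` and are flushed every `batch_size` rows, plus once more on the last row, since `N_SAMPLES` rarely divides evenly. A self-contained sketch of the same idea (names are illustrative, not the repo's API):

```python
def batched(rows, batch_size):
    """Yield rows in chunks of at most batch_size, flushing the remainder."""
    batch = []
    for row in rows:
        batch.append(row)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:  # the last chunk may not divide cleanly
        yield batch

# e.g. 8000-row chunks generally, 1000 for BigQuery's smaller limit
for chunk in batched(range(20_000), 8000):
    pass  # issue one INSERT ... SELECT per chunk here
```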
@@ -594,7 +587,7 @@ def test_types(self, source_db, target_db, source_type, target_type, type_category
         # configuration with each segment being ~250k rows.
         ch_factor = min(max(int(N_SAMPLES / 250_000), 2), 128) if BENCHMARK else 2
         ch_threshold = min(DEFAULT_BISECTION_THRESHOLD, int(N_SAMPLES / ch_factor)) if BENCHMARK else 3
-        ch_threads = 1
+        ch_threads = N_THREADS
         differ = TableDiffer(
             bisection_threshold=ch_threshold,
             bisection_factor=ch_factor,
@@ -615,7 +608,7 @@ def test_types(self, source_db, target_db, source_type, target_type, type_category
         # parallel, using the existing implementation.
         dl_factor = max(int(N_SAMPLES / 100_000), 2) if BENCHMARK else 2
         dl_threshold = int(N_SAMPLES / dl_factor) + 1 if BENCHMARK else math.inf
-        dl_threads = 1
+        dl_threads = N_THREADS
         differ = TableDiffer(
             bisection_threshold=dl_threshold, bisection_factor=dl_factor, max_threadpool_size=dl_threads
         )
@@ -634,6 +627,7 @@ def test_types(self, source_db, target_db, source_type, target_type, type_category
             "git_revision": GIT_REVISION,
             "rows": N_SAMPLES,
             "rows_human": number_to_human(N_SAMPLES),
+            "name_human": f"{source_db.__name__}/{sanitize(source_type)} <-> {target_db.__name__}/{sanitize(target_type)}",
             "src_table": src_table[1:-1],  # remove quotes
             "target_table": dst_table[1:-1],
             "source_type": source_type,
@@ -642,6 +636,7 @@ def test_types(self, source_db, target_db, source_type, target_type, type_category
             "insertion_target_sec": round(insertion_target_duration, 3),
             "count_source_sec": round(count_source_duration, 3),
             "count_target_sec": round(count_target_duration, 3),
+            "count_max_sec": max(round(count_target_duration, 3), round(count_source_duration, 3)),
             "checksum_sec": round(checksum_duration, 3),
             "download_sec": round(download_duration, 3),
             "download_bisection_factor": dl_factor,
@@ -655,7 +650,7 @@ def test_types(self, source_db, target_db, source_type, target_type, type_category
         if BENCHMARK:
             print(json.dumps(result, indent=2))
             file_name = f"benchmark_{GIT_REVISION}.jsonl"
-            with open(file_name, "a") as file:
+            with open(file_name, "a", encoding="utf-8") as file:
                 file.write(json.dumps(result) + "\n")
                 file.flush()
                 print(f"Written to {file_name}")
