Add Trino connector

danthelion · danthelion · commit 706a34cb80e6 · 2022-07-12T20:32:54.000+02:00
diff --git a/.gitignore b/.gitignore
@@ -138,3 +138,6 @@ benchmark_*.png
 
 # Mac
 .DS_Store
+
+# IntelliJ
+.idea
diff --git a/README.md b/README.md
@@ -125,8 +125,9 @@ $ data-diff \
 | BigQuery      | `bigquery://<project>/<dataset>`                                                                                                    |  💛    |
 | Redshift      | `redshift://<username>:<password>@<hostname>:5439/<database>`                                                                       |  💛    |
 | Presto        | `presto://<username>:<password>@<hostname>:8080/<database>`                                                                         |  💛    |
 | Databricks    | `databricks://<http_path>:<access_token>@<server_hostname>/<catalog>/<schema>`                                                      |  💛    |
+| Trino         | `trino://<username>:<password>@<hostname>:8080/<database>`                                                                          |  💛    |
 | ElasticSearch |                                                                                                                                     |  📝    |                                                        |  📝    |
 | Planetscale   |                                                                                                                                     |  📝    |
 | Clickhouse    |                                                                                                                                     |  📝    |
 | Pinot         |                                                                                                                                     |  📝    |
@@ -505,7 +511,7 @@ Now you can insert it into the testing database(s):
 ```shell-session
 # It's optional to seed more than one to run data-diff(1) against.
 $ poetry run preql -f dev/prepare_db.pql mysql://mysql:Password1@127.0.0.1:3306/mysql
-$ poetry run preql -f dev/prepare_db.pql postgresql://postgres:Password1@127.0.0.1:5432/postgres
+$ poetry run preql -f dev/prepare_db.pql postgres://postgres:Password1@127.0.0.1:5432/postgres
 
 # Cloud databases
 $ poetry run preql -f dev/prepare_db.pql snowflake://<uri>
diff --git a/data_diff/databases/connect.py b/data_diff/databases/connect.py
@@ -13,6 +13,7 @@
 from .redshift import Redshift
 from .presto import Presto
 from .databricks import Databricks
+from .trino import Trino
 
 
 @dataclass
@@ -80,7 +81,8 @@ def match_path(self, dsn):
     "bigquery": MatchUriPath(BigQuery, ["dataset"], help_str="bigquery://<project>/<dataset>"),
     "databricks": MatchUriPath(
         Databricks, ["catalog", "schema"], help_str="databricks://:access_token@server_name/http_path",
-    )
+    ),
+    "trino": MatchUriPath(Trino, ["catalog", "schema"], help_str="trino://<user>@<host>/<catalog>/<schema>"),
 }
 
 
@@ -105,6 +107,7 @@ def connect_to_uri(db_uri: str, thread_count: Optional[int] = 1) -> Database:
     - redshift
     - presto
     - databricks
+    - trino
     """
 
     dsn = dsnparse.parse(db_uri)
diff --git a/data_diff/databases/trino.py b/data_diff/databases/trino.py
@@ -0,0 +1,121 @@
+import re
+
+from .database_types import *
+from .base import Database, import_helper
+from .base import (
+    MD5_HEXDIGITS,
+    CHECKSUM_HEXDIGITS,
+    TIMESTAMP_PRECISION_POS,
+    DEFAULT_DATETIME_PRECISION,
+)
+
+
+@import_helper("trino")
+def import_trino():
+    import trino
+
+    return trino
+
+
+class Trino(Database):
+    default_schema = "public"
+    TYPE_CLASSES = {
+        # Timestamps
+        "timestamp with time zone": TimestampTZ,
+        "timestamp without time zone": Timestamp,
+        "timestamp": Timestamp,
+        # Numbers
+        "integer": Integer,
+        "bigint": Integer,
+        "real": Float,
+        "double": Float,
+        # Text
+        "varchar": Text,
+    }
+    ROUNDS_ON_PREC_LOSS = True
+
+    def __init__(self, **kw):
+        trino = import_trino()
+
+        self._conn = trino.dbapi.connect(**kw)
+
+    def quote(self, s: str):
+        return f'"{s}"'
+
+    def md5_to_int(self, s: str) -> str:
+        return f"cast(from_base(substr(to_hex(md5(to_utf8({s}))), {1 + MD5_HEXDIGITS - CHECKSUM_HEXDIGITS}), 16) as decimal(38, 0))"
+
+    def to_string(self, s: str):
+        return f"cast({s} as varchar)"
+
+    def _query(self, sql_code: str) -> list:
+        """Uses the standard SQL cursor interface"""
+        c = self._conn.cursor()
+        c.execute(sql_code)
+        if sql_code.lower().startswith("select"):
+            return c.fetchall()
+
+    def close(self):
+        self._conn.close()
+
+    def normalize_timestamp(self, value: str, coltype: TemporalType) -> str:
+        if coltype.rounds:
+            s = f"date_format(cast({coltype.precision} as timestamp(6)), '%Y-%m-%d %H:%i:%S.%f')"
+        else:
+            s = f"date_format(cast({value} as timestamp(6)), '%Y-%m-%d %H:%i:%S.%f')"
+
+        return f"RPAD(RPAD({s}, {TIMESTAMP_PRECISION_POS + coltype.precision}, '.'), {TIMESTAMP_PRECISION_POS + 6}, '0')"
+
+    def normalize_number(self, value: str, coltype: FractionalType) -> str:
+        return self.to_string(f"cast({value} as decimal(38,{coltype.precision}))")
+
+    def select_table_schema(self, path: DbPath) -> str:
+        schema, table = self._normalize_table_path(path)
+
+        return (
+            f"SELECT column_name, data_type, 3 as datetime_precision, 3 as numeric_precision FROM INFORMATION_SCHEMA.COLUMNS "
+            f"WHERE table_name = '{table}' AND table_schema = '{schema}'"
+        )
+
+    def _parse_type(
+        self,
+        table_path: DbPath,
+        col_name: str,
+        type_repr: str,
+        datetime_precision: int = None,
+        numeric_precision: int = None,
+    ) -> ColType:
+        timestamp_regexps = {
+            r"timestamp\((\d)\)": Timestamp,
+            r"timestamp\((\d)\) with time zone": TimestampTZ,
+        }
+        for regexp, t_cls in timestamp_regexps.items():
+            m = re.match(regexp + "$", type_repr)
+            if m:
+                datetime_precision = int(m.group(1))
+                return t_cls(
+                    precision=datetime_precision
+                    if datetime_precision is not None
+                    else DEFAULT_DATETIME_PRECISION,
+                    rounds=self.ROUNDS_ON_PREC_LOSS,
+                )
+
+        number_regexps = {r"decimal\((\d+),(\d+)\)": Decimal}
+        for regexp, n_cls in number_regexps.items():
+            m = re.match(regexp + "$", type_repr)
+            if m:
+                prec, scale = map(int, m.groups())
+                return n_cls(scale)
+
+        string_regexps = {r"varchar\((\d+)\)": Text, r"char\((\d+)\)": Text}
+        for regexp, n_cls in string_regexps.items():
+            m = re.match(regexp + "$", type_repr)
+            if m:
+                return n_cls()
+
+        return super()._parse_type(
+            table_path, col_name, type_repr, datetime_precision, numeric_precision
+        )
+
+    def normalize_uuid(self, value: str, coltype: ColType_UUID) -> str:
+        return f"TRIM({value})"
diff --git a/data_diff/diff_tables.py b/data_diff/diff_tables.py
@@ -274,7 +274,7 @@ class TableDiffer:
     The algorithm uses hashing to quickly check if the tables are different, and then applies a
     bisection search recursively to find the differences efficiently.
 
-    Works best for comparing tables that are mostly the name, with minor discrepencies.
+    Works best for comparing tables that are mostly the same, with minor discrepancies.
 
     Parameters:
         bisection_factor (int): Into how many segments to bisect per iteration.
diff --git a/debug.py b/debug.py
diff --git a/dev/trino-conf/etc/catalog/memory.properties b/dev/trino-conf/etc/catalog/memory.properties
@@ -0,0 +1,2 @@
+connector.name=memory
+memory.max-data-per-node=128MB
diff --git a/dev/trino-conf/etc/catalog/postgresql.properties b/dev/trino-conf/etc/catalog/postgresql.properties
@@ -0,0 +1,4 @@
+connector.name=postgresql
+connection-url=jdbc:postgresql://postgres:5432/postgres
+connection-user=postgres
+connection-password=Password1
diff --git a/dev/trino-conf/etc/catalog/tpcds.properties b/dev/trino-conf/etc/catalog/tpcds.properties
@@ -0,0 +1 @@
+connector.name=tpcds
diff --git a/dev/trino-conf/etc/catalog/tpch.properties b/dev/trino-conf/etc/catalog/tpch.properties
@@ -0,0 +1 @@
+connector.name=tpch
diff --git a/dev/trino-conf/etc/config.properties b/dev/trino-conf/etc/config.properties
@@ -0,0 +1,5 @@
+coordinator=true
+node-scheduler.include-coordinator=true
+http-server.http.port=8080
+discovery.uri=http://localhost:8080
+discovery-server.enabled=true
diff --git a/dev/trino-conf/etc/jvm.config b/dev/trino-conf/etc/jvm.config
@@ -0,0 +1,12 @@
+-server
+-Xmx1G
+-XX:-UseBiasedLocking
+-XX:+UseG1GC
+-XX:G1HeapRegionSize=32M
+-XX:+ExplicitGCInvokesConcurrent
+-XX:+HeapDumpOnOutOfMemoryError
+-XX:+UseGCOverheadLimit
+-XX:+ExitOnOutOfMemoryError
+-XX:ReservedCodeCacheSize=256M
+-Djdk.attach.allowAttachSelf=true
+-Djdk.nio.maxCachedBufferSize=2000000
diff --git a/dev/trino-conf/etc/node.properties b/dev/trino-conf/etc/node.properties
@@ -0,0 +1,3 @@
+node.environment=docker
+node.data-dir=/data/trino
+plugin.dir=/usr/lib/trino/plugin
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -64,6 +64,16 @@ services:
         networks:
             - local
 
+    trino:
+        image: 'trinodb/trino:latest'
+        hostname: trino
+        ports:
+            - '8080:8080'
+        volumes:
+            - ./dev/trino-conf/etc:/etc/trino
+        networks:
+            - local
+
 volumes:
   postgresql-data:
   mysql-data:
diff --git a/pyproject.toml b/pyproject.toml
@@ -46,6 +46,7 @@ psycopg2 = "*"
 presto-python-client = "*"
 parameterized = "*"
 unittest-parallel = "*"
+trino = "*"
 
 [tool.poetry.extras]
 # When adding, update also: README + dev deps just above
@@ -55,7 +56,8 @@ postgresql = ["psycopg2"]
 snowflake = ["snowflake-connector-python"]
 presto = ["presto-python-client"]
 oracle = ["cx_Oracle"]
 databricks = ["databricks-sql-connector"]
+trino = ["trino"]
 
 [build-system]
 requires = ["poetry-core>=1.0.0"]

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+connector.name=memory`
	`2`	`+memory.max-data-per-node=128MB`