This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit bcb76e2
Merge pull request #163 from datafold/danthelion-trino-connector
Add support for Trino (Merge PR #155)
2 parents: f657f47 + 00103b2
21 files changed: +247 −198 lines

.github/workflows/ci.yml

Lines changed: 2 additions & 1 deletion
@@ -34,7 +34,7 @@ jobs:
         python-version: ${{ matrix.python-version }}

     - name: Build the stack
-      run: docker-compose up -d mysql postgres presto
+      run: docker-compose up -d mysql postgres presto trino

     - name: Install Poetry
       run: pip install poetry
@@ -46,4 +46,5 @@ jobs:
       env:
         DATADIFF_SNOWFLAKE_URI: '${{ secrets.DATADIFF_SNOWFLAKE_URI }}'
         DATADIFF_PRESTO_URI: '${{ secrets.DATADIFF_PRESTO_URI }}'
+        DATADIFF_TRINO_URI: '${{ secrets.DATADIFF_TRINO_URI }}'
       run: poetry run unittest-parallel -j 16
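The new `DATADIFF_TRINO_URI` secret follows the same pattern as the existing Presto and Snowflake URIs: the test suite can read it from the environment and skip Trino-specific tests when it is absent. A minimal sketch of how a test module might pick the variable up; the class and test names here are illustrative, not taken from this commit:

```python
import os
import unittest

# Hypothetical example: gate Trino tests on the env var wired into ci.yml above.
TRINO_URI = os.environ.get("DATADIFF_TRINO_URI")


@unittest.skipUnless(TRINO_URI, "DATADIFF_TRINO_URI is not configured")
class TestTrinoConnection(unittest.TestCase):
    def test_uri_scheme(self):
        # The URI should use the trino:// scheme registered in connect.py below.
        self.assertTrue(TRINO_URI.startswith("trino://"))
```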

.gitignore

Lines changed: 3 additions & 0 deletions
@@ -138,3 +138,6 @@ benchmark_*.png

 # Mac
 .DS_Store
+
+# IntelliJ
+.idea

README.md

Lines changed: 6 additions & 2 deletions
@@ -126,7 +126,9 @@ $ data-diff \
 | Redshift | `redshift://<username>:<password>@<hostname>:5439/<database>` | 💛 |
 | Presto | `presto://<username>:<password>@<hostname>:8080/<database>` | 💛 |
 | Databricks | `databricks://<http_path>:<access_token>@<server_hostname>/<catalog>/<schema>` | 💛 |
-| ElasticSearch | | 📝 | | 📝 |
+| Trino | `trino://<username>:<password>@<hostname>:8080/<database>` | 💛 |
+| ElasticSearch | | 📝 |
+| Databricks | | 📝 |
 | Planetscale | | 📝 |
 | Clickhouse | | 📝 |
 | Pinot | | 📝 |
@@ -163,6 +165,8 @@ While you may install them manually, we offer an easy way to install them along

 - `pip install 'data-diff[oracle]'`

+- `pip install 'data-diff[trino]'`
+
 - For BigQuery, see: https://pypi.org/project/google-cloud-bigquery/


@@ -505,7 +509,7 @@ Now you can insert it into the testing database(s):
 ```shell-session
 # It's optional to seed more than one to run data-diff(1) against.
 $ poetry run preql -f dev/prepare_db.pql mysql://mysql:Password1@127.0.0.1:3306/mysql
-$ poetry run preql -f dev/prepare_db.pql postgresql://postgres:Password1@127.0.0.1:5432/postgres
+$ poetry run preql -f dev/prepare_db.pql postgres://postgres:Password1@127.0.0.1:5432/postgres

 # Cloud databases
 $ poetry run preql -f dev/prepare_db.pql snowflake://<uri>
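The `trino` extra added to the install list pulls in the Trino Python client, which is what the new `Trino` database class (added below in `data_diff/databases/trino.py`) wraps via `trino.dbapi.connect`. A rough sketch of that underlying client call; the host, port, user, catalog and schema are placeholders, not values from this PR:

```python
# Sketch only: all connection values below are placeholders.
import trino

conn = trino.dbapi.connect(
    host="localhost",
    port=8080,
    user="test",
    catalog="memory",
    schema="default",
)
cur = conn.cursor()
cur.execute("SELECT 1")
print(cur.fetchall())  # e.g. [[1]]
conn.close()
```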

data_diff/databases/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -8,5 +8,6 @@
 from .redshift import Redshift
 from .presto import Presto
 from .databricks import Databricks
+from .trino import Trino

 from .connect import connect_to_uri

data_diff/databases/connect.py

Lines changed: 4 additions & 1 deletion
@@ -13,6 +13,7 @@
 from .redshift import Redshift
 from .presto import Presto
 from .databricks import Databricks
+from .trino import Trino


 @dataclass
@@ -80,7 +81,8 @@ def match_path(self, dsn):
     "bigquery": MatchUriPath(BigQuery, ["dataset"], help_str="bigquery://<project>/<dataset>"),
     "databricks": MatchUriPath(
         Databricks, ["catalog", "schema"], help_str="databricks://:access_token@server_name/http_path",
-    )
+    ),
+    "trino": MatchUriPath(Trino, ["catalog", "schema"], help_str="trino://<user>@<host>/<catalog>/<schema>"),
 }


@@ -105,6 +107,7 @@ def connect_to_uri(db_uri: str, thread_count: Optional[int] = 1) -> Database:
     - redshift
     - presto
     - databricks
+    - trino
     """

     dsn = dsnparse.parse(db_uri)
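With the `trino` scheme registered alongside the other URI matchers, `connect_to_uri` can dispatch Trino URIs like any other backend. A minimal usage sketch; the user, host, catalog and schema are placeholders, and only the URI shape (`trino://<user>@<host>/<catalog>/<schema>`) comes from the `help_str` above:

```python
# Placeholder URI; swap in a real Trino coordinator, catalog and schema.
from data_diff.databases import connect_to_uri

db = connect_to_uri("trino://test@localhost:8080/memory/default", thread_count=1)
try:
    print(type(db).__name__)  # expected: "Trino"
finally:
    db.close()
```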

data_diff/databases/trino.py

Lines changed: 123 additions & 0 deletions
@@ -0,0 +1,123 @@
+import re
+
+from .database_types import *
+from .base import Database, import_helper
+from .base import (
+    MD5_HEXDIGITS,
+    CHECKSUM_HEXDIGITS,
+    TIMESTAMP_PRECISION_POS,
+    DEFAULT_DATETIME_PRECISION,
+)
+
+
+@import_helper("trino")
+def import_trino():
+    import trino
+
+    return trino
+
+
+class Trino(Database):
+    default_schema = "public"
+    TYPE_CLASSES = {
+        # Timestamps
+        "timestamp with time zone": TimestampTZ,
+        "timestamp without time zone": Timestamp,
+        "timestamp": Timestamp,
+        # Numbers
+        "integer": Integer,
+        "bigint": Integer,
+        "real": Float,
+        "double": Float,
+        # Text
+        "varchar": Text,
+    }
+    ROUNDS_ON_PREC_LOSS = True
+
+    def __init__(self, **kw):
+        trino = import_trino()
+
+        self._conn = trino.dbapi.connect(**kw)
+
+    def quote(self, s: str):
+        return f'"{s}"'
+
+    def md5_to_int(self, s: str) -> str:
+        return f"cast(from_base(substr(to_hex(md5(to_utf8({s}))), {1 + MD5_HEXDIGITS - CHECKSUM_HEXDIGITS}), 16) as decimal(38, 0))"
+
+    def to_string(self, s: str):
+        return f"cast({s} as varchar)"
+
+    def _query(self, sql_code: str) -> list:
+        """Uses the standard SQL cursor interface"""
+        c = self._conn.cursor()
+        c.execute(sql_code)
+        if sql_code.lower().startswith("select"):
+            return c.fetchall()
+        if re.match(r"(insert|create|truncate|drop)", sql_code, re.IGNORECASE):
+            return c.fetchone()
+
+    def close(self):
+        self._conn.close()
+
+    def normalize_timestamp(self, value: str, coltype: TemporalType) -> str:
+        if coltype.rounds:
+            s = f"date_format(cast({value} as timestamp({coltype.precision})), '%Y-%m-%d %H:%i:%S.%f')"
+        else:
+            s = f"date_format(cast({value} as timestamp(6)), '%Y-%m-%d %H:%i:%S.%f')"
+
+        return f"RPAD(RPAD({s}, {TIMESTAMP_PRECISION_POS + coltype.precision}, '.'), {TIMESTAMP_PRECISION_POS + 6}, '0')"
+
+    def normalize_number(self, value: str, coltype: FractionalType) -> str:
+        return self.to_string(f"cast({value} as decimal(38,{coltype.precision}))")
+
+    def select_table_schema(self, path: DbPath) -> str:
+        schema, table = self._normalize_table_path(path)
+
+        return (
+            f"SELECT column_name, data_type, 3 as datetime_precision, 3 as numeric_precision FROM INFORMATION_SCHEMA.COLUMNS "
+            f"WHERE table_name = '{table}' AND table_schema = '{schema}'"
+        )
+
+    def _parse_type(
+        self,
+        table_path: DbPath,
+        col_name: str,
+        type_repr: str,
+        datetime_precision: int = None,
+        numeric_precision: int = None,
+    ) -> ColType:
+        timestamp_regexps = {
+            r"timestamp\((\d)\)": Timestamp,
+            r"timestamp\((\d)\) with time zone": TimestampTZ,
+        }
+        for regexp, t_cls in timestamp_regexps.items():
+            m = re.match(regexp + "$", type_repr)
+            if m:
+                datetime_precision = int(m.group(1))
+                return t_cls(
+                    precision=datetime_precision
+                    if datetime_precision is not None
+                    else DEFAULT_DATETIME_PRECISION,
+                    rounds=self.ROUNDS_ON_PREC_LOSS,
+                )
+
+        number_regexps = {r"decimal\((\d+),(\d+)\)": Decimal}
+        for regexp, n_cls in number_regexps.items():
+            m = re.match(regexp + "$", type_repr)
+            if m:
+                prec, scale = map(int, m.groups())
+                return n_cls(scale)
+
+        string_regexps = {r"varchar\((\d+)\)": Text, r"char\((\d+)\)": Text}
+        for regexp, n_cls in string_regexps.items():
+            m = re.match(regexp + "$", type_repr)
+            if m:
+                return n_cls()
+
+        return super()._parse_type(
+            table_path, col_name, type_repr, datetime_precision, numeric_precision
+        )
+
+    def normalize_uuid(self, value: str, coltype: ColType_UUID) -> str:
+        return f"TRIM({value})"

data_diff/diff_tables.py

Lines changed: 1 addition & 1 deletion
@@ -274,7 +274,7 @@ class TableDiffer:
     The algorithm uses hashing to quickly check if the tables are different, and then applies a
     bisection search recursively to find the differences efficiently.

-    Works best for comparing tables that are mostly the name, with minor discrepencies.
+    Works best for comparing tables that are mostly the same, with minor discrepencies.

     Parameters:
         bisection_factor (int): Into how many segments to bisect per iteration.
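The corrected sentence describes the core strategy: checksum whole segments, and only bisect into smaller segments when the checksums disagree, so identical regions are settled with a single comparison. A toy, self-contained illustration of that idea on Python dicts follows; it is not data-diff's implementation (which computes checksums in SQL, e.g. with the `md5_to_int` expression added for Trino above), just the shape of the recursion:

```python
import hashlib


def checksum(rows: dict) -> str:
    # Toy stand-in for the SQL-side checksum that data-diff computes inside each database.
    h = hashlib.md5()
    for key in sorted(rows):
        h.update(f"{key}:{rows[key]}".encode())
    return h.hexdigest()


def diff_key_range(a: dict, b: dict, lo: int, hi: int, bisection_factor=4, bisection_threshold=16):
    """Yield keys whose rows differ between a and b within the key range [lo, hi)."""
    seg_a = {k: v for k, v in a.items() if lo <= k < hi}
    seg_b = {k: v for k, v in b.items() if lo <= k < hi}
    if checksum(seg_a) == checksum(seg_b):
        return  # identical segment: settled with one checksum comparison
    if hi - lo <= bisection_threshold:
        # Small enough: fetch the rows and compare them directly.
        for k in sorted(seg_a.keys() | seg_b.keys()):
            if seg_a.get(k) != seg_b.get(k):
                yield k
        return
    step = max(1, (hi - lo) // bisection_factor)
    for start in range(lo, hi, step):
        # Otherwise bisect the key range and recurse; equal sub-segments stop immediately.
        yield from diff_key_range(a, b, start, min(start + step, hi), bisection_factor, bisection_threshold)


rows_a = {i: f"row {i}" for i in range(1000)}
rows_b = dict(rows_a)
rows_b[42] = "row 42 (edited)"
del rows_b[617]
print(list(diff_key_range(rows_a, rows_b, 0, 1000)))  # [42, 617]
```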

debug.py

Whitespace-only changes.
Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+connector.name=jmx

Lines changed: 2 additions & 0 deletions

@@ -0,0 +1,2 @@
+connector.name=memory
+memory.max-data-per-node=128MB
