
Commit 34e141a

[AP-741] send new schema message when wal payload has new columns - LOG_BASED (#59)
* rebuild schema and send a schema singer message when the wal message has new columns
* add a step to check for dependency conflicts
1 parent: fe5493b · commit: 34e141a

4 files changed: +189 −47 lines changed

.circleci/config.yml

Lines changed: 7 additions & 1 deletion
@@ -32,9 +32,15 @@ jobs:
           command: |
             python3 -m venv ./virtualenvs/tap-postgres
             . ./virtualenvs/tap-postgres/bin/activate
-            pip install --upgrade pip
+            pip install --upgrade pip setuptools
             pip install -e .[test]

+      - run:
+          name: Check dependencies for conflict
+          command: |
+            . ./virtualenvs/tap-postgres/bin/activate
+            pip check && echo "No conflicts" || exit 1
+
       - run:
           name: 'Pylinting'
           command: |
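The new step gates the build on `pip check`, which exits non-zero when any installed package has unmet or conflicting requirements. As a rough illustration of the same gate driven from Python rather than the shell (this script is not part of the commit, just a sketch of what the CI step does):

```python
# Sketch: reproduce the CI job's "pip check" gate from inside Python.
# Assumes it runs in the same activated virtualenv as the CircleCI step.
import subprocess
import sys

result = subprocess.run([sys.executable, '-m', 'pip', 'check'],
                        capture_output=True, text=True)

if result.returncode == 0:
    print('No conflicts')            # the `&& echo "No conflicts"` branch
else:
    print(result.stdout.strip())     # pip lists each conflicting requirement
    sys.exit(1)                      # the `|| exit 1` branch
```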

tap_postgres/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -201,7 +201,8 @@ def sync_logical_streams(conn_config, logical_streams, state, end_lsn, state_fil
     LOGGER.info("Pure Logical Replication upto lsn %s for (%s)", end_lsn,
                 [s['tap_stream_id'] for s in logical_streams])

-    logical_streams = [logical_replication.add_automatic_properties(s, conn_config) for s in logical_streams]
+    logical_streams = [logical_replication.add_automatic_properties(
+        s, conn_config.get('debug_lsn', False)) for s in logical_streams]

     # Remove LOG_BASED stream bookmarks from state if it has been de-selected
     # This is to avoid sending very old starting and flushing positions to source
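For context, `add_automatic_properties` now takes only the `debug_lsn` flag rather than the whole `conn_config` dict (see the function change in logical_replication.py below). A minimal sketch of the new call, using a toy stream dict:

```python
# Sketch of the refactored call site; the stream dict is a minimal stand-in.
from tap_postgres.sync_strategies import logical_replication

conn_config = {'debug_lsn': True}  # illustrative config
stream = {'schema': {'properties': {'id': {}}}}

# New style: pass just the flag, defaulting to False when it is absent.
logical_replication.add_automatic_properties(
    stream, conn_config.get('debug_lsn', False))

# The schema now carries the automatic '_sdc_deleted_at' property and,
# because debug_lsn is True here, '_sdc_lsn' as well.
```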

tap_postgres/sync_strategies/logical_replication.py

Lines changed: 51 additions & 44 deletions
@@ -2,17 +2,19 @@
 import pytz
 import decimal
 import psycopg2
-from psycopg2 import sql
 import copy
 import json
 import singer
 import singer.metadata as metadata
+
+from psycopg2 import sql
 from singer import utils, get_bookmark
 from dateutil.parser import parse
 from functools import reduce

 import tap_postgres.db as post_db
 import tap_postgres.sync_strategies.common as sync_common
+from tap_postgres.stream_utils import refresh_streams_schema

 LOGGER = singer.get_logger('tap_postgres')

@@ -22,6 +24,9 @@
 class ReplicationSlotNotFoundError(Exception):
     """Custom exception when replication slot not found"""

+class UnsupportedPayloadKindError(Exception):
+    """Custom exception when the wal2json payload is not insert, update nor delete"""
+

 # pylint: disable=invalid-name,missing-function-docstring,too-many-branches,too-many-statements,too-many-arguments
 def get_pg_version(conn_info):
@@ -98,9 +103,10 @@ def fetch_current_lsn(conn_config):
     return lsn_to_int(current_lsn)


-def add_automatic_properties(stream, conn_config):
+def add_automatic_properties(stream, debug_lsn: bool = False):
     stream['schema']['properties']['_sdc_deleted_at'] = {'type': ['null', 'string'], 'format': 'date-time'}
-    if conn_config.get('debug_lsn'):
+
+    if debug_lsn:
         LOGGER.debug('debug_lsn is ON')
         stream['schema']['properties']['_sdc_lsn'] = {'type': ['null', 'string']}
     else:
@@ -295,7 +301,7 @@ def row_to_singer_message(stream, row, version, columns, time_extracted, md_map,


 # pylint: disable=unused-argument,too-many-locals
-def consume_message(streams, state, msg, time_extracted, conn_info, end_lsn):
+def consume_message(streams, state, msg, time_extracted, conn_info):
     # Strip leading comma generated by write-in-chunks and parse valid JSON
     try:
         payload = json.loads(msg.payload.lstrip(','))
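The stripped comma comes from wal2json's write-in-chunks mode, where a chunk that continues an earlier message can start with a separator comma that would otherwise make the payload invalid JSON. A small illustration with a made-up chunk:

```python
import json

# Hypothetical continuation chunk from wal2json write-in-chunks:
# note the leading comma that must be removed before parsing.
chunk = ',{"kind": "insert", "schema": "public", "table": "users"}'

payload = json.loads(chunk.lstrip(','))
print(payload['kind'])  # insert
```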
@@ -304,56 +310,58 @@ def consume_message(streams, state, msg, time_extracted, conn_info, end_lsn):

     lsn = msg.data_start

-    streams_lookup = {}
-    for s in streams:
-        streams_lookup[s['tap_stream_id']] = s
+    streams_lookup = {s['tap_stream_id']: s for s in streams}

     tap_stream_id = post_db.compute_tap_stream_id(payload['schema'], payload['table'])
     if streams_lookup.get(tap_stream_id) is None:
         return state

     target_stream = streams_lookup[tap_stream_id]
+
+    if payload['kind'] not in {'insert', 'update', 'delete'}:
+        raise UnsupportedPayloadKindError("unrecognized replication operation: {}".format(payload['kind']))
+
+    # Get the additional fields in payload that are not in schema properties:
+    # only inserts and updates have the list of columns that can be used to detect any difference in columns
+    diff = set()
+    if payload['kind'] in {'insert', 'update'}:
+        diff = set(payload['columnnames']).difference(target_stream['schema']['properties'].keys())
+
+    # if there are new columns in the payload that are not in the schema properties then refresh the stream schema
+    if diff:
+        LOGGER.info('Detected new columns "%s", refreshing schema of stream %s', diff, target_stream['stream'])
+        # encountered a column that is not in the schema
+        # refresh the stream schema and metadata by running discovery
+        refresh_streams_schema(conn_info, [target_stream])
+
+        # add the automatic properties back to the stream
+        add_automatic_properties(target_stream, conn_info.get('debug_lsn', False))
+
+        # publish new schema
+        sync_common.send_schema_message(target_stream, ['lsn'])
+
     stream_version = get_stream_version(target_stream['tap_stream_id'], state)
     stream_md_map = metadata.to_map(target_stream['metadata'])

-    desired_columns = [c for c in target_stream['schema']['properties'].keys() if sync_common.should_sync_column(
-        stream_md_map, c)]
+    desired_columns = {c for c in target_stream['schema']['properties'].keys() if sync_common.should_sync_column(
+        stream_md_map, c)}

-    if payload['kind'] == 'insert':
+    if payload['kind'] in {'insert', 'update'}:
         col_names = []
         col_vals = []
-        for idx, col in enumerate(payload['columnnames']):
-            if col in set(desired_columns):
-                col_names.append(col)
-                col_vals.append(payload['columnvalues'][idx])

-        col_names = col_names + ['_sdc_deleted_at']
-        col_vals = col_vals + [None]
-        if conn_info.get('debug_lsn'):
-            col_names = col_names + ['_sdc_lsn']
-            col_vals = col_vals + [str(lsn)]
-        record_message = row_to_singer_message(target_stream,
-                                               col_vals,
-                                               stream_version,
-                                               col_names,
-                                               time_extracted,
-                                               stream_md_map,
-                                               conn_info)
-
-    elif payload['kind'] == 'update':
-        col_names = []
-        col_vals = []
         for idx, col in enumerate(payload['columnnames']):
-            if col in set(desired_columns):
+            if col in desired_columns:
                 col_names.append(col)
                 col_vals.append(payload['columnvalues'][idx])

-        col_names = col_names + ['_sdc_deleted_at']
-        col_vals = col_vals + [None]
+        col_names.append('_sdc_deleted_at')
+        col_vals.append(None)

         if conn_info.get('debug_lsn'):
-            col_vals = col_vals + [str(lsn)]
-            col_names = col_names + ['_sdc_lsn']
+            col_names.append('_sdc_lsn')
+            col_vals.append(str(lsn))
+
         record_message = row_to_singer_message(target_stream,
                                                col_vals,
                                                stream_version,
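The heart of the change is the column diff above: for inserts and updates, any entry in the payload's `columnnames` that is missing from the cached stream schema triggers a discovery refresh and a new SCHEMA message before the record is written. A self-contained sketch of just the detection step, with an invented payload and schema:

```python
import json

# Hypothetical wal2json payload carrying a column the schema has not seen yet.
payload = json.loads(
    '{"kind": "insert",'
    ' "schema": "public", "table": "users",'
    ' "columnnames": ["id", "email", "new_col"],'
    ' "columnvalues": [1, "a@example.com", "hello"]}')

schema_properties = {'id': {}, 'email': {}}  # stream schema before the refresh

# Only inserts and updates carry columnnames, so only they can reveal new columns.
diff = set()
if payload['kind'] in {'insert', 'update'}:
    diff = set(payload['columnnames']).difference(schema_properties.keys())

if diff:
    # In the tap this is where refresh_streams_schema, add_automatic_properties
    # and send_schema_message run before the RECORD message is emitted.
    print('new columns:', diff)  # new columns: {'new_col'}
```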
@@ -366,15 +374,17 @@ def consume_message(streams, state, msg, time_extracted, conn_info, end_lsn):
         col_names = []
         col_vals = []
         for idx, col in enumerate(payload['oldkeys']['keynames']):
-            if col in set(desired_columns):
+            if col in desired_columns:
                 col_names.append(col)
                 col_vals.append(payload['oldkeys']['keyvalues'][idx])

-        col_names = col_names + ['_sdc_deleted_at']
-        col_vals = col_vals + [singer.utils.strftime(time_extracted)]
+        col_names.append('_sdc_deleted_at')
+        col_vals.append(singer.utils.strftime(time_extracted))
+
         if conn_info.get('debug_lsn'):
-            col_vals = col_vals + [str(lsn)]
-            col_names = col_names + ['_sdc_lsn']
+            col_names.append('_sdc_lsn')
+            col_vals.append(str(lsn))
+
         record_message = row_to_singer_message(target_stream,
                                                col_vals,
                                                stream_version,
@@ -383,9 +393,6 @@ def consume_message(streams, state, msg, time_extracted, conn_info, end_lsn):
                                                stream_md_map,
                                                conn_info)

-    else:
-        raise Exception("unrecognized replication operation: {}".format(payload['kind']))
-
     singer.write_message(record_message)
     state = singer.write_bookmark(state, target_stream['tap_stream_id'], 'lsn', lsn)

@@ -544,7 +551,7 @@ def sync_tables(conn_info, logical_streams, state, end_lsn, state_file):
             LOGGER.info('Breaking - reached max_run_seconds of %i', max_run_seconds)
             break

-        state = consume_message(logical_streams, state, msg, time_extracted, conn_info, end_lsn)
+        state = consume_message(logical_streams, state, msg, time_extracted, conn_info)

         # When using wal2json with write-in-chunks, multiple messages can have the same lsn
         # This is to ensure we only flush to lsn that has completed entirely

tests/test_logical_replication.py

Lines changed: 129 additions & 1 deletion
@@ -1,6 +1,11 @@
+import json
 import unittest

+from collections import namedtuple
+from unittest.mock import patch
+
 from tap_postgres.sync_strategies import logical_replication
+from tap_postgres.sync_strategies.logical_replication import UnsupportedPayloadKindError


 class PostgresCurReplicationSlotMock:
@@ -31,7 +36,7 @@ class TestLogicalReplication(unittest.TestCase):
     maxDiff = None

     def setUp(self):
-        pass
+        self.WalMessage = namedtuple('WalMessage', ['payload', 'data_start'])

     def test_streams_to_wal2json_tables(self):
         """Validate if table names are escaped to wal2json format"""
@@ -115,3 +120,126 @@ def test_locate_replication_slot_by_cur(self):
                          'some_db',
                          'some_tap'),
                          'pipelinewise_some_db_some_tap')
+
+    def test_consume_with_message_payload_is_not_json_expect_same_state(self):
+
+        output = logical_replication.consume_message([],
+                                                     {},
+                                                     self.WalMessage(payload='this is an invalid json message', data_start=None),
+                                                     None,
+                                                     {}
+                                                     )
+        self.assertDictEqual({}, output)
+
+    def test_consume_with_message_stream_in_payload_is_not_selected_expect_same_state(self):
+        output = logical_replication.consume_message(
+            [{'tap_stream_id': 'myschema-mytable'}],
+            {},
+            self.WalMessage(payload='{"schema": "myschema", "table": "notmytable"}',
+                            data_start='some lsn'),
+            None,
+            {}
+        )
+
+        self.assertDictEqual({}, output)
+
+    def test_consume_with_payload_kind_is_not_supported_expect_exception(self):
+
+        with self.assertRaises(UnsupportedPayloadKindError):
+            logical_replication.consume_message(
+                [{'tap_stream_id': 'myschema-mytable'}],
+                {},
+                self.WalMessage(payload='{"kind":"truncate", "schema": "myschema", "table": "mytable"}',
+                                data_start='some lsn'),
+                None,
+                {}
+            )
+    @patch('tap_postgres.logical_replication.singer.write_message')
+    @patch('tap_postgres.logical_replication.sync_common.send_schema_message')
+    @patch('tap_postgres.logical_replication.refresh_streams_schema')
+    def test_consume_message_with_new_column_in_payload_will_refresh_schema(self,
+                                                                            refresh_schema_mock,
+                                                                            send_schema_mock,
+                                                                            write_message_mock):
+        streams = [
+            {
+                'tap_stream_id': 'myschema-mytable',
+                'stream': 'mytable',
+                'schema': {
+                    'properties': {
+                        'id': {},
+                        'date_created': {}
+                    }
+                },
+                'metadata': [
+                    {
+                        'breadcrumb': [],
+                        'metadata': {
+                            'is-view': False,
+                            'table-key-properties': ['id'],
+                            'schema-name': 'myschema'
+                        }
+                    },
+                    {
+                        "breadcrumb": [
+                            "properties",
+                            "id"
+                        ],
+                        "metadata": {
+                            "sql-datatype": "integer",
+                            "inclusion": "automatic",
+                        }
+                    },
+                    {
+                        "breadcrumb": [
+                            "properties",
+                            "date_created"
+                        ],
+                        "metadata": {
+                            "sql-datatype": "datetime",
+                            "inclusion": "available",
+                            "selected": True
+                        }
+                    }
+                ],
+            }
+        ]
+
+        return_v = logical_replication.consume_message(
+            streams,
+            {
+                'bookmarks': {
+                    "myschema-mytable": {
+                        "last_replication_method": "LOG_BASED",
+                        "lsn": None,
+                        "version": 1000,
+                        "xmin": None
+                    }
+                }
+            },
+            self.WalMessage(payload='{"kind": "insert", '
+                                    '"schema": "myschema", '
+                                    '"table": "mytable",'
+                                    '"columnnames": ["id", "date_created", "new_col"],'
+                                    '"columnvalues": [1, null, "some random text"]'
+                                    '}',
+                            data_start='some lsn'),
+            None,
+            {}
+        )
+
+        self.assertDictEqual(return_v,
+                             {
+                                 'bookmarks': {
+                                     "myschema-mytable": {
+                                         "last_replication_method": "LOG_BASED",
+                                         "lsn": "some lsn",
+                                         "version": 1000,
+                                         "xmin": None
+                                     }
+                                 }
+                             })
+
+        refresh_schema_mock.assert_called_once_with({}, [streams[0]])
+        send_schema_mock.assert_called_once()
+        write_message_mock.assert_called_once()
