From 7296096b638bd6221f18b3c85621bd2fa3721b57 Mon Sep 17 00:00:00 2001 From: clohl Date: Thu, 1 May 2025 17:30:21 -0700 Subject: [PATCH 1/3] ci: update project files --- .gitignore | 4 +++- .vscode/settings.json | 7 +++++++ pyproject.toml | 5 ++++- requirements.txt | 3 ++- 4 files changed, 16 insertions(+), 3 deletions(-) create mode 100644 .vscode/settings.json diff --git a/.gitignore b/.gitignore index aa54c11..efede9f 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,6 @@ dist venv .spyproject .idea -site \ No newline at end of file +site +.env +uv.lock diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..9b38853 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,7 @@ +{ + "python.testing.pytestArgs": [ + "tests" + ], + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true +} \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 6360f7f..71b8729 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "xml2db" -version = "0.12.5" +version = "0.12.6" authors = [ { name="Commission de régulation de l'énergie", email="opensource@cre.fr" }, ] @@ -36,3 +36,6 @@ markers = [ "dbtest: marks tests as integration tests requiring a database backend (deselect with '-m \"not dbtest\"')", ] junit_family = "xunit2" + +[tool.uv] +package = true diff --git a/requirements.txt b/requirements.txt index 3a897ee..b078d5a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -29,7 +29,7 @@ paginate==0.5.7 pathspec==0.12.1 platformdirs==4.3.7 pluggy==1.5.0 -psycopg2==2.9.10 +psycopg2-binary==2.9.10 Pygments==2.19.1 pymdown-extensions==10.14.3 PyMySQL==1.1.1 @@ -42,6 +42,7 @@ requests==2.32.3 six==1.17.0 SQLAlchemy==2.0.40 typing_extensions==4.13.1 +tzlocal==5.3.1 urllib3==2.3.0 watchdog==6.0.0 xmlschema==3.4.5 From ab7789d0303912c922654da9995f9431a2a95823 Mon Sep 17 00:00:00 2001 From: clohl Date: Thu, 1 May 2025 17:30:03 -0700 Subject: [PATCH 2/3] feat: allow name shortening options, handle timezones in tests refactor: adds shortening overrides, fixes tests --- src/xml2db/model.py | 4 ++ src/xml2db/table/duplicated_table.py | 5 +- src/xml2db/table/relations.py | 8 ++- src/xml2db/table/reused_table.py | 7 +- src/xml2db/table/table.py | 62 ++++++++++++++++- src/xml2db/table/transformed_table.py | 1 - tests/conftest.py | 6 +- tests/test_roundtrip.py | 96 ++++++++++++++++++++++++--- 8 files changed, 169 insertions(+), 20 deletions(-) diff --git a/src/xml2db/model.py b/src/xml2db/model.py index fd666d4..1d94c4d 100644 --- a/src/xml2db/model.py +++ b/src/xml2db/model.py @@ -143,6 +143,8 @@ def _validate_config(self, cfg): for key, exp_type, default in [ ("as_columnstore", bool, False), ("row_numbers", bool, False), + ("shorten_temp_table_names", bool, False), + ("shorten_rel_table_names", bool, False), ("document_tree_hook", callable, None), ("document_tree_node_hook", callable, None), ("record_hash_column_name", str, "xml2db_record_hash"), @@ -194,6 +196,8 @@ def _create_table_model( A data model instance. """ table_config = self.tables_config.get(table_name, {}) + table_config["shorten_temp_table_names"] = self.model_config["shorten_temp_table_names"] + table_config["shorten_rel_table_names"] = self.model_config["shorten_rel_table_names"] if table_config.get("reuse", True): return DataModelTableReused( table_name, diff --git a/src/xml2db/table/duplicated_table.py b/src/xml2db/table/duplicated_table.py index 72a5b03..5a52ba8 100644 --- a/src/xml2db/table/duplicated_table.py +++ b/src/xml2db/table/duplicated_table.py @@ -119,9 +119,12 @@ def get_col(temp=False) -> Iterable[Column]: ) ) + temp_table_name = f"{prefix}{self.name}" + temp_table_name = self.truncate_long_name(temp_table_name) if self.config.get("shorten_temp_table_names") else temp_table_name + # build temporary table self.temp_table = Table( - f"{prefix}{self.name}", + temp_table_name, self.metadata, Column(f"pk_{self.name}", Integer), *get_col(temp=True), diff --git a/src/xml2db/table/relations.py b/src/xml2db/table/relations.py index 96245d1..235e792 100644 --- a/src/xml2db/table/relations.py +++ b/src/xml2db/table/relations.py @@ -101,8 +101,11 @@ def build_relation_tables(self) -> None: ) prefix = f"temp_{self.table.temp_prefix}_" if self.other_table.is_reused: + temp_table_name = f"{prefix}{self.rel_table_name}" + temp_table_name = self.table.truncate_long_name(temp_table_name) if self.table.config.get("shorten_temp_table_names") else temp_table_name + self.temp_rel_table = Table( - f"{prefix}{self.rel_table_name}", + temp_table_name, self.table.metadata, Column(f"temp_fk_{self.table.name}", Integer, nullable=False), Column(f"fk_{self.table.name}", Integer), @@ -132,8 +135,9 @@ def build_relation_tables(self) -> None: ), ) + table_name = self.table.truncate_long_name(self.rel_table_name) if self.table.config.get("shorten_rel_table_names") else self.rel_table_name self.rel_table = Table( - self.rel_table_name, + table_name, self.table.metadata, Column( f"fk_{self.table.name}", diff --git a/src/xml2db/table/reused_table.py b/src/xml2db/table/reused_table.py index 3c09851..c13ff77 100644 --- a/src/xml2db/table/reused_table.py +++ b/src/xml2db/table/reused_table.py @@ -15,14 +15,12 @@ from .column import DataModelColumn from .transformed_table import DataModelTableTransformed - def shorten_str(x: str, max_len: int = 30) -> str: if len(x) > max_len: h = sha1(x.encode("utf8")) return f"{x[:(max_len - 7)]}_{h.hexdigest()[1:6]}" return x - class DataModelTableReused(DataModelTableTransformed): """A table data model which de-duplicates records in the database based on their hash value. @@ -134,9 +132,12 @@ def get_col(temp=False): ) ) + temp_table_name = f"{prefix}{self.name}" + temp_table_name = self.truncate_long_name(temp_table_name) if self.config.get("shorten_temp_table_names") else temp_table_name + # build temporary table self.temp_table = Table( - f"{prefix}{self.name}", + temp_table_name, self.metadata, Column(f"pk_{self.name}", Integer), Column( diff --git a/src/xml2db/table/table.py b/src/xml2db/table/table.py index 6e11a24..f4517f7 100644 --- a/src/xml2db/table/table.py +++ b/src/xml2db/table/table.py @@ -1,3 +1,5 @@ +import hashlib +import base64 from typing import Iterable, List, Any, Union, TYPE_CHECKING import logging import sqlalchemy @@ -13,7 +15,6 @@ logger = logging.getLogger(__name__) - class DataModelTable: """A class representing a database table translated from an XML schema complex type @@ -99,6 +100,8 @@ def _validate_config(self, cfg, db_type): config = { "reuse": check_type(cfg, "reuse", bool, True), "as_columnstore": check_type(cfg, "as_columnstore", bool, False), + "shorten_temp_table_names": check_type(cfg, "shorten_temp_table_names", bool, False), + "shorten_rel_table_names": check_type(cfg, "shorten_rel_table_names", bool, False) } if "extra_args" in cfg and not ( isinstance(cfg["extra_args"], list) @@ -117,7 +120,7 @@ def _validate_config(self, cfg, db_type): logger.warning( "Clustered columnstore indexes are only supported with MS SQL Server database" ) - + config["fields"] = cfg.get("fields", {}) return config @@ -324,10 +327,13 @@ def create_tables(self, engine: sqlalchemy.engine.base.Engine, temp: bool = Fals temp: if True, create temporary (prefixed) tables """ if temp: + logging.info(f"Creating temp table: {self.temp_table.name}") self.temp_table.create(engine, checkfirst=True) else: + logging.info(f"Creating table: {self.table.name}") self.table.create(engine, checkfirst=True) for relation in self.relations_n.values(): + logging.info(f"Creating relation: {relation.name}") relation.create_table(engine, temp) def get_insert_temp_records_statements( @@ -403,3 +409,55 @@ def get_entity_rel_diagram(self) -> List: + ["}"] ) return [f" {line}" for line in out] + + def truncate_long_name(self, table_name: str) -> str: + max_len = 63 #both postgres and mysql safe table name len + new_name = table_name + + short_name = "" + shorter_name = "" + is_tmp = "temp" in table_name + suffix = f"_{hashlib.md5(table_name.encode('utf-8')).hexdigest()}" + + if len(table_name) > max_len: + words = table_name.split("_") + + for word in words: + if len(short_name) + len(word)<= (max_len - 1): + if len(short_name) > 0: short_name += "_" + short_name += f"{word}" + if len(shorter_name) + len(word) <= (max_len - 10): + if len(shorter_name) > 0: shorter_name += "_" + shorter_name += f"{word}" + + #check if sliced name already exists: + sentinel = False + if is_tmp: + # just cut the name up and append the full suffix + # this doesn't need to be human readable / usable + short_name = short_name[:30] + sentinel = True + else: + for tbl in self.data_model.tables.values(): + if sentinel or tbl.name == short_name: + sentinel = True + break + for relation in tbl.relations_n.values(): + if relation.rel_table_name == short_name: + sentinel = True + break + + # an existing table or relation was found: append a + # random-ish suffix to help prevent name collisions + if sentinel: + # create a more useable/legible short table name + suffix = f"_{suffix[:8]}" + short_name = shorter_name + else: + # nothing was found so we can just run with the short name + suffix = "" + + # finalize the new shortened name + new_name = f"{short_name}{suffix}" + + return new_name diff --git a/src/xml2db/table/transformed_table.py b/src/xml2db/table/transformed_table.py index 031d858..c88f763 100644 --- a/src/xml2db/table/transformed_table.py +++ b/src/xml2db/table/transformed_table.py @@ -5,7 +5,6 @@ from .relations import DataModelRelation1, DataModelRelationN from .table import DataModelTable - class DataModelTableTransformed(DataModelTable): """A class extending DataModelTable with transformations diff --git a/tests/conftest.py b/tests/conftest.py index 343fecb..5ce6745 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -37,4 +37,8 @@ def setup_db_model(conn_string, model_config): yield model - model.drop_all_tables() + try: + model.drop_all_tables() + except Exception as e: + print(f"Unable to drop all tables: {e}") + pass diff --git a/tests/test_roundtrip.py b/tests/test_roundtrip.py index 8f63496..f307d82 100644 --- a/tests/test_roundtrip.py +++ b/tests/test_roundtrip.py @@ -7,6 +7,79 @@ from .conftest import list_xml_path from .sample_models import models +import re +import tzlocal +from datetime import datetime +from zoneinfo import ZoneInfo +from typing import Any, Tuple + +# Regex for ISO-like datetime with timezone (adjust as needed) +IANA_TZ = str(tzlocal.get_localzone()) +DATE_PATTERN = re.compile( + r'(\d{2}|\d{4})-\d{2}-\d{2}[ T]\d{2}:\d{2}:\d{2}\.\d{3}(?:[+-]\d{2}:\d{2}|Z)' +) +TIME_ZONE = os.environ.get("TIME_ZONE", IANA_TZ) + +def convert_date_string(date_str: str, to_tz: str) -> str: + # Normalize 'Z' to '+00:00' + if date_str.endswith('Z'): + date_str = date_str[:-1] + '+00:00' + + year = 0 + try: + dt = datetime.strptime(date_str, '%y-%m-%dT%H:%M:%S.%f%z') + year = 2 + except: + try: + dt = datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S.%f%z') + year = 4 + except: + pass + + formatted = date_str + if year > 0: + # Convert to your desired timezone (example: US/Mountain) + target_tz = ZoneInfo(to_tz) + dt_new = dt.astimezone(target_tz) + + # Format back to original string format + if year == 2: + formatted = dt_new.strftime('%y-%m-%dT%H:%M:%S.%f%z') + elif year == 4: + formatted = dt_new.strftime('%Y-%m-%dT%H:%M:%S.%f%z') + + # Insert colon in timezone offset for ISO 8601 compliance + formatted = formatted[:-2] + ':' + formatted[-2:] + # Truncate microseconds to milliseconds + dot_idx = formatted.find('.') + if dot_idx != -1: + formatted = formatted[:dot_idx+4] + formatted[dot_idx+7:] + + return formatted + +def update_dates_in_tuple(data: Tuple[str, Any], to_tz: str) -> Tuple[str, Any]: + def update_dates(obj: Any, to_tz: str) -> Any: + if isinstance(obj, dict): + return {k: update_dates(v, to_tz) for k, v in obj.items()} + elif isinstance(obj, list): + return [update_dates(item, to_tz) for item in obj] + elif isinstance(obj, tuple): + return tuple(update_dates(item, to_tz) for item in obj) + elif isinstance(obj, str) and DATE_PATTERN.match(obj): + try: + return convert_date_string(obj, to_tz) + except Exception: + return obj + else: + return obj + + key, nested_dict = data + updated_dict = update_dates(nested_dict, to_tz) + return (key, updated_dict) + +def convert(match, tz=TIME_ZONE): + date_str = match.group() + return convert_date_string(date_str=date_str, to_tz=tz) @pytest.mark.dbtest @pytest.mark.parametrize( @@ -26,7 +99,7 @@ def test_database_xml_roundtrip(setup_db_model, model_config): for file in xml_files: doc = model.extract_from_database( - f"input_file_path='{file}'", force_tz="Europe/Paris" + f"input_file_path='{file}'", force_tz=TIME_ZONE ) with open(file, "rt") as f: @@ -43,7 +116,8 @@ def test_database_xml_roundtrip(setup_db_model, model_config): encoding="utf-8", xml_declaration=True, ).decode("utf-8") - + xml = re.sub(DATE_PATTERN, convert, xml) + ref_xml = re.sub(DATE_PATTERN, convert, ref_xml) assert xml == ref_xml @@ -65,16 +139,16 @@ def test_database_document_tree_roundtrip(setup_db_model, model_config): for file in xml_files: doc = model.extract_from_database( - f"input_file_path='{file}'", force_tz="Europe/Paris" + f"input_file_path='{file}'", force_tz=TIME_ZONE ) # parse file to doctree for reference converter = XMLConverter(model) converter.parse_xml(file, file) - assert doc.flat_data_to_doc_tree() == remove_record_hash( + assert update_dates_in_tuple(doc.flat_data_to_doc_tree(), TIME_ZONE) == update_dates_in_tuple(remove_record_hash( converter.document_tree - ) + ), TIME_ZONE) @pytest.mark.dbtest @@ -102,16 +176,16 @@ def test_database_document_tree_roundtrip_single_load(setup_db_model, model_conf for file in xml_files: doc = model.extract_from_database( - f"input_file_path='{file}'", force_tz="Europe/Paris" + f"input_file_path='{file}'", force_tz=TIME_ZONE ) # parse file to doctree for reference converter = XMLConverter(model) converter.parse_xml(file, file) - assert doc.flat_data_to_doc_tree() == remove_record_hash( + assert update_dates_in_tuple(doc.flat_data_to_doc_tree(), TIME_ZONE) == update_dates_in_tuple(remove_record_hash( converter.document_tree - ) + ), TIME_ZONE) @pytest.mark.skip @@ -135,11 +209,13 @@ def test_database_single_document_tree_roundtrip(setup_db_model, model_config): doc.insert_into_target_tables() doc = model.extract_from_database( - f"input_file_path='{file_path}'", force_tz="Europe/Paris" + f"input_file_path='{file_path}'", force_tz=TIME_ZONE ) # parse file to doctree for reference converter = XMLConverter(model) converter.parse_xml(file_path, file_path) - assert doc.flat_data_to_doc_tree() == remove_record_hash(converter.document_tree) + assert update_dates_in_tuple(doc.flat_data_to_doc_tree(), TIME_ZONE) == update_dates_in_tuple(remove_record_hash( + converter.document_tree + ), TIME_ZONE) From c35ecc350314d08c554b002c8a108b4875dfa3e0 Mon Sep 17 00:00:00 2001 From: cre-os Date: Thu, 9 Oct 2025 14:14:31 +0200 Subject: [PATCH 3/3] wip --- .vscode/settings.json | 7 -- requirements.txt | 61 +++++++++-------- src/xml2db/model.py | 8 ++- src/xml2db/table/duplicated_table.py | 8 ++- src/xml2db/table/relations.py | 16 +++-- src/xml2db/table/reused_table.py | 10 ++- src/xml2db/table/table.py | 39 ++++++----- src/xml2db/table/transformed_table.py | 1 + tests/conftest.py | 6 +- tests/test_roundtrip.py | 96 +++------------------------ 10 files changed, 97 insertions(+), 155 deletions(-) delete mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index 9b38853..0000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "python.testing.pytestArgs": [ - "tests" - ], - "python.testing.unittestEnabled": false, - "python.testing.pytestEnabled": true -} \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index b078d5a..7d0d869 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,48 +1,47 @@ babel==2.17.0 -backrefs==5.8 -certifi==2025.1.31 -charset-normalizer==3.4.1 -click==8.1.8 +backrefs==5.9 +certifi==2025.10.5 +charset-normalizer==3.4.3 +click==8.3.0 colorama==0.4.6 -duckdb==1.2.1 +duckdb==1.4.1 duckdb_engine==0.17.0 -elementpath==4.8.0 +elementpath==5.0.4 ghp-import==2.1.0 -greenlet==3.1.1 -griffe==1.7.2 +greenlet==3.2.4 +griffe==1.14.0 idna==3.10 iniconfig==2.1.0 Jinja2==3.1.6 -lxml==5.3.2 -Markdown==3.7 -MarkupSafe==3.0.2 +lxml==6.0.2 +Markdown==3.9 +MarkupSafe==3.0.3 mergedeep==1.3.4 mkdocs==1.6.1 -mkdocs-autorefs==1.4.1 +mkdocs-autorefs==1.4.3 mkdocs-get-deps==0.2.0 -mkdocs-material==9.6.11 +mkdocs-material==9.6.21 mkdocs-material-extensions==1.3.1 -mkdocstrings==0.29.1 -mkdocstrings-python==1.16.10 -packaging==24.2 +mkdocstrings==0.30.1 +mkdocstrings-python==1.18.2 +packaging==25.0 paginate==0.5.7 pathspec==0.12.1 -platformdirs==4.3.7 -pluggy==1.5.0 -psycopg2-binary==2.9.10 -Pygments==2.19.1 -pymdown-extensions==10.14.3 -PyMySQL==1.1.1 +platformdirs==4.4.0 +pluggy==1.6.0 +psycopg2==2.9.10 +Pygments==2.19.2 +pymdown-extensions==10.16.1 +PyMySQL==1.1.2 pyodbc==5.2.0 -pytest==8.3.5 +pytest==8.4.2 python-dateutil==2.9.0.post0 -PyYAML==6.0.2 -pyyaml_env_tag==0.1 -requests==2.32.3 +PyYAML==6.0.3 +pyyaml_env_tag==1.1 +requests==2.32.5 six==1.17.0 -SQLAlchemy==2.0.40 -typing_extensions==4.13.1 -tzlocal==5.3.1 -urllib3==2.3.0 +SQLAlchemy==2.0.43 +typing_extensions==4.15.0 +urllib3==2.5.0 watchdog==6.0.0 -xmlschema==3.4.5 +xmlschema==4.1.0 diff --git a/src/xml2db/model.py b/src/xml2db/model.py index 1d94c4d..70f8c0e 100644 --- a/src/xml2db/model.py +++ b/src/xml2db/model.py @@ -196,8 +196,12 @@ def _create_table_model( A data model instance. """ table_config = self.tables_config.get(table_name, {}) - table_config["shorten_temp_table_names"] = self.model_config["shorten_temp_table_names"] - table_config["shorten_rel_table_names"] = self.model_config["shorten_rel_table_names"] + table_config["shorten_temp_table_names"] = self.model_config[ + "shorten_temp_table_names" + ] + table_config["shorten_rel_table_names"] = self.model_config[ + "shorten_rel_table_names" + ] if table_config.get("reuse", True): return DataModelTableReused( table_name, diff --git a/src/xml2db/table/duplicated_table.py b/src/xml2db/table/duplicated_table.py index 5a52ba8..8ed486e 100644 --- a/src/xml2db/table/duplicated_table.py +++ b/src/xml2db/table/duplicated_table.py @@ -119,8 +119,12 @@ def get_col(temp=False) -> Iterable[Column]: ) ) - temp_table_name = f"{prefix}{self.name}" - temp_table_name = self.truncate_long_name(temp_table_name) if self.config.get("shorten_temp_table_names") else temp_table_name + temp_table_name = f"{prefix}{self.name}" + temp_table_name = ( + self.truncate_long_name(temp_table_name) + if self.config.get("shorten_table_names") + else temp_table_name + ) # build temporary table self.temp_table = Table( diff --git a/src/xml2db/table/relations.py b/src/xml2db/table/relations.py index 235e792..fbd5dd3 100644 --- a/src/xml2db/table/relations.py +++ b/src/xml2db/table/relations.py @@ -101,9 +101,13 @@ def build_relation_tables(self) -> None: ) prefix = f"temp_{self.table.temp_prefix}_" if self.other_table.is_reused: - temp_table_name = f"{prefix}{self.rel_table_name}" - temp_table_name = self.table.truncate_long_name(temp_table_name) if self.table.config.get("shorten_temp_table_names") else temp_table_name - + temp_table_name = f"{prefix}{self.rel_table_name}" + temp_table_name = ( + self.table.truncate_long_name(temp_table_name) + if self.table.config.get("shorten_temp_table_names") + else temp_table_name + ) + self.temp_rel_table = Table( temp_table_name, self.table.metadata, @@ -135,7 +139,11 @@ def build_relation_tables(self) -> None: ), ) - table_name = self.table.truncate_long_name(self.rel_table_name) if self.table.config.get("shorten_rel_table_names") else self.rel_table_name + table_name = ( + self.table.truncate_long_name(self.rel_table_name) + if self.table.config.get("shorten_rel_table_names") + else self.rel_table_name + ) self.rel_table = Table( table_name, self.table.metadata, diff --git a/src/xml2db/table/reused_table.py b/src/xml2db/table/reused_table.py index c13ff77..0fd41d4 100644 --- a/src/xml2db/table/reused_table.py +++ b/src/xml2db/table/reused_table.py @@ -15,12 +15,14 @@ from .column import DataModelColumn from .transformed_table import DataModelTableTransformed + def shorten_str(x: str, max_len: int = 30) -> str: if len(x) > max_len: h = sha1(x.encode("utf8")) return f"{x[:(max_len - 7)]}_{h.hexdigest()[1:6]}" return x + class DataModelTableReused(DataModelTableTransformed): """A table data model which de-duplicates records in the database based on their hash value. @@ -132,8 +134,12 @@ def get_col(temp=False): ) ) - temp_table_name = f"{prefix}{self.name}" - temp_table_name = self.truncate_long_name(temp_table_name) if self.config.get("shorten_temp_table_names") else temp_table_name + temp_table_name = f"{prefix}{self.name}" + temp_table_name = ( + self.truncate_long_name(temp_table_name) + if self.config.get("shorten_temp_table_names") + else temp_table_name + ) # build temporary table self.temp_table = Table( diff --git a/src/xml2db/table/table.py b/src/xml2db/table/table.py index f4517f7..0540e24 100644 --- a/src/xml2db/table/table.py +++ b/src/xml2db/table/table.py @@ -1,5 +1,5 @@ import hashlib -import base64 +import re from typing import Iterable, List, Any, Union, TYPE_CHECKING import logging import sqlalchemy @@ -15,6 +15,7 @@ logger = logging.getLogger(__name__) + class DataModelTable: """A class representing a database table translated from an XML schema complex type @@ -100,8 +101,9 @@ def _validate_config(self, cfg, db_type): config = { "reuse": check_type(cfg, "reuse", bool, True), "as_columnstore": check_type(cfg, "as_columnstore", bool, False), - "shorten_temp_table_names": check_type(cfg, "shorten_temp_table_names", bool, False), - "shorten_rel_table_names": check_type(cfg, "shorten_rel_table_names", bool, False) + "shorten_table_names": check_type( + cfg, "shorten_table_names", bool, db_type == "postgresql" + ), } if "extra_args" in cfg and not ( isinstance(cfg["extra_args"], list) @@ -120,7 +122,7 @@ def _validate_config(self, cfg, db_type): logger.warning( "Clustered columnstore indexes are only supported with MS SQL Server database" ) - + config["fields"] = cfg.get("fields", {}) return config @@ -409,28 +411,33 @@ def get_entity_rel_diagram(self) -> List: + ["}"] ) return [f" {line}" for line in out] - + def truncate_long_name(self, table_name: str) -> str: - max_len = 63 #both postgres and mysql safe table name len + max_len = 63 # both postgres and mysql safe table name len new_name = table_name - - short_name = "" - shorter_name = "" + is_tmp = "temp" in table_name suffix = f"_{hashlib.md5(table_name.encode('utf-8')).hexdigest()}" if len(table_name) > max_len: - words = table_name.split("_") + # extract words for camelCase and snake_case identifiers + s = re.sub(r"(?<=[a-z0-9])([A-Z])", r"_\1", table_name) + s = re.sub(r"([A-Z]+)([A-Z][a-z])", r"\1_\2", s) + words = [word for word in s.split("_") if word] + short_name = "" + shorter_name = "" for word in words: - if len(short_name) + len(word)<= (max_len - 1): - if len(short_name) > 0: short_name += "_" + if len(short_name) + len(word) <= (max_len - 1): + if len(short_name) > 0: + short_name += "_" short_name += f"{word}" if len(shorter_name) + len(word) <= (max_len - 10): - if len(shorter_name) > 0: shorter_name += "_" + if len(shorter_name) > 0: + shorter_name += "_" shorter_name += f"{word}" - #check if sliced name already exists: + # check if sliced name already exists: sentinel = False if is_tmp: # just cut the name up and append the full suffix @@ -446,8 +453,8 @@ def truncate_long_name(self, table_name: str) -> str: if relation.rel_table_name == short_name: sentinel = True break - - # an existing table or relation was found: append a + + # an existing table or relation was found: append a # random-ish suffix to help prevent name collisions if sentinel: # create a more useable/legible short table name diff --git a/src/xml2db/table/transformed_table.py b/src/xml2db/table/transformed_table.py index c88f763..031d858 100644 --- a/src/xml2db/table/transformed_table.py +++ b/src/xml2db/table/transformed_table.py @@ -5,6 +5,7 @@ from .relations import DataModelRelation1, DataModelRelationN from .table import DataModelTable + class DataModelTableTransformed(DataModelTable): """A class extending DataModelTable with transformations diff --git a/tests/conftest.py b/tests/conftest.py index 5ce6745..343fecb 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -37,8 +37,4 @@ def setup_db_model(conn_string, model_config): yield model - try: - model.drop_all_tables() - except Exception as e: - print(f"Unable to drop all tables: {e}") - pass + model.drop_all_tables() diff --git a/tests/test_roundtrip.py b/tests/test_roundtrip.py index f307d82..8f63496 100644 --- a/tests/test_roundtrip.py +++ b/tests/test_roundtrip.py @@ -7,79 +7,6 @@ from .conftest import list_xml_path from .sample_models import models -import re -import tzlocal -from datetime import datetime -from zoneinfo import ZoneInfo -from typing import Any, Tuple - -# Regex for ISO-like datetime with timezone (adjust as needed) -IANA_TZ = str(tzlocal.get_localzone()) -DATE_PATTERN = re.compile( - r'(\d{2}|\d{4})-\d{2}-\d{2}[ T]\d{2}:\d{2}:\d{2}\.\d{3}(?:[+-]\d{2}:\d{2}|Z)' -) -TIME_ZONE = os.environ.get("TIME_ZONE", IANA_TZ) - -def convert_date_string(date_str: str, to_tz: str) -> str: - # Normalize 'Z' to '+00:00' - if date_str.endswith('Z'): - date_str = date_str[:-1] + '+00:00' - - year = 0 - try: - dt = datetime.strptime(date_str, '%y-%m-%dT%H:%M:%S.%f%z') - year = 2 - except: - try: - dt = datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S.%f%z') - year = 4 - except: - pass - - formatted = date_str - if year > 0: - # Convert to your desired timezone (example: US/Mountain) - target_tz = ZoneInfo(to_tz) - dt_new = dt.astimezone(target_tz) - - # Format back to original string format - if year == 2: - formatted = dt_new.strftime('%y-%m-%dT%H:%M:%S.%f%z') - elif year == 4: - formatted = dt_new.strftime('%Y-%m-%dT%H:%M:%S.%f%z') - - # Insert colon in timezone offset for ISO 8601 compliance - formatted = formatted[:-2] + ':' + formatted[-2:] - # Truncate microseconds to milliseconds - dot_idx = formatted.find('.') - if dot_idx != -1: - formatted = formatted[:dot_idx+4] + formatted[dot_idx+7:] - - return formatted - -def update_dates_in_tuple(data: Tuple[str, Any], to_tz: str) -> Tuple[str, Any]: - def update_dates(obj: Any, to_tz: str) -> Any: - if isinstance(obj, dict): - return {k: update_dates(v, to_tz) for k, v in obj.items()} - elif isinstance(obj, list): - return [update_dates(item, to_tz) for item in obj] - elif isinstance(obj, tuple): - return tuple(update_dates(item, to_tz) for item in obj) - elif isinstance(obj, str) and DATE_PATTERN.match(obj): - try: - return convert_date_string(obj, to_tz) - except Exception: - return obj - else: - return obj - - key, nested_dict = data - updated_dict = update_dates(nested_dict, to_tz) - return (key, updated_dict) - -def convert(match, tz=TIME_ZONE): - date_str = match.group() - return convert_date_string(date_str=date_str, to_tz=tz) @pytest.mark.dbtest @pytest.mark.parametrize( @@ -99,7 +26,7 @@ def test_database_xml_roundtrip(setup_db_model, model_config): for file in xml_files: doc = model.extract_from_database( - f"input_file_path='{file}'", force_tz=TIME_ZONE + f"input_file_path='{file}'", force_tz="Europe/Paris" ) with open(file, "rt") as f: @@ -116,8 +43,7 @@ def test_database_xml_roundtrip(setup_db_model, model_config): encoding="utf-8", xml_declaration=True, ).decode("utf-8") - xml = re.sub(DATE_PATTERN, convert, xml) - ref_xml = re.sub(DATE_PATTERN, convert, ref_xml) + assert xml == ref_xml @@ -139,16 +65,16 @@ def test_database_document_tree_roundtrip(setup_db_model, model_config): for file in xml_files: doc = model.extract_from_database( - f"input_file_path='{file}'", force_tz=TIME_ZONE + f"input_file_path='{file}'", force_tz="Europe/Paris" ) # parse file to doctree for reference converter = XMLConverter(model) converter.parse_xml(file, file) - assert update_dates_in_tuple(doc.flat_data_to_doc_tree(), TIME_ZONE) == update_dates_in_tuple(remove_record_hash( + assert doc.flat_data_to_doc_tree() == remove_record_hash( converter.document_tree - ), TIME_ZONE) + ) @pytest.mark.dbtest @@ -176,16 +102,16 @@ def test_database_document_tree_roundtrip_single_load(setup_db_model, model_conf for file in xml_files: doc = model.extract_from_database( - f"input_file_path='{file}'", force_tz=TIME_ZONE + f"input_file_path='{file}'", force_tz="Europe/Paris" ) # parse file to doctree for reference converter = XMLConverter(model) converter.parse_xml(file, file) - assert update_dates_in_tuple(doc.flat_data_to_doc_tree(), TIME_ZONE) == update_dates_in_tuple(remove_record_hash( + assert doc.flat_data_to_doc_tree() == remove_record_hash( converter.document_tree - ), TIME_ZONE) + ) @pytest.mark.skip @@ -209,13 +135,11 @@ def test_database_single_document_tree_roundtrip(setup_db_model, model_config): doc.insert_into_target_tables() doc = model.extract_from_database( - f"input_file_path='{file_path}'", force_tz=TIME_ZONE + f"input_file_path='{file_path}'", force_tz="Europe/Paris" ) # parse file to doctree for reference converter = XMLConverter(model) converter.parse_xml(file_path, file_path) - assert update_dates_in_tuple(doc.flat_data_to_doc_tree(), TIME_ZONE) == update_dates_in_tuple(remove_record_hash( - converter.document_tree - ), TIME_ZONE) + assert doc.flat_data_to_doc_tree() == remove_record_hash(converter.document_tree)