diff --git a/.gitignore b/.gitignore index aa54c11..efede9f 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,6 @@ dist venv .spyproject .idea -site \ No newline at end of file +site +.env +uv.lock diff --git a/pyproject.toml b/pyproject.toml index 6360f7f..71b8729 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "xml2db" -version = "0.12.5" +version = "0.12.6" authors = [ { name="Commission de régulation de l'énergie", email="opensource@cre.fr" }, ] @@ -36,3 +36,6 @@ markers = [ "dbtest: marks tests as integration tests requiring a database backend (deselect with '-m \"not dbtest\"')", ] junit_family = "xunit2" + +[tool.uv] +package = true diff --git a/src/xml2db/model.py b/src/xml2db/model.py index fd666d4..70f8c0e 100644 --- a/src/xml2db/model.py +++ b/src/xml2db/model.py @@ -143,6 +143,8 @@ def _validate_config(self, cfg): for key, exp_type, default in [ ("as_columnstore", bool, False), ("row_numbers", bool, False), + ("shorten_temp_table_names", bool, False), + ("shorten_rel_table_names", bool, False), ("document_tree_hook", callable, None), ("document_tree_node_hook", callable, None), ("record_hash_column_name", str, "xml2db_record_hash"), @@ -194,6 +196,12 @@ def _create_table_model( A data model instance. 
""" table_config = self.tables_config.get(table_name, {}) + table_config["shorten_temp_table_names"] = self.model_config[ + "shorten_temp_table_names" + ] + table_config["shorten_rel_table_names"] = self.model_config[ + "shorten_rel_table_names" + ] if table_config.get("reuse", True): return DataModelTableReused( table_name, diff --git a/src/xml2db/table/duplicated_table.py b/src/xml2db/table/duplicated_table.py index 72a5b03..8ed486e 100644 --- a/src/xml2db/table/duplicated_table.py +++ b/src/xml2db/table/duplicated_table.py @@ -119,9 +119,16 @@ def get_col(temp=False) -> Iterable[Column]: ) ) + temp_table_name = f"{prefix}{self.name}" + temp_table_name = ( + self.truncate_long_name(temp_table_name) + if self.config.get("shorten_table_names") + else temp_table_name + ) + # build temporary table self.temp_table = Table( - f"{prefix}{self.name}", + temp_table_name, self.metadata, Column(f"pk_{self.name}", Integer), *get_col(temp=True), diff --git a/src/xml2db/table/relations.py b/src/xml2db/table/relations.py index 96245d1..fbd5dd3 100644 --- a/src/xml2db/table/relations.py +++ b/src/xml2db/table/relations.py @@ -101,8 +101,15 @@ def build_relation_tables(self) -> None: ) prefix = f"temp_{self.table.temp_prefix}_" if self.other_table.is_reused: + temp_table_name = f"{prefix}{self.rel_table_name}" + temp_table_name = ( + self.table.truncate_long_name(temp_table_name) + if self.table.config.get("shorten_temp_table_names") + else temp_table_name + ) + self.temp_rel_table = Table( - f"{prefix}{self.rel_table_name}", + temp_table_name, self.table.metadata, Column(f"temp_fk_{self.table.name}", Integer, nullable=False), Column(f"fk_{self.table.name}", Integer), @@ -132,8 +139,13 @@ def build_relation_tables(self) -> None: ), ) + table_name = ( + self.table.truncate_long_name(self.rel_table_name) + if self.table.config.get("shorten_rel_table_names") + else self.rel_table_name + ) self.rel_table = Table( - self.rel_table_name, + table_name, self.table.metadata, Column( 
def truncate_long_name(self, table_name: str) -> str:
    """Return a table name that fits within the identifier length limit.

    Names of at most 63 characters are returned unchanged. Longer names are
    split into words on underscores and camelCase boundaries, and as many
    leading words as fit are joined back together with underscores. A suffix
    made of an underscore and the first 8 hex digits of the MD5 digest of the
    full original name is appended when the name contains "temp", or when the
    shortened name equals the name of a table or relation table already
    present in the data model.

    Args:
        table_name: the full table name to shorten

    Returns:
        The original name if it is short enough, otherwise a name of at most
        63 characters.
    """
    max_len = 63  # both postgres and mysql safe table name len
    if len(table_name) <= max_len:
        return table_name

    # the digest is computed on the full name, so it stays stable between runs
    digest = hashlib.md5(table_name.encode("utf-8")).hexdigest()
    suffix = f"_{digest[:8]}"

    # extract words for camelCase and snake_case identifiers
    split_name = re.sub(r"(?<=[a-z0-9])([A-Z])", r"_\1", table_name)
    split_name = re.sub(r"([A-Z]+)([A-Z][a-z])", r"\1_\2", split_name)
    words = [word for word in split_name.split("_") if word]

    def fit_words(limit: int) -> str:
        # Keep leading words only: stopping at the first word that does not
        # fit avoids names with a silently dropped middle part.
        kept = []
        used = 0
        for word in words:
            extra = len(word) + (1 if kept else 0)
            if used + extra > limit:
                break
            kept.append(word)
            used += extra
        # if not even the first word fits, fall back to a plain cut so that
        # the result is never empty
        return "_".join(kept) or table_name[:limit]

    short_name = fit_words(max_len)

    # Temporary table names don't need to be human readable, so they always
    # get the hash suffix and the data model lookup is skipped.
    # NOTE(review): this is a substring test, so any name containing "temp"
    # is handled as a temporary table - confirm this is intended.
    needs_suffix = "temp" in table_name
    if not needs_suffix:
        # NOTE(review): only `name` and `rel_table_name` are compared here;
        # two long names sharing the same leading words would presumably get
        # the same shortened name - verify against callers.
        needs_suffix = any(
            tbl.name == short_name
            or any(
                relation.rel_table_name == short_name
                for relation in tbl.relations_n.values()
            )
            for tbl in self.data_model.tables.values()
        )

    if not needs_suffix:
        # nothing was found so we can just run with the short name
        return short_name

    # leave room for the suffix so the final name still fits in max_len
    return f"{fit_words(max_len - len(suffix))}{suffix}"