diff --git a/jupyter_server_fileid/manager.py b/jupyter_server_fileid/manager.py index fc67f60..92962b1 100644 --- a/jupyter_server_fileid/manager.py +++ b/jupyter_server_fileid/manager.py @@ -1,3 +1,5 @@ +import datetime +import importlib import os import posixpath import sqlite3 @@ -5,7 +7,7 @@ import time import uuid from abc import ABC, ABCMeta, abstractmethod -from typing import Any, Callable, Dict, Optional +from typing import Any, Callable, Dict, Generator, List, Optional from jupyter_core.paths import jupyter_data_dir from traitlets import TraitError, Unicode, validate @@ -65,6 +67,8 @@ class BaseFileIdManager(ABC, LoggingConfigurable, metaclass=FileIdManagerMeta): config=True, ) + con: Optional[sqlite3.Connection] + @validate("db_path") def _validate_db_path(self, proposal): if proposal["value"] is None: @@ -95,6 +99,9 @@ def _from_normalized_path(self, path: Optional[str]) -> Optional[str]: def _move_recursive(self, old_path: str, new_path: str, path_mgr: Any = os.path) -> None: """Move all children of a given directory at `old_path` to a new directory at `new_path`, delimited by `sep`.""" + if not hasattr(self, "con") or self.con is None: + return + old_path_glob = old_path + path_mgr.sep + "*" records = self.con.execute( "SELECT id, path FROM Files WHERE path GLOB ?", (old_path_glob,) @@ -108,6 +115,9 @@ def _move_recursive(self, old_path: str, new_path: str, path_mgr: Any = os.path) def _copy_recursive(self, from_path: str, to_path: str, path_mgr: Any = os.path) -> None: """Copy all children of a given directory at `from_path` to a new directory at `to_path`, delimited by `sep`.""" + if not hasattr(self, "con") or self.con is None: + return + from_path_glob = from_path + path_mgr.sep + "*" records = self.con.execute( "SELECT path FROM Files WHERE path GLOB ?", (from_path_glob,) @@ -122,9 +132,106 @@ def _copy_recursive(self, from_path: str, to_path: str, path_mgr: Any = os.path) def _delete_recursive(self, path: str, path_mgr: Any = os.path) -> None: """Delete all children of a given directory, delimited by `sep`.""" + if not hasattr(self, "con") or self.con is None: + return + path_glob = path + path_mgr.sep + "*" self.con.execute("DELETE FROM Files WHERE path GLOB ?", (path_glob,)) + def _write_manifest(self) -> None: + """Writes a manifest to a database containing the current manager's + import path.""" + if not hasattr(self, "con") or self.con is None: + return + + manager_module = self.__class__.__module__ + manager_classname = self.__class__.__qualname__ + self.con.execute( + "CREATE TABLE Manifest(" + "manager_module TEXT NOT NULL, " + "manager_classname TEXT NOT NULL" + ")" + ) + self.con.execute( + "INSERT INTO Manifest (manager_module, manager_classname) VALUES (?, ?)", + (manager_module, manager_classname), + ) + + def _check_manifest(self) -> bool: + """ + Returns whether the database file's manifest matches the expected + manifest for this class. If one does not exist, writes the manifest and + returns True. + """ + if not hasattr(self, "con") or self.con is None: + return False + + manifest = None + manager_module = self.__class__.__module__ + manager_classname = self.__class__.__qualname__ + + try: + manifest = self.con.execute( + "SELECT manager_module, manager_classname FROM Manifest" + ).fetchone() + except Exception: + pass + + if manifest is None: + self._write_manifest() + return True + + manifest_module, manifest_classname = manifest + return manifest_module == manager_module and manifest_classname == manager_classname + + def _migrate_database(self) -> None: + """Checks whether the database file's manifest matches the expected + manifest for this class. Writes the manifest if one does not exist. + If one already exists and is incompatible with the expected manifest, + then this method backs up the existing database file into a separate + file, and then migrates the database over via calling `export_row()` on + the old manager and passing the yielded rows to `self.import_rows()`. + """ + if not hasattr(self, "con") or self.con is None: + return + + manifest = self.con.execute( + "SELECT manager_module, manager_classname FROM Manifest" + ).fetchone() + prev_module, prev_classname = manifest + + # first, backup the database at backup_db_path. + db_path_dir, db_path_basename = os.path.split(self.db_path) + backup_db_path = os.path.join( + db_path_dir, datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S-") + db_path_basename + ) + self.con.close() + os.rename(self.db_path, backup_db_path) + + # recreate files table and indices + self.con = sqlite3.connect(self.db_path) + self._write_manifest() + self._create_tables() + + # finally, migrate the database from backup_db_path to the current database. + PrevManager = getattr(importlib.import_module(prev_module), prev_classname) + for row in PrevManager.export_rows(backup_db_path): + self.import_rows(row) + + @staticmethod + @abstractmethod + def export_rows(db_path: str) -> Generator[List[Dict[str, Any]], None, None]: + """Exports the Files table by returning a generator that yields a list + of rows, each encoded as a dictionary. This dictionary must at least + contain `id` and `path`. + """ + pass + + @abstractmethod + def import_rows(self, row: List[Dict[str, Any]]) -> None: + """Imports a row yielded from `export_rows()` into the Files table.""" + pass + @abstractmethod def index(self, path: str) -> Optional[str]: """Returns the file ID for the file corresponding to `path`. @@ -220,35 +327,61 @@ class ArbitraryFileIdManager(BaseFileIdManager): Server 2. """ + con: sqlite3.Connection + @validate("root_dir") def _validate_root_dir(self, proposal): - # Convert root_dir to an api path, since that's essentially what we persist. + """Converts root_dir to a valid API path delimited by forward + slashes.""" if proposal["value"] is None: return "" + root_dir = proposal["value"] + + # if path is local filesystem path, normalize the case first + if os.path.isabs(root_dir): + root_dir = os.path.normcase(root_dir) + + # finally, normalize separators (convert all \ => /) + root_dir = self._normalize_separators(root_dir) + return root_dir - normalized_content_root = self._normalize_separators(proposal["value"]) - return normalized_content_root + def _create_tables(self): + self.con.execute( + "CREATE TABLE IF NOT EXISTS Files(" + "id TEXT PRIMARY KEY NOT NULL, " + "path TEXT NOT NULL UNIQUE" + ")" + ) + self.con.execute("CREATE INDEX IF NOT EXISTS ix_Files_path ON Files (path)") def __init__(self, *args, **kwargs): # pass args and kwargs to parent Configurable super().__init__(*args, **kwargs) + # initialize instance attrs self._update_cursor = False + # initialize connection with db self.log.info(f"ArbitraryFileIdManager : Configured root dir: {self.root_dir}") self.log.info(f"ArbitraryFileIdManager : Configured database path: {self.db_path}") + self.con = sqlite3.connect(self.db_path) - self.log.info("ArbitraryFileIdManager : Successfully connected to database file.") - self.log.info("ArbitraryFileIdManager : Creating File ID tables and indices.") # do not allow reads to block writes. required when using multiple processes self.con.execute("PRAGMA journal_mode = WAL") - self.con.execute( - "CREATE TABLE IF NOT EXISTS Files(" - "id TEXT PRIMARY KEY NOT NULL, " - "path TEXT NOT NULL UNIQUE" - ")" - ) - self.con.execute("CREATE INDEX IF NOT EXISTS ix_Files_path ON Files (path)") + + self.log.info("ArbitraryFileIdManager : Successfully connected to database file.") + + if self._check_manifest(): + # ensure tables and indices are created + self.log.info("ArbitraryFileIdManager : Creating File ID tables and indices.") + self._create_tables() + else: + # migrate database from old manager to current + self.log.info( + "ArbitraryFileIdManager : Database from different File ID manager detected. Migrating tables and indices." + ) + self._migrate_database() + self.con.commit() @staticmethod @@ -264,7 +397,6 @@ def _normalize_path(self, path): # use commonprefix instead of commonpath, since root_dir may not be a # absolute POSIX path. - # norm_root_dir = self._normalize_separators(self.root_dir) path = self._normalize_separators(path) if posixpath.commonprefix([self.root_dir, path]) != self.root_dir: path = posixpath.join(self.root_dir, path) @@ -278,20 +410,35 @@ def _from_normalized_path(self, path: Optional[str]) -> Optional[str]: if path is None: return None - # Convert root_dir to an api path, since that's essentially what we persist. - # norm_root_dir = self._normalize_separators(self.root_dir) if posixpath.commonprefix([self.root_dir, path]) != self.root_dir: return None relpath = posixpath.relpath(path, self.root_dir) return relpath - def _create(self, path: str) -> str: + def _create(self, path: str, id: Optional[str] = None) -> str: path = self._normalize_path(path) - id = self._uuid() + if id is None: + id = self._uuid() self.con.execute("INSERT INTO Files (id, path) VALUES (?, ?)", (id, path)) return id + @staticmethod + def export_rows(db_path: str) -> Generator[List[Dict[str, Any]], None, None]: + con = sqlite3.connect(db_path) + cursor = con.execute("SELECT id, path FROM Files") + row = cursor.fetchone() + while row is not None: + row_dict = {"id": row[0], "path": row[1]} + yield [row_dict] + row = cursor.fetchone() + + def import_rows(self, rows: List[Dict[str, Any]]) -> None: + for row in rows: + id = row["id"] + path = self._normalize_path(row["path"]) + self._create(path, id) + def index(self, path: str) -> str: path = self._normalize_path(path) row = self.con.execute("SELECT id FROM Files WHERE path = ?", (path,)).fetchone() @@ -384,6 +531,8 @@ class LocalFileIdManager(BaseFileIdManager): performed during a method's procedure body. """ + con: sqlite3.Connection + @validate("root_dir") def _validate_root_dir(self, proposal): if proposal["value"] is None: @@ -394,20 +543,7 @@ def _validate_root_dir(self, proposal): ) return proposal["value"] - def __init__(self, *args, **kwargs): - # pass args and kwargs to parent Configurable - super().__init__(*args, **kwargs) - # initialize instance attrs - self._update_cursor = False - self._last_sync = 0.0 - # initialize connection with db - self.log.info(f"LocalFileIdManager : Configured root dir: {self.root_dir}") - self.log.info(f"LocalFileIdManager : Configured database path: {self.db_path}") - self.con = sqlite3.connect(self.db_path) - self.log.info("LocalFileIdManager : Successfully connected to database file.") - self.log.info("LocalFileIdManager : Creating File ID tables and indices.") - # do not allow reads to block writes. required when using multiple processes - self.con.execute("PRAGMA journal_mode = WAL") + def _create_tables(self): self.con.execute( "CREATE TABLE IF NOT EXISTS Files(" "id TEXT PRIMARY KEY NOT NULL, " @@ -420,10 +556,42 @@ def __init__(self, *args, **kwargs): "is_dir TINYINT NOT NULL" ")" ) - self._index_all() # no need to index ino as it is autoindexed by sqlite via UNIQUE constraint self.con.execute("CREATE INDEX IF NOT EXISTS ix_Files_path ON Files (path)") self.con.execute("CREATE INDEX IF NOT EXISTS ix_Files_is_dir ON Files (is_dir)") + + def __init__(self, *args, **kwargs): + # pass args and kwargs to parent Configurable + super().__init__(*args, **kwargs) + + # initialize instance attrs + self._update_cursor = False + self._last_sync = 0.0 + + self.log.info(f"LocalFileIdManager : Configured root dir: {self.root_dir}") + self.log.info(f"LocalFileIdManager : Configured database path: {self.db_path}") + + # initialize connection with db + self.con = sqlite3.connect(self.db_path) + # do not allow reads to block writes. required when using multiple processes + self.con.execute("PRAGMA journal_mode = WAL") + + self.log.info("LocalFileIdManager : Successfully connected to database file.") + + if self._check_manifest(): + # ensure tables and indices are created + self.log.info("LocalFileIdManager : Creating File ID tables and indices.") + self._create_tables() + else: + # migrate database from old manager to current + self.log.info( + "LocalFileIdManager : Database from different File ID manager detected. Migrating tables and indices." + ) + self._migrate_database() + + # index all directories under content root + self._index_all() + self.con.commit() def _normalize_path(self, path): @@ -632,7 +800,7 @@ def _stat(self, path): return self._parse_raw_stat(raw_stat) - def _create(self, path, stat_info): + def _create(self, path, stat_info, id=None): """Creates a record given its path and stat info. Returns the new file ID. @@ -642,7 +810,8 @@ def _create(self, path, stat_info): dangerous and may throw a runtime error if the file is not guaranteed to have a unique `ino`. """ - id = self._uuid() + if not id: + id = self._uuid() self.con.execute( "INSERT INTO Files (id, path, ino, crtime, mtime, is_dir) VALUES (?, ?, ?, ?, ?, ?)", (id, path, stat_info.ino, stat_info.crtime, stat_info.mtime, stat_info.is_dir), @@ -685,6 +854,33 @@ def _update(self, id, stat_info=None, path=None): ) return + @staticmethod + def export_rows(db_path: str) -> Generator[List[Dict[str, Any]], None, None]: + con = sqlite3.connect(db_path) + cursor = con.execute("SELECT id, path, ino, crtime, mtime, is_dir FROM Files") + row = cursor.fetchone() + while row is not None: + row_dict = { + "id": row[0], + "path": row[1], + "ino": row[2], + "crtime": row[3], + "mtime": row[4], + "is_dir": row[5], + } + yield [row_dict] + row = cursor.fetchone() + + def import_rows(self, rows: List[Dict[str, Any]]) -> None: + for row in rows: + id = row["id"] + path = self._normalize_path(row["path"]) + stat_info = self._stat(path) + if stat_info is None: + return + + self._create(path, stat_info, id) + def index(self, path, stat_info=None, commit=True): """Returns the file ID for the file at `path`, creating a new file ID if one does not exist. Returns None only if file does not exist at path.""" diff --git a/tests/test_manager.py b/tests/test_manager.py index 5565390..4b45a22 100644 --- a/tests/test_manager.py +++ b/tests/test_manager.py @@ -91,6 +91,33 @@ def _normalize_path_arbitrary(fid_manager, path): return path +def test_migrate_arbitrary_to_local(fid_db_path, jp_root_dir, test_path, test_path_child): + arbitrary = ArbitraryFileIdManager(db_path=fid_db_path, root_dir=str(jp_root_dir)) + arbitrary.con.execute("PRAGMA journal_mode = off") + + id_1 = arbitrary.index(test_path) + id_2 = arbitrary.index(test_path_child) + del arbitrary + local = LocalFileIdManager(db_path=fid_db_path, root_dir=str(jp_root_dir)) + + assert local.get_path(id_1) == test_path + assert local.get_path(id_2) == test_path_child + + +def test_migrate_local_to_arbitrary(fid_db_path, jp_root_dir, test_path, test_path_child): + local = LocalFileIdManager(db_path=fid_db_path, root_dir=str(jp_root_dir)) + local.con.execute("PRAGMA journal_mode = off") + + id_1 = local.index(test_path) + id_2 = local.index(test_path_child) + del local + # need to normalize the arbitrary content root to produce correct API paths + arbitrary = ArbitraryFileIdManager(db_path=fid_db_path, root_dir=str(jp_root_dir)) + + assert arbitrary.get_path(id_1) == test_path + assert arbitrary.get_path(id_2) == test_path_child + + def get_id_nosync(fid_manager, path): # We need to first put the path into a form the fileId manager implementation will for persistence. if isinstance(fid_manager, LocalFileIdManager):