|
| 1 | +#!/usr/bin/env python3 |
| 2 | +"""Validate module-level imports are limited to Python's standard library. |
| 3 | +
|
| 4 | +This script enforces the repository’s import guard strategy: third-party or |
| 5 | +heavy dependencies must be imported within the function or pipeline body rather |
| 6 | +than at module import time. |
| 7 | +
|
| 8 | +""" |
| 9 | + |
| 10 | +from __future__ import annotations |
| 11 | + |
| 12 | +import argparse |
| 13 | +import ast |
| 14 | +import json |
| 15 | +import os |
| 16 | +import pkgutil |
| 17 | +import sys |
| 18 | +import sysconfig |
| 19 | +from pathlib import Path |
| 20 | +from typing import Dict, Iterable, List, Optional, Sequence, Set, Tuple |
| 21 | + |
| 22 | + |
# Default location of the JSON allow-list; used as the default for --config.
DEFAULT_CONFIG_PATH = Path("scripts/import_exceptions.json")
# File names looked up inside an asset root by ensure_dependency_files().
DEFAULT_REQUIREMENT_FILES = ("dev-requirements.txt", "test-requirements.txt")
| 25 | + |
| 26 | + |
class ImportGuardConfig:
    """Holds allow-list data for the import guard.

    Two allow lists are kept, both storing top-level module names:

    * ``module_allowlist`` -- modules permitted at top level in every file.
    * ``file_allowlist`` -- resolved file or directory paths mapped to the
      modules permitted in that file, or in any file below that directory.
    """

    def __init__(
        self,
        module_allowlist: Optional[Iterable[str]] = None,
        file_allowlist: Optional[Dict[str, Iterable[str]]] = None,
    ) -> None:
        """Initialize configuration from module and path allow lists.

        Args:
            module_allowlist: Module names allowed everywhere. Dotted names
                are reduced to their top-level package.
            file_allowlist: Mapping of file/directory path to the module
                names allowed there. Paths are resolved to absolute paths.
        """
        self.module_allowlist: Set[str] = {canonicalize_module_name(item) for item in module_allowlist or []}
        self.file_allowlist: Dict[Path, Set[str]] = {}
        for raw_path, modules in (file_allowlist or {}).items():
            normalized = Path(raw_path).resolve()
            # Merge rather than overwrite: two differently spelled keys may
            # resolve to the same path.
            self.file_allowlist.setdefault(normalized, set()).update(
                canonicalize_module_name(mod) for mod in modules
            )

    @classmethod
    def from_path(cls, path: Path) -> "ImportGuardConfig":
        """Instantiate configuration from a JSON file if it exists.

        A missing file yields an empty configuration. The JSON document is
        read for the optional keys ``"modules"`` and ``"files"``.
        """
        if not path.exists():
            return cls()
        with path.open("r", encoding="utf-8") as handle:
            data = json.load(handle)
        modules = data.get("modules", [])
        files = data.get("files", {})
        return cls(modules, files)

    def is_allowed(self, module: str, file_path: Path) -> bool:
        """Return True when a module is allow-listed for the given file path.

        The module is allowed when it is in the global allow list, or when
        any path entry that equals ``file_path`` or is an existing directory
        above it lists the module.
        """
        canonical_module = canonicalize_module_name(module)
        if canonical_module in self.module_allowlist:
            return True
        resolved = file_path.resolve()
        for path, modules in self.file_allowlist.items():
            # Keep scanning when an entry does not list the module: another
            # (more or less specific) entry covering this file may allow it.
            if canonical_module not in modules:
                continue
            if path == resolved or (path.is_dir() and path in resolved.parents):
                return True
        return False
| 65 | + |
| 66 | + |
def canonicalize_module_name(name: str) -> str:
    """Reduce a dotted module path to its first (top-level) component."""
    top_level, _separator, _rest = name.partition(".")
    return top_level
| 70 | + |
| 71 | + |
def discover_python_files(paths: Sequence[str]) -> List[Path]:
    """Collect Python files from individual files or by walking directories.

    Args:
        paths: File or directory paths. Files are kept when they end in
            ``.py``; directories are searched recursively. Paths that do not
            exist are ignored.

    Returns:
        The matching files, in argument order; files found under a directory
        are sorted so the result is deterministic.
    """
    python_files: List[Path] = []
    for raw_path in paths:
        path = Path(raw_path)
        if path.is_file():
            if path.suffix == ".py":
                python_files.append(path)
        elif path.is_dir():
            for candidate in sorted(path.rglob("*.py")):
                # Only components *below* the requested root decide whether a
                # file is hidden; the root itself (e.g. ".." or a checkout
                # under "~/.cache") must not filter everything out.
                relative_parts = candidate.relative_to(path).parts
                if any(part.startswith(".") for part in relative_parts):
                    continue
                # rglob also matches directories whose name ends in ".py".
                if not candidate.is_file():
                    continue
                python_files.append(candidate)
    return python_files
| 85 | + |
| 86 | + |
def build_stdlib_index() -> Set[str]:
    """Return a set containing names of standard-library modules.

    The set is assembled from three sources:

    * ``sys.builtin_module_names`` (modules compiled into the interpreter),
    * ``sys.stdlib_module_names`` when the running Python provides it,
    * the top-level modules found in the stdlib directories, including the
      directory holding compiled extension modules.

    Only top-level names are listed and nothing is imported while scanning.
    """
    candidates: Set[str] = set(sys.builtin_module_names)
    # Available on Python 3.10+; getattr keeps older interpreters working.
    candidates.update(getattr(sys, "stdlib_module_names", ()))

    install_paths = sysconfig.get_paths()
    raw_dirs = [
        install_paths.get("stdlib"),
        install_paths.get("platstdlib"),
        # Directory of shared-object extension modules (e.g. lib-dynload);
        # the config variable is unset on some platforms.
        sysconfig.get_config_var("DESTSHARED"),
    ]
    search_dirs: List[str] = []
    for raw_dir in raw_dirs:
        if not raw_dir:
            continue
        directory = Path(raw_dir).resolve()
        if directory.is_dir() and str(directory) not in search_dirs:
            search_dirs.append(str(directory))

    # iter_modules lists only direct children and, unlike walk_packages,
    # never imports a package to discover its submodules.
    for module in pkgutil.iter_modules(search_dirs):
        candidates.add(module.name)
    return candidates
| 95 | + |
| 96 | + |
def extract_top_level_imports(node: ast.AST) -> Iterable[Tuple[str, int]]:
    """Yield (module, line) tuples for top-level import statements.

    The tree is walked recursively, so imports nested in module-level
    ``if``/``try``/``with`` blocks are reported too. Function and class
    bodies are not entered. Relative imports are skipped. Every module named
    by an ``import a, b`` statement is reported, once per top-level name.

    Args:
        node: The AST node whose children are inspected (normally the
            ``ast.Module`` returned by ``ast.parse``).

    Yields:
        Pairs of top-level module name and the line of the import statement.
    """
    for child in ast.iter_child_nodes(node):
        if isinstance(child, ast.Import):
            # "import a.b, a.c, d" names several modules; inspect all of them
            # but report each top-level package once per statement.
            reported: Set[str] = set()
            for alias in child.names:
                top_level = canonicalize_module_name(alias.name)
                if top_level and top_level not in reported:
                    reported.add(top_level)
                    yield top_level, child.lineno
        elif isinstance(child, ast.ImportFrom):
            if child.level > 0:
                continue  # relative import is considered safe
            if not child.module:
                continue
            yield canonicalize_module_name(child.module), child.lineno
        elif isinstance(child, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
            # Imports within functions/classes are intentionally allowed.
            continue
        else:
            yield from extract_top_level_imports(child)
| 119 | + |
| 120 | + |
def find_asset_root(path: Path) -> Optional[Path]:
    """Return the closest directory, starting at ``path``, holding ``metadata.yaml``.

    ``path`` itself is checked first, then each of its parents in order.
    ``None`` is returned when no candidate contains the file.
    """
    candidate_dirs = (path, *path.parents)
    return next(
        (directory for directory in candidate_dirs if (directory / "metadata.yaml").exists()),
        None,
    )
| 127 | + |
| 128 | + |
def ensure_dependency_files(asset_root: Path) -> Optional[str]:
    """Build a warning when the asset root has none of the requirement files.

    Returns ``None`` as soon as any file named in
    ``DEFAULT_REQUIREMENT_FILES`` exists directly under ``asset_root``;
    otherwise returns a message naming the expected files.
    """
    has_requirements = any((asset_root / name).exists() for name in DEFAULT_REQUIREMENT_FILES)
    if has_requirements:
        return None
    expected = ", ".join(DEFAULT_REQUIREMENT_FILES)
    return f"{asset_root} is missing a dev/test requirements file ({expected})"
| 135 | + |
| 136 | + |
def check_imports(files: Sequence[Path], config: ImportGuardConfig) -> int:
    """Validate import style across a collection of Python files.

    Every file is parsed and its top-level imports are compared against the
    standard-library index and the allow lists in ``config``. A file that
    cannot be read or parsed is reported as a violation instead of aborting
    the run, so the remaining files are still checked.

    Args:
        files: Python source files to inspect.
        config: Allow-list configuration consulted for non-stdlib imports.

    Returns:
        ``1`` when at least one violation was found, otherwise ``0``.
        Violations and dependency warnings are printed to stderr.
    """
    stdlib_modules = build_stdlib_index()
    violations: List[str] = []
    dependency_warnings: Set[str] = set()

    for file_path in files:
        try:
            source = file_path.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError) as exc:
            # Unreadable or non-UTF-8 file: report it and keep scanning.
            violations.append(f"{file_path}: failed to read ({exc})")
            continue
        try:
            tree = ast.parse(source, filename=str(file_path))
        except (SyntaxError, ValueError) as exc:
            # ValueError covers sources ast.parse rejects outright
            # (e.g. embedded null bytes on some Python versions).
            violations.append(f"{file_path}: failed to parse ({exc})")
            continue

        for module_name, lineno in extract_top_level_imports(tree):
            if module_name in stdlib_modules:
                continue
            if config.is_allowed(module_name, file_path):
                # Allow-listed imports are accepted, but the owning asset is
                # checked for a dev/test requirements file.
                asset_root = find_asset_root(file_path.parent)
                if asset_root:
                    warning = ensure_dependency_files(asset_root)
                    if warning:
                        dependency_warnings.add(warning)
                continue
            violations.append(f"{file_path}:{lineno} imports non-stdlib module '{module_name}' at top level")

    for warning in sorted(dependency_warnings):
        print(f"WARNING: {warning}", file=sys.stderr)

    if violations:
        for entry in violations:
            print(entry, file=sys.stderr)
        return 1
    return 0
| 171 | + |
| 172 | + |
def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    """Parse command-line arguments.

    Args:
        argv: Argument strings to parse. When ``None`` (the default) argparse
            reads ``sys.argv[1:]``, so existing callers are unaffected;
            passing a list allows the parser to be driven programmatically.

    Returns:
        Namespace with ``paths`` (one or more strings) and ``config``.
    """
    parser = argparse.ArgumentParser(description="Ensure top-level Python imports are limited to the standard library.")
    parser.add_argument(
        "paths",
        nargs="+",
        help="Files or directories to inspect (recursively).",
    )
    parser.add_argument(
        "--config",
        default=str(DEFAULT_CONFIG_PATH),
        help="Path to JSON configuration file with allowed modules/files.",
    )
    return parser.parse_args(argv)
| 187 | + |
| 188 | + |
def main() -> int:
    """Entry point: parse arguments, gather files and return the exit status.

    Returns ``0`` when no Python files are found (after printing a notice to
    stderr); otherwise returns the result of ``check_imports``.
    """
    options = parse_args()
    guard_config = ImportGuardConfig.from_path(Path(options.config))
    targets = discover_python_files(options.paths)
    if targets:
        return check_imports(targets, guard_config)
    print("No Python files found to inspect.", file=sys.stderr)
    return 0
| 198 | + |
| 199 | + |
# Run the guard only when executed as a script; main()'s return value
# becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
0 commit comments