Skip to content

Commit 2ed8dc7

Browse files
committed
Extractor: move overlay-changes check from traverser to worker
This way, we filter both root modules and (transitive) imports against the overlay-changes JSON file.
1 parent 01cf7e9 commit 2ed8dc7

File tree

2 files changed

+36
-31
lines changed

2 files changed

+36
-31
lines changed

python/extractor/semmle/traverser.py

Lines changed: 0 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -31,19 +31,6 @@ def __init__(self, options, modulenames, logger):
3131
if not os.path.exists(p) and not options.ignore_missing_modules:
3232
raise FileNotFoundError("'%s' does not exist." % p)
3333
self.paths.add(p)
34-
# During overlay extraction, only traverse the files that were changed.
35-
self.overlay_changes = None
36-
if 'CODEQL_EXTRACTOR_PYTHON_OVERLAY_CHANGES' in os.environ:
37-
overlay_changes_file = os.environ['CODEQL_EXTRACTOR_PYTHON_OVERLAY_CHANGES']
38-
logger.info("Overlay extraction mode: only extracting files changed according to '%s'", overlay_changes_file)
39-
try:
40-
with open(overlay_changes_file, 'r', encoding='utf-8') as f:
41-
data = json.load(f)
42-
changed_paths = data.get('changes', [])
43-
self.overlay_changes = { os.path.abspath(p) for p in changed_paths }
44-
except (IOError, ValueError) as e:
45-
logger.warn("Failed to read overlay changes from '%s' (falling back to full extraction): %s", overlay_changes_file, e)
46-
self.overlay_changes = None
4734
self.exclude_paths = set([ os.path.abspath(f) for f in options.exclude_file ])
4835
self.exclude = exclude_filter_from_options(options)
4936
self.filter = filter_from_options_and_environment(options)
@@ -62,20 +49,11 @@ def __iter__(self):
6249
if mod is None:
6350
self.logger.error("No module named '%s'.", name)
6451
raise ExtractorFailure()
65-
if self.overlay_changes is not None and mod.path not in self.overlay_changes:
66-
self.logger.debug("Skipping module '%s' as it was not changed in overlay extraction.", name)
67-
continue
6852
yield mod.get_extractable()
6953
for path in self.paths:
70-
if self.overlay_changes is not None and path not in self.overlay_changes:
71-
self.logger.debug("Skipping path '%s' as it was not changed in overlay extraction.", path)
72-
continue
7354
yield Extractable.from_path(path)
7455
for path in self.recurse_files:
7556
for modpath in self._treewalk(path):
76-
if self.overlay_changes is not None and modpath not in self.overlay_changes:
77-
self.logger.debug("Skipping file '%s' as it was not changed in overlay extraction.", modpath)
78-
continue
7957
yield Extractable.from_path(modpath)
8058
for name in self.recurse_packages:
8159
mod = self.finder.find(name)
@@ -89,9 +67,6 @@ def __iter__(self):
8967
self.logger.error("Package '%s' does not have a path.", name)
9068
raise ExtractorFailure()
9169
for modpath in self._treewalk(path):
92-
if self.overlay_changes is not None and modpath not in self.overlay_changes:
93-
self.logger.debug("Skipping package '%s' as it was not changed in overlay extraction.", modpath)
94-
continue
9570
yield Extractable.from_path(modpath)
9671

9772
def _treewalk(self, path):

python/extractor/semmle/worker.py

Lines changed: 36 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from semmle.profiling import get_profiler
1212
from semmle.path_rename import renamer_from_options_and_env
1313
from semmle.logging import WARN, recursion_error_message, internal_error_message, Logger
14+
from semmle.util import FileExtractable, FolderExtractable
1415

1516
class ExtractorFailure(Exception):
1617
'Generic exception representing the failure of an extractor.'
@@ -19,17 +20,32 @@ class ExtractorFailure(Exception):
1920

2021
class ModuleImportGraph(object):
2122

22-
def __init__(self, max_depth):
23+
def __init__(self, max_depth, logger: Logger):
2324
self.modules = {}
2425
self.succ = defaultdict(set)
2526
self.todo = set()
2627
self.done = set()
2728
self.max_depth = max_depth
29+
self.logger = logger
30+
31+
# During overlay extraction, only traverse the files that were changed.
32+
self.overlay_changes = None
33+
if 'CODEQL_EXTRACTOR_PYTHON_OVERLAY_CHANGES' in os.environ:
34+
overlay_changes_file = os.environ['CODEQL_EXTRACTOR_PYTHON_OVERLAY_CHANGES']
35+
logger.info("Overlay extraction mode: only extracting files changed according to '%s'", overlay_changes_file)
36+
try:
37+
with open(overlay_changes_file, 'r', encoding='utf-8') as f:
38+
data = json.load(f)
39+
changed_paths = data.get('changes', [])
40+
self.overlay_changes = { os.path.abspath(p) for p in changed_paths }
41+
except (IOError, ValueError) as e:
42+
logger.warn("Failed to read overlay changes from '%s' (falling back to full extraction): %s", overlay_changes_file, e)
43+
self.overlay_changes = None
2844

2945
def add_root(self, mod):
3046
self.modules[mod] = 0
3147
if mod not in self.done:
32-
self.todo.add(mod)
48+
self.add_todo(mod)
3349

3450
def add_import(self, mod, imported):
3551
assert mod in self.modules
@@ -39,7 +55,7 @@ def add_import(self, mod, imported):
3955
self._reduce_depth(imported, self.modules[mod] + 1)
4056
else:
4157
if self.modules[mod] < self.max_depth and imported not in self.done:
42-
self.todo.add(imported)
58+
self.add_todo(imported)
4359
self.modules[imported] = self.modules[mod] + 1
4460

4561
def _reduce_depth(self, mod, depth):
@@ -48,7 +64,7 @@ def _reduce_depth(self, mod, depth):
4864
if depth > self.max_depth:
4965
return
5066
if mod not in self.done:
51-
self.todo.add(mod)
67+
self.add_todo(mod)
5268
self.modules[mod] = depth
5369
for imp in self.succ[mod]:
5470
self._reduce_depth(imp, depth+1)
@@ -61,11 +77,25 @@ def get(self):
6177

6278
def push_back(self, mod):
6379
self.done.remove(mod)
64-
self.todo.add(mod)
80+
self.add_todo(mod)
6581

6682
def empty(self):
6783
return not self.todo
6884

85+
def add_todo(self, mod):
86+
if not self._module_in_overlay_changes(mod):
87+
self.logger.debug("Skipping module '%s' as it was not changed in overlay extraction.", mod)
88+
return
89+
self.todo.add(mod)
90+
91+
def _module_in_overlay_changes(self, mod):
92+
if self.overlay_changes is not None:
93+
if isinstance(mod, FileExtractable):
94+
return mod.path in self.overlay_changes
95+
if isinstance(mod, FolderExtractable):
96+
return mod.path + '/__init__.py' in self.overlay_changes
97+
return True
98+
6999
class ExtractorPool(object):
70100
'''Pool of worker processes running extractors'''
71101

@@ -90,7 +120,7 @@ def __init__(self, outdir, archive, proc_count, options, logger: Logger):
90120
self.enqueued = set()
91121
self.done = set()
92122
self.requirements = {}
93-
self.import_graph = ModuleImportGraph(options.max_import_depth)
123+
self.import_graph = ModuleImportGraph(options.max_import_depth, logger)
94124
logger.debug("Source archive: %s", archive)
95125
self.logger = logger
96126
DiagnosticsWriter.create_output_dir()

0 commit comments

Comments
 (0)