1111results into a single Elasticsearch index. Each result document includes:
1212- All original SARIF result fields (ruleId, message, locations, etc.)
1313- Derived fields (ruleGroup, ruleLanguage) parsed from ruleId
14- - ONLY versionControlProvenance from run (minimal enrichment)
14+ - versionControlProvenance from run, OR derived from filename pattern
1515- Source file tracking metadata
1616
17+ Repository URI Derivation from Filename:
18+ If versionControlProvenance is missing or lacks repositoryUri, it will be derived
19+ from SARIF filenames matching: [<lang>-<framework>_]<org>_<repo>[_<id>].sarif
20+ Example: "cpp-misra_nasa_fprime_18795.sarif" -> "https://github.com/nasa/fprime"
21+
1722This approach keeps documents minimal by indexing ONLY the result objects to avoid
1823Elasticsearch size limits. Tool info and automation details are NOT included.
1924
@@ -69,10 +74,11 @@ def replace_var(match):
6974DEFAULT_ELASTIC_HOST = "http://localhost:9200"
7075SARIF_VERSION = "2.1.0"
7176
72- # Elasticsearch mapping optimized for SARIF result documents
77+ # Elasticsearch mapping optimized for SARIF result documents
7378# Minimal mapping - only results with versionControlProvenance enrichment
7479SARIF_MAPPING = {
7580 "mappings" : {
81+ "dynamic" : True , # Allow dynamic field mapping for any unmapped fields
7682 "properties" : {
7783 # Core SARIF result fields
7884 "ruleId" : {"type" : "keyword" },
@@ -300,6 +306,45 @@ def parse_rule_id(rule_id):
300306 return rule_group , rule_language
301307
302308
def parse_repository_uri_from_filename(filename):
    """
    Derive a GitHub repository URI from a SARIF filename.

    The filename is expected to follow the pattern:
        [<lang>-<framework>_]<org>_<repo>[_<id>].sarif

    Examples:
        - "nasa_fprime_18795.sarif" -> "https://github.com/nasa/fprime"
        - "cpp-misra_nasa_fprime_18795.sarif" -> "https://github.com/nasa/fprime"
        - "tmux_tmux.sarif" -> "https://github.com/tmux/tmux"

    Limitations of the naming scheme (inherent to splitting on "_"):
        - A hyphen in the first segment is always read as a
          <lang>-<framework> prefix, so an org name containing a hyphen is
          only recognised when such a prefix precedes it.
        - Only the first segment after the org is used as the repo name, so
          repository names containing underscores are truncated.

    Args:
        filename: Base name of the SARIF file (e.g. ``Path.name``).

    Returns:
        str or None: The repository URI if parsing succeeds, None when the
        filename does not provide a non-empty org and repo segment.
    """
    # Strip only a trailing ".sarif" extension. str.replace() would also
    # remove ".sarif" occurring in the middle of the name and corrupt the
    # org/repo segments.
    suffix = '.sarif'
    if filename.endswith(suffix):
        name = filename[:-len(suffix)]
    else:
        name = filename

    parts = name.split('_')

    # Need at least org_repo (2 parts).
    if len(parts) < 2:
        return None

    if '-' in parts[0]:
        # Pattern: lang-framework_org_repo[_id] -> skip the prefix segment.
        if len(parts) < 3:
            return None
        org, repo = parts[1], parts[2]
    else:
        # Pattern: org_repo[_id]
        org, repo = parts[0], parts[1]

    # Leading, trailing or doubled underscores produce empty segments; a URI
    # built from them (e.g. "https://github.com//repo") would be invalid.
    if not org or not repo:
        return None

    return f"https://github.com/{org}/{repo}"
347+
303348def sarif_results_generator (sarif_files , index_name ):
304349 """
305350 Generator that yields Elasticsearch bulk actions for individual SARIF results.
@@ -309,9 +354,14 @@ def sarif_results_generator(sarif_files, index_name):
309354 2. Extracts each result from runs[].results[]
310355 3. Creates a separate Elasticsearch document per result
311356 4. Adds derived fields (ruleGroup, ruleLanguage) from ruleId parsing
312- 5. ONLY enriches with versionControlProvenance from run (minimal overhead)
357+ 5. Enriches with versionControlProvenance from run, or derives repositoryUri from filename
313358 6. Adds source file tracking metadata
314359
360+ Filename Pattern for Repository URI Derivation:
361+ - [<lang>-<framework>_]<org>_<repo>[_<id>].sarif
362+ - Examples: "nasa_fprime_18795.sarif" -> "https://github.com/nasa/fprime"
363+ - Examples: "cpp-misra_tmux_tmux.sarif" -> "https://github.com/tmux/tmux"
364+
315365 This approach keeps document sizes minimal by ONLY indexing the result objects
316366 themselves plus minimal enrichment data, avoiding the overhead of tool info,
317367 automation details, and other run-level data.
@@ -324,6 +374,11 @@ def sarif_results_generator(sarif_files, index_name):
324374
325375 for sarif_file in sarif_files :
326376 print (f"Processing { sarif_file .name } ..." )
377+
378+ # Parse repository URI from filename
379+ repo_uri_from_filename = parse_repository_uri_from_filename (sarif_file .name )
380+ if repo_uri_from_filename :
381+ print (f" → Derived repository URI: { repo_uri_from_filename } " )
327382
328383 try :
329384 with open (sarif_file , "r" , encoding = "utf-8" ) as f :
@@ -352,6 +407,17 @@ def sarif_results_generator(sarif_files, index_name):
352407
353408 # Extract ONLY versionControlProvenance from run (minimal enrichment)
354409 version_control_provenance = run .get ("versionControlProvenance" , [])
410+
411+ # If no versionControlProvenance in run, create from filename
412+ if not version_control_provenance and repo_uri_from_filename :
413+ version_control_provenance = [{
414+ "repositoryUri" : repo_uri_from_filename
415+ }]
416+ # If versionControlProvenance exists but missing repositoryUri, add it from filename
417+ elif version_control_provenance and repo_uri_from_filename :
418+ # Check if repositoryUri is missing from the first entry
419+ if not version_control_provenance [0 ].get ("repositoryUri" ):
420+ version_control_provenance [0 ]["repositoryUri" ] = repo_uri_from_filename
355421
356422 for result_index , result in enumerate (results ):
357423 # Create a document that includes ONLY the result fields
0 commit comments