@@ -64,7 +64,8 @@ def replace_var(match):
6464DEFAULT_ELASTIC_HOST = "http://localhost:9200"
6565SARIF_VERSION = "2.1.0"
6666
67- # Elasticsearch mapping optimized for SARIF result documents
67+ # Elasticsearch mapping optimized for SARIF result documents
68+ # Minimal mapping: result fields plus versionControlProvenance as the only run-level enrichment
6869SARIF_MAPPING = {
6970 "mappings" : {
7071 "properties" : {
@@ -149,37 +150,15 @@ def replace_var(match):
149150 "occurrenceCount" : {"type" : "integer" },
150151 "rank" : {"type" : "float" },
151152 "baselineState" : {"type" : "keyword" },
152- # Run-level metadata (tool, repo info, etc.)
153- "run" : {
153+ # ONLY versionControlProvenance from run-level (minimal enrichment)
154+ "versionControlProvenance" : {
155+ "type" : "nested" ,
154156 "properties" : {
155- "tool" : {
156- "properties" : {
157- "driver" : {
158- "properties" : {
159- "name" : {"type" : "keyword" },
160- "organization" : {"type" : "keyword" },
161- "product" : {"type" : "keyword" },
162- "version" : {"type" : "keyword" },
163- "semanticVersion" : {"type" : "keyword" },
164- }
165- }
166- }
167- },
168- "automationDetails" : {
169- "properties" : {
170- "id" : {"type" : "keyword" },
171- "guid" : {"type" : "keyword" },
172- "correlationGuid" : {"type" : "keyword" },
173- }
174- },
175- "versionControlProvenance" : {
176- "type" : "nested" ,
177- "properties" : {
178- "repositoryUri" : {"type" : "keyword" },
179- "revisionId" : {"type" : "keyword" },
180- },
181- },
182- }
157+ "repositoryUri" : {"type" : "keyword" },
158+ "revisionId" : {"type" : "keyword" },
159+ "branch" : {"type" : "keyword" },
160+ "revisionTag" : {"type" : "keyword" },
161+ },
183162 },
184163 # Metadata for tracking source SARIF file
185164 "_sarif_source" : {
@@ -197,8 +176,6 @@ def replace_var(match):
197176 "analysis" : {"analyzer" : {"sarif_text" : {"type" : "standard" , "stopwords" : "_none_" }}},
198177 },
199178}
200-
201-
202179def create_elasticsearch_client (host , api_key = None , username = None , password = None ):
203180 """Create Elasticsearch client with optional API key or basic authentication."""
204181 if api_key and api_key .strip ():
@@ -327,10 +304,12 @@ def sarif_results_generator(sarif_files, index_name):
327304 2. Extracts each result from runs[].results[]
328305 3. Creates a separate Elasticsearch document per result
329306 4. Adds derived fields (ruleGroup, ruleLanguage) from ruleId parsing
330- 5. Includes run-level metadata and source file tracking
307+ 5. ONLY enriches with versionControlProvenance from run (minimal overhead)
308+ 6. Adds source file tracking metadata
331309
332- This approach allows for granular querying of individual code scanning findings
333- rather than treating entire SARIF files as single documents.
310+ This approach keeps document sizes minimal by ONLY indexing the result objects
311+ themselves plus minimal enrichment data, avoiding the overhead of tool info,
312+ automation details, and other run-level data.
334313 """
335314 from datetime import datetime
336315
@@ -366,15 +345,11 @@ def sarif_results_generator(sarif_files, index_name):
366345
367346 file_results_count += len (results )
368347
369- # Extract run-level metadata
370- run_metadata = {
371- "tool" : run .get ("tool" , {}),
372- "automationDetails" : run .get ("automationDetails" , {}),
373- "versionControlProvenance" : run .get ("versionControlProvenance" , []),
374- }
348+ # Extract ONLY versionControlProvenance from run (minimal enrichment)
349+ version_control_provenance = run .get ("versionControlProvenance" , [])
375350
376351 for result_index , result in enumerate (results ):
377- # Create a document that includes both the result and metadata
352+ # Start the document from the result's own fields; derived and enrichment fields are added below
378353 document = dict (result ) # Copy all result fields
379354
380355 # Add derived fields from ruleId parsing
@@ -386,8 +361,9 @@ def sarif_results_generator(sarif_files, index_name):
386361 if rule_language :
387362 document ["ruleLanguage" ] = rule_language
388363
389- # Add run-level metadata
390- document ["run" ] = run_metadata
364+ # Add ONLY versionControlProvenance (not tool, automationDetails, etc.)
365+ if version_control_provenance :
366+ document ["versionControlProvenance" ] = version_control_provenance
391367
392368 # Add source file metadata
393369 document ["_sarif_source" ] = {
0 commit comments