Skip to content

Commit 42e6b4a

Browse files
committed
ES indexing should only use SARIF results
1 parent c182f91 commit 42e6b4a

File tree

1 file changed

+21 -45 lines changed

1 file changed

+21 -45 lines changed

scripts/es-sarif/index-sarif-results-in-elasticsearch.py

Lines changed: 21 additions & 45 deletions
Original file line number | Diff line number | Diff line change
@@ -64,7 +64,8 @@ def replace_var(match):
6464
DEFAULT_ELASTIC_HOST = "http://localhost:9200"
6565
SARIF_VERSION = "2.1.0"
6666

67-
# Elasticsearch mapping optimized for SARIF result documents
67+
# Elasticsearch mapping optimized for SARIF result documents
68+
# Minimal mapping - only results with versionControlProvenance enrichment
6869
SARIF_MAPPING = {
6970
"mappings": {
7071
"properties": {
@@ -149,37 +150,15 @@ def replace_var(match):
149150
"occurrenceCount": {"type": "integer"},
150151
"rank": {"type": "float"},
151152
"baselineState": {"type": "keyword"},
152-
# Run-level metadata (tool, repo info, etc.)
153-
"run": {
153+
# ONLY versionControlProvenance from run-level (minimal enrichment)
154+
"versionControlProvenance": {
155+
"type": "nested",
154156
"properties": {
155-
"tool": {
156-
"properties": {
157-
"driver": {
158-
"properties": {
159-
"name": {"type": "keyword"},
160-
"organization": {"type": "keyword"},
161-
"product": {"type": "keyword"},
162-
"version": {"type": "keyword"},
163-
"semanticVersion": {"type": "keyword"},
164-
}
165-
}
166-
}
167-
},
168-
"automationDetails": {
169-
"properties": {
170-
"id": {"type": "keyword"},
171-
"guid": {"type": "keyword"},
172-
"correlationGuid": {"type": "keyword"},
173-
}
174-
},
175-
"versionControlProvenance": {
176-
"type": "nested",
177-
"properties": {
178-
"repositoryUri": {"type": "keyword"},
179-
"revisionId": {"type": "keyword"},
180-
},
181-
},
182-
}
157+
"repositoryUri": {"type": "keyword"},
158+
"revisionId": {"type": "keyword"},
159+
"branch": {"type": "keyword"},
160+
"revisionTag": {"type": "keyword"},
161+
},
183162
},
184163
# Metadata for tracking source SARIF file
185164
"_sarif_source": {
@@ -197,8 +176,6 @@ def replace_var(match):
197176
"analysis": {"analyzer": {"sarif_text": {"type": "standard", "stopwords": "_none_"}}},
198177
},
199178
}
200-
201-
202179
def create_elasticsearch_client(host, api_key=None, username=None, password=None):
203180
"""Create Elasticsearch client with optional API key or basic authentication."""
204181
if api_key and api_key.strip():
@@ -327,10 +304,12 @@ def sarif_results_generator(sarif_files, index_name):
327304
2. Extracts each result from runs[].results[]
328305
3. Creates a separate Elasticsearch document per result
329306
4. Adds derived fields (ruleGroup, ruleLanguage) from ruleId parsing
330-
5. Includes run-level metadata and source file tracking
307+
5. ONLY enriches with versionControlProvenance from run (minimal overhead)
308+
6. Adds source file tracking metadata
331309
332-
This approach allows for granular querying of individual code scanning findings
333-
rather than treating entire SARIF files as single documents.
310+
This approach keeps document sizes minimal by ONLY indexing the result objects
311+
themselves plus minimal enrichment data, avoiding the overhead of tool info,
312+
automation details, and other run-level data.
334313
"""
335314
from datetime import datetime
336315

@@ -366,15 +345,11 @@ def sarif_results_generator(sarif_files, index_name):
366345

367346
file_results_count += len(results)
368347

369-
# Extract run-level metadata
370-
run_metadata = {
371-
"tool": run.get("tool", {}),
372-
"automationDetails": run.get("automationDetails", {}),
373-
"versionControlProvenance": run.get("versionControlProvenance", []),
374-
}
348+
# Extract ONLY versionControlProvenance from run (minimal enrichment)
349+
version_control_provenance = run.get("versionControlProvenance", [])
375350

376351
for result_index, result in enumerate(results):
377-
# Create a document that includes both the result and metadata
352+
# Create a document that includes ONLY the result fields
378353
document = dict(result) # Copy all result fields
379354

380355
# Add derived fields from ruleId parsing
@@ -386,8 +361,9 @@ def sarif_results_generator(sarif_files, index_name):
386361
if rule_language:
387362
document["ruleLanguage"] = rule_language
388363

389-
# Add run-level metadata
390-
document["run"] = run_metadata
364+
# Add ONLY versionControlProvenance (not tool, automationDetails, etc.)
365+
if version_control_provenance:
366+
document["versionControlProvenance"] = version_control_provenance
391367

392368
# Add source file metadata
393369
document["_sarif_source"] = {

0 commit comments

Comments
 (0)