1111results into a single Elasticsearch index. Each result document includes:
1212- All original SARIF result fields (ruleId, message, locations, etc.)
1313- Derived fields (ruleGroup, ruleLanguage) parsed from ruleId
14- - Run-level metadata (tool info, version control provenance )
- ONLY versionControlProvenance from run (minimal enrichment)
1515- Source file tracking metadata
1616
This approach keeps documents small by indexing ONLY the result objects, staying
within Elasticsearch document-size limits. Tool info and automation details are NOT included.
19+
1720Usage:
1821 python index-sarif-results-in-elasticsearch.py <sarif_files_list.txt> <elasticsearch_index_name>
1922
@@ -398,7 +401,7 @@ def sarif_results_generator(sarif_files, index_name):
398401
399402def index_sarif_files (sarif_files , index_name , host , api_key = None , username = None , password = None ):
400403 """
401- Connect to Elasticsearch and bulk index all SARIF results.
404+ Connect to Elasticsearch and bulk index all SARIF results with progress logging .
402405 """
403406 es_client = create_elasticsearch_client (host , api_key , username , password )
404407
@@ -415,37 +418,58 @@ def index_sarif_files(sarif_files, index_name, host, api_key=None, username=None
415418 return False
416419
417420 print (f"Indexing results from { len (sarif_files )} SARIF files..." )
421+ print ()
418422
419423 try :
420- # Use bulk helper to index all documents
421- success_count , failed_docs = helpers .bulk (
424+ # Track progress during bulk indexing
425+ documents_indexed = 0
426+ last_progress_update = 0
427+ progress_interval = 100 # Update every 100 documents
428+
429+ def progress_callback (success , info ):
430+ """Callback to track progress during bulk indexing."""
431+ nonlocal documents_indexed , last_progress_update
432+ documents_indexed += 1
433+
434+ # Print progress updates periodically
435+ if documents_indexed - last_progress_update >= progress_interval :
436+ print (f" → Indexed { documents_indexed } documents so far..." )
437+ last_progress_update = documents_indexed
438+
439+ if not success :
440+ print (f" ✗ Failed to index document: { info } " )
441+
442+ # Use bulk helper to index all documents with progress tracking
443+ print ("Starting bulk indexing..." )
444+ for success , info in helpers .streaming_bulk (
422445 es_client ,
423446 sarif_results_generator (sarif_files , index_name ),
424447 chunk_size = 500 ,
425448 request_timeout = 60 ,
426- )
449+ raise_on_error = False ,
450+ ):
451+ progress_callback (success , info )
427452
453+ print (f" → Indexed { documents_indexed } documents (final)" )
454+ print ()
428455 print ("-" * 50 )
429456 print (f"✓ Bulk indexing complete" )
430- print (f"✓ Successfully indexed: { success_count } documents" )
431- print (f"✗ Failed to index: { len (failed_docs )} documents" )
432-
433- if failed_docs :
434- print ("\n Failed documents:" )
435- for doc in failed_docs [:5 ]: # Show first 5 failures
436- print (f" - { doc } " )
437- if len (failed_docs ) > 5 :
438- print (f" ... and { len (failed_docs ) - 5 } more" )
457+ print (f"✓ Total documents indexed: { documents_indexed } " )
439458
440- # Get final index stats
459+ # Get final index stats to verify
441460 stats = es_client .indices .stats (index = index_name )
442461 doc_count = stats ["indices" ][index_name ]["total" ]["docs" ]["count" ]
443462 print (f"✓ Final document count in index: { doc_count } " )
463+
464+ if doc_count != documents_indexed :
465+ print (f"⚠ Warning: Document count mismatch (indexed: { documents_indexed } , in index: { doc_count } )" )
444466
445467 return True
446468
447469 except Exception as e :
448470 print (f"Error during bulk indexing: { e } " )
471+ import traceback
472+ traceback .print_exc ()
449473 return False
450474
451475
0 commit comments