Skip to content

Commit 1018933

Browse files
committed
Improve logging in scripts/es-sarif/index-sarif-*
1 parent 42e6b4a commit 1018933

File tree

1 file changed

+39
-15
lines changed

1 file changed

+39
-15
lines changed

scripts/es-sarif/index-sarif-results-in-elasticsearch.py

Lines changed: 39 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -11,9 +11,12 @@
1111
results into a single Elasticsearch index. Each result document includes:
1212
- All original SARIF result fields (ruleId, message, locations, etc.)
1313
- Derived fields (ruleGroup, ruleLanguage) parsed from ruleId
14-
- Run-level metadata (tool info, version control provenance)
14+
- ONLY versionControlProvenance from run (minimal enrichment)
1515
- Source file tracking metadata
1616
17+
This approach keeps documents minimal by indexing ONLY the result objects to avoid
18+
Elasticsearch size limits. Tool info and automation details are NOT included.
19+
1720
Usage:
1821
python index-sarif-results-in-elasticsearch.py <sarif_files_list.txt> <elasticsearch_index_name>
1922
@@ -398,7 +401,7 @@ def sarif_results_generator(sarif_files, index_name):
398401

399402
def index_sarif_files(sarif_files, index_name, host, api_key=None, username=None, password=None):
400403
"""
401-
Connect to Elasticsearch and bulk index all SARIF results.
404+
Connect to Elasticsearch and bulk index all SARIF results with progress logging.
402405
"""
403406
es_client = create_elasticsearch_client(host, api_key, username, password)
404407

@@ -415,37 +418,58 @@ def index_sarif_files(sarif_files, index_name, host, api_key=None, username=None
415418
return False
416419

417420
print(f"Indexing results from {len(sarif_files)} SARIF files...")
421+
print()
418422

419423
try:
420-
# Use bulk helper to index all documents
421-
success_count, failed_docs = helpers.bulk(
424+
# Track progress during bulk indexing
425+
documents_indexed = 0
426+
last_progress_update = 0
427+
progress_interval = 100 # Update every 100 documents
428+
429+
def progress_callback(success, info):
430+
"""Callback to track progress during bulk indexing."""
431+
nonlocal documents_indexed, last_progress_update
432+
documents_indexed += 1
433+
434+
# Print progress updates periodically
435+
if documents_indexed - last_progress_update >= progress_interval:
436+
print(f" → Indexed {documents_indexed} documents so far...")
437+
last_progress_update = documents_indexed
438+
439+
if not success:
440+
print(f" ✗ Failed to index document: {info}")
441+
442+
# Use bulk helper to index all documents with progress tracking
443+
print("Starting bulk indexing...")
444+
for success, info in helpers.streaming_bulk(
422445
es_client,
423446
sarif_results_generator(sarif_files, index_name),
424447
chunk_size=500,
425448
request_timeout=60,
426-
)
449+
raise_on_error=False,
450+
):
451+
progress_callback(success, info)
427452

453+
print(f" → Indexed {documents_indexed} documents (final)")
454+
print()
428455
print("-" * 50)
429456
print(f"✓ Bulk indexing complete")
430-
print(f"✓ Successfully indexed: {success_count} documents")
431-
print(f"✗ Failed to index: {len(failed_docs)} documents")
432-
433-
if failed_docs:
434-
print("\nFailed documents:")
435-
for doc in failed_docs[:5]: # Show first 5 failures
436-
print(f" - {doc}")
437-
if len(failed_docs) > 5:
438-
print(f" ... and {len(failed_docs) - 5} more")
457+
print(f"✓ Total documents indexed: {documents_indexed}")
439458

440-
# Get final index stats
459+
# Get final index stats to verify
441460
stats = es_client.indices.stats(index=index_name)
442461
doc_count = stats["indices"][index_name]["total"]["docs"]["count"]
443462
print(f"✓ Final document count in index: {doc_count}")
463+
464+
if doc_count != documents_indexed:
465+
print(f"⚠ Warning: Document count mismatch (indexed: {documents_indexed}, in index: {doc_count})")
444466

445467
return True
446468

447469
except Exception as e:
448470
print(f"Error during bulk indexing: {e}")
471+
import traceback
472+
traceback.print_exc()
449473
return False
450474

451475

0 commit comments

Comments
 (0)