From dcd8439449c71b1dd6cc9386737e7da0eff34fbb Mon Sep 17 00:00:00 2001 From: DTrejo <56119+DTrejo@users.noreply.github.com> Date: Tue, 26 Aug 2025 15:27:58 -0700 Subject: [PATCH 1/4] add Claude.md --- CLAUDE.md | 152 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 152 insertions(+) create mode 100644 CLAUDE.md diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..f9f6169 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,152 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Overview + +This is a Python-based tool that analyzes GitHub Advanced Security (GHAS) license utilization within GitHub organizations and enterprises. The tool helps optimize GHAS coverage by identifying repositories that can be enabled without consuming additional licenses and finding optimal combinations of repositories to maximize coverage with available licenses. + +## Development Commands + +### Environment Setup +```bash +# Create and activate virtual environment +python3 -m venv venv +source venv/bin/activate + +# Install dependencies +pip3 install -r requirements.txt + +# Set GitHub token +export GITHUB_TOKEN=<your-personal-access-token> +``` + +### Running the Application +```bash +# Basic usage with organization +python3 main.py --organization ORG_NAME + +# With active committers CSV report (preferred method) +python3 main.py --ac-report report.csv --organization ORG_NAME + +# With available licenses for optimization +python3 main.py --organization ORG_NAME --licenses 10 + +# Enterprise-wide analysis +python3 main.py --enterprise ENTERPRISE_NAME --licenses 100 + +# Output formats +python3 main.py --organization ORG_NAME --output-format json --output report.json +python3 main.py --organization ORG_NAME --output-format text --output report.md +``` + +### Testing +```bash +# Run all tests via unittest discovery (uses unittest's default runner) +python3 -m unittest discover tests/ -v + +# Run specific test files +python3 
tests/test_max_coverage.py +python3 tests/test_active_committers.py + +# The tests use a custom colorized test runner defined in tests/custom_test_runner.py +``` + +### Code Quality +The repository has GitHub Actions configured for: +- CodeQL security analysis (runs on push/PR to main) +- Dependency review (runs on PRs) + +## Code Architecture + +### Core Module Structure + +**main.py** +- Entry point that orchestrates the entire workflow +- Parses CLI arguments, coordinates data gathering, and generates reports + +**github.py** - GitHub API Integration +- `add_active_committers()`: Processes CSV reports or fetches via GraphQL API +- `get_ghas_status_for_repos()`: Retrieves repository GHAS status via REST API +- `get_active_committers_in_last_90_days()`: GraphQL queries for commit history +- `get_orgs_in_ent()`: Enterprise organization discovery +- `handle_rate_limit()`: Intelligent rate limiting with exponential backoff +- Uses ThreadPoolExecutor for concurrent API calls (MAX_WORKERS = 5) + +**models.py** - Data Models +- `Repository`: Core data structure with GHAS status, visibility, active committers +- `Report`: Comprehensive report structure with calculated properties for coverage metrics + +**report.py** - Analysis Engine +- `generate_ghas_coverage_report()`: Categorizes repositories based on GHAS status and committers +- `find_combination_with_max_repositories_greedy()`: Optimization algorithm for license allocation +- `find_combination_with_max_repositories()`: Brute-force algorithm for smaller datasets +- `write_report()`: Outputs text or JSON formatted reports + +**helpers.py** - Utilities +- Argument parsing with validation +- Logging configuration +- Environment variable handling + +### Key Business Logic + +The tool categorizes repositories into distinct groups: + +1. **Current GHAS Repos**: Already have GHAS enabled +2. 
**Free Activation Candidates**: + - Repos with committers already consuming GHAS licenses + - Public repositories (no license required) + - Private/internal repos without active committers +3. **License-Required Repos**: Private repos with new active committers + +The optimization algorithm uses a greedy approach to maximize repository coverage within license constraints, prioritizing repositories with the most unique new committers. + +### Data Flow + +1. **Discovery**: Fetch organizations → repositories → GHAS status +2. **Committer Analysis**: Process CSV report or query GraphQL API for active committers (90-day window) +3. **Categorization**: Group repositories by activation requirements +4. **Optimization**: Find optimal combinations for available licenses +5. **Reporting**: Generate text/JSON output with coverage metrics + +### Rate Limiting Strategy + +The GitHub API integration implements sophisticated rate limiting: +- Monitors `X-RateLimit-Remaining` headers +- Proactive slowdown when <50 requests remaining +- Thread-safe rate limit handling with locks +- Respects `Retry-After` headers and reset timestamps +- 1-second base delay between all requests + +### Testing Architecture + +- Custom test runner with colorized output and execution timing +- Unit tests for the optimization algorithms with randomized test data +- Tests focus on the core business logic rather than API integration +- Uses `unittest` framework with custom `CustomTextTestRunner` + +## Key Configuration + +### Required Permissions +GitHub Personal Access Token needs: +- `repo` scope for repository access +- `admin:org` for organization-level operations +- `admin:enterprise` for enterprise-level operations + +### API Usage Patterns +- REST API for repository metadata and GHAS status +- GraphQL API for active committer analysis (when CSV unavailable) +- Prefers CSV reports over GraphQL for performance and reliability +- Excludes `dependabot[bot]` from active committer counts +- Only analyzes 
private/internal repositories for committer data (public repos don't require licenses) + +## Output Formats + +The tool generates detailed reports showing: +- Current coverage metrics and active committer counts +- Repositories eligible for free GHAS activation +- Optimal repository combinations for available licenses +- New committers that would consume additional licenses +- Final coverage projections + +Reports available in both human-readable text format and machine-parseable JSON for automation integration. \ No newline at end of file From 2c48a0f1e1855fa7e8ff1eeb835ea461b6895f8c Mon Sep 17 00:00:00 2001 From: DTrejo <56119+DTrejo@users.noreply.github.com> Date: Tue, 26 Aug 2025 17:14:56 -0700 Subject: [PATCH 2/4] skip archived repos --- CLAUDE.md | 8 +- github.py | 7 +- models.py | 16 +++- tests/test_archived_repos.py | 178 +++++++++++++++++++++++++++++++++++ 4 files changed, 202 insertions(+), 7 deletions(-) create mode 100644 tests/test_archived_repos.py diff --git a/CLAUDE.md b/CLAUDE.md index f9f6169..0dca2cf 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -67,14 +67,14 @@ The repository has GitHub Actions configured for: **github.py** - GitHub API Integration - `add_active_committers()`: Processes CSV reports or fetches via GraphQL API -- `get_ghas_status_for_repos()`: Retrieves repository GHAS status via REST API +- `get_ghas_status_for_repos()`: Retrieves repository GHAS status via REST API and automatically filters out archived repositories - `get_active_committers_in_last_90_days()`: GraphQL queries for commit history - `get_orgs_in_ent()`: Enterprise organization discovery - `handle_rate_limit()`: Intelligent rate limiting with exponential backoff - Uses ThreadPoolExecutor for concurrent API calls (MAX_WORKERS = 5) **models.py** - Data Models -- `Repository`: Core data structure with GHAS status, visibility, active committers +- `Repository`: Core data structure with GHAS status, visibility, archived status, and active committers - `Report`: Comprehensive 
report structure with calculated properties for coverage metrics **report.py** - Analysis Engine @@ -103,7 +103,7 @@ The optimization algorithm uses a greedy approach to maximize repository coverag ### Data Flow -1. **Discovery**: Fetch organizations → repositories → GHAS status +1. **Discovery**: Fetch organizations → repositories → GHAS status (archived repositories automatically filtered out) 2. **Committer Analysis**: Process CSV report or query GraphQL API for active committers (90-day window) 3. **Categorization**: Group repositories by activation requirements 4. **Optimization**: Find optimal combinations for available licenses @@ -149,4 +149,4 @@ The tool generates detailed reports showing: - New committers that would consume additional licenses - Final coverage projections -Reports available in both human-readable text format and machine-parseable JSON for automation integration. \ No newline at end of file +Reports available in both human-readable text format and machine-parseable JSON for automation integration. 
diff --git a/github.py b/github.py index b72b631..4341e6a 100644 --- a/github.py +++ b/github.py @@ -75,6 +75,11 @@ def get_ghas_status_for_repos(org, token): data = response.json() for repo_data in data: + # Always skip archived repositories + archived = repo_data.get("archived", False) + if archived: + continue + owner, name = repo_data["full_name"].split("/") ghas_status = ( repo_data.get("security_and_analysis", {}) @@ -84,7 +89,7 @@ def get_ghas_status_for_repos(org, token): ) visibility = repo_data["visibility"] pushed_at = repo_data["pushed_at"] - repo = Repository(name, owner, ghas_status, visibility, pushed_at) + repo = Repository(name, owner, ghas_status, visibility, pushed_at, archived) repos.append(repo) if "next" not in response.links: break diff --git a/models.py b/models.py index d5461c3..fc77b5d 100644 --- a/models.py +++ b/models.py @@ -1,11 +1,19 @@ class Repository: def __init__( - self, name, org, ghas_status, visibility, pushed_at, active_committers=None + self, + name, + org, + ghas_status, + visibility, + pushed_at, + archived=False, + active_committers=None, ): self.name = name self.org = org self.ghas_status = ghas_status self.pushed_at = pushed_at + self.archived = archived self.active_committers = ( active_committers if active_committers is not None else [] ) @@ -32,8 +40,11 @@ def get_full_name(self): def get_visibility(self): return self.visibility + def get_archived(self): + return self.archived + def __str__(self): - return f"Repository: {self.name} | GHAS Status: {self.ghas_status} | Visibility: {self.visibility} | Last Pushed At: {self.pushed_at} | Active Committers: {self.active_committers}" + return f"Repository: {self.name} | GHAS Status: {self.ghas_status} | Visibility: {self.visibility} | Archived: {self.archived} | Last Pushed At: {self.pushed_at} | Active Committers: {self.active_committers}" def to_dict(self): return { @@ -41,6 +52,7 @@ def to_dict(self): "org": self.org, "ghas_status": self.ghas_status, "visibility": 
self.visibility, + "archived": self.archived, "pushed_at": self.pushed_at, "active_committers": self.active_committers, } diff --git a/tests/test_archived_repos.py b/tests/test_archived_repos.py new file mode 100644 index 0000000..c23a6ad --- /dev/null +++ b/tests/test_archived_repos.py @@ -0,0 +1,178 @@ +import sys, os + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) + +import unittest +from unittest.mock import patch, MagicMock +from datetime import datetime +from custom_test_runner import CustomTextTestRunner +from models import Repository +from github import get_ghas_status_for_repos + + +class TestArchivedRepositoryFiltering(unittest.TestCase): + def test_repository_model_with_archived_field(self): + """Test that Repository model correctly handles archived field""" + # Test with archived=True + repo_archived = Repository( + name="archived-repo", + org="test-org", + ghas_status=False, + visibility="private", + pushed_at=datetime.now().isoformat(), + archived=True, + active_committers=["user1"], + ) + + self.assertTrue(repo_archived.get_archived()) + self.assertIn("Archived: True", str(repo_archived)) + self.assertTrue(repo_archived.to_dict()["archived"]) + + # Test with archived=False (default) + repo_active = Repository( + name="active-repo", + org="test-org", + ghas_status=True, + visibility="private", + pushed_at=datetime.now().isoformat(), + active_committers=["user1"], + ) + + self.assertFalse(repo_active.get_archived()) + self.assertIn("Archived: False", str(repo_active)) + self.assertFalse(repo_active.to_dict()["archived"]) + + @patch("github.requests.get") + def test_get_ghas_status_filters_archived_repos(self, mock_get): + """Test that get_ghas_status_for_repos filters out archived repositories""" + # Mock API response with mix of archived and non-archived repos + mock_response = MagicMock() + mock_response.json.return_value = [ + { + "full_name": "test-org/active-repo", + "archived": False, + "visibility": 
"private", + "pushed_at": "2023-11-22T11:39:41Z", + "security_and_analysis": {"advanced_security": {"status": "enabled"}}, + }, + { + "full_name": "test-org/archived-repo", + "archived": True, + "visibility": "private", + "pushed_at": "2023-01-15T10:20:30Z", + "security_and_analysis": {"advanced_security": {"status": "disabled"}}, + }, + { + "full_name": "test-org/another-active-repo", + "archived": False, + "visibility": "public", + "pushed_at": "2023-12-01T14:25:15Z", + "security_and_analysis": {"advanced_security": {"status": "disabled"}}, + }, + ] + mock_response.links = {} # No pagination + mock_response.headers = { + "X-RateLimit-Remaining": "100", + "X-RateLimit-Reset": "1234567890", + } + mock_get.return_value = mock_response + + # Call the function + repos = get_ghas_status_for_repos("test-org", "fake-token") + + # Verify only non-archived repos are returned + self.assertEqual(len(repos), 2) + repo_names = [repo.name for repo in repos] + self.assertIn("active-repo", repo_names) + self.assertIn("another-active-repo", repo_names) + self.assertNotIn("archived-repo", repo_names) + + # Verify all returned repos have archived=False + for repo in repos: + self.assertFalse(repo.get_archived()) + + @patch("github.requests.get") + def test_get_ghas_status_handles_missing_archived_field(self, mock_get): + """Test that get_ghas_status_for_repos handles missing archived field gracefully""" + # Mock API response without archived field (should default to False) + mock_response = MagicMock() + mock_response.json.return_value = [ + { + "full_name": "test-org/repo-without-archived-field", + # Note: no "archived" field + "visibility": "private", + "pushed_at": "2023-11-22T11:39:41Z", + "security_and_analysis": {"advanced_security": {"status": "enabled"}}, + } + ] + mock_response.links = {} # No pagination + mock_response.headers = { + "X-RateLimit-Remaining": "100", + "X-RateLimit-Reset": "1234567890", + } + mock_get.return_value = mock_response + + # Call the function + 
repos = get_ghas_status_for_repos("test-org", "fake-token") + + # Verify repo is included (archived defaults to False) + self.assertEqual(len(repos), 1) + self.assertEqual(repos[0].name, "repo-without-archived-field") + self.assertFalse(repos[0].get_archived()) + + @patch("github.requests.get") + def test_get_ghas_status_filters_all_archived_repos(self, mock_get): + """Test that get_ghas_status_for_repos returns empty list when all repos are archived""" + # Mock API response with only archived repos + mock_response = MagicMock() + mock_response.json.return_value = [ + { + "full_name": "test-org/archived-repo-1", + "archived": True, + "visibility": "private", + "pushed_at": "2023-01-15T10:20:30Z", + "security_and_analysis": {"advanced_security": {"status": "disabled"}}, + }, + { + "full_name": "test-org/archived-repo-2", + "archived": True, + "visibility": "public", + "pushed_at": "2023-02-20T15:30:45Z", + "security_and_analysis": {"advanced_security": {"status": "enabled"}}, + }, + ] + mock_response.links = {} # No pagination + mock_response.headers = { + "X-RateLimit-Remaining": "100", + "X-RateLimit-Reset": "1234567890", + } + mock_get.return_value = mock_response + + # Call the function + repos = get_ghas_status_for_repos("test-org", "fake-token") + + # Verify no repos are returned + self.assertEqual(len(repos), 0) + + def test_repository_model_backward_compatibility(self): + """Test that Repository model maintains backward compatibility""" + # Test creating repository without archived parameter (should default to False) + repo = Repository( + name="test-repo", + org="test-org", + ghas_status=True, + visibility="private", + pushed_at=datetime.now().isoformat(), + active_committers=["user1"], + ) + + self.assertFalse(repo.get_archived()) + self.assertIn("Archived: False", str(repo)) + self.assertFalse(repo.to_dict()["archived"]) + + +if __name__ == "__main__": + suite = unittest.defaultTestLoader.loadTestsFromTestCase( + TestArchivedRepositoryFiltering + ) + 
CustomTextTestRunner().run(suite) From caba1ce296db3d7b9322298d2e3412084543285d Mon Sep 17 00:00:00 2001 From: DTrejo <56119+DTrejo@users.noreply.github.com> Date: Tue, 26 Aug 2025 17:28:27 -0700 Subject: [PATCH 3/4] update changelog --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 35ffa63..408bc88 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # CHANGELOG +## Unreleased - 2025-08-26 + +* Archived repositories are now excluded from GHAS analysis to improve speed and relevance. + ## 1.0.0 - 2024-01-26 * Initial open source release From 2c72a933a5d8850f7834754a5360a34b0a21bc76 Mon Sep 17 00:00:00 2001 From: DTrejo <56119+DTrejo@users.noreply.github.com> Date: Tue, 26 Aug 2025 17:32:17 -0700 Subject: [PATCH 4/4] undo repository model changes --- CLAUDE.md | 2 +- github.py | 2 +- models.py | 16 ++--------- tests/test_archived_repos.py | 53 ++---------------------------------- 4 files changed, 7 insertions(+), 66 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 0dca2cf..31778d9 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -74,7 +74,7 @@ The repository has GitHub Actions configured for: - Uses ThreadPoolExecutor for concurrent API calls (MAX_WORKERS = 5) **models.py** - Data Models -- `Repository`: Core data structure with GHAS status, visibility, archived status, and active committers +- `Repository`: Core data structure with GHAS status, visibility, and active committers - `Report`: Comprehensive report structure with calculated properties for coverage metrics **report.py** - Analysis Engine diff --git a/github.py b/github.py index 4341e6a..705b3d6 100644 --- a/github.py +++ b/github.py @@ -89,7 +89,7 @@ def get_ghas_status_for_repos(org, token): ) visibility = repo_data["visibility"] pushed_at = repo_data["pushed_at"] - repo = Repository(name, owner, ghas_status, visibility, pushed_at, archived) + repo = Repository(name, owner, ghas_status, visibility, pushed_at) repos.append(repo) if "next" not 
in response.links: break diff --git a/models.py b/models.py index fc77b5d..d5461c3 100644 --- a/models.py +++ b/models.py @@ -1,19 +1,11 @@ class Repository: def __init__( - self, - name, - org, - ghas_status, - visibility, - pushed_at, - archived=False, - active_committers=None, + self, name, org, ghas_status, visibility, pushed_at, active_committers=None ): self.name = name self.org = org self.ghas_status = ghas_status self.pushed_at = pushed_at - self.archived = archived self.active_committers = ( active_committers if active_committers is not None else [] ) @@ -40,11 +32,8 @@ def get_full_name(self): def get_visibility(self): return self.visibility - def get_archived(self): - return self.archived - def __str__(self): - return f"Repository: {self.name} | GHAS Status: {self.ghas_status} | Visibility: {self.visibility} | Archived: {self.archived} | Last Pushed At: {self.pushed_at} | Active Committers: {self.active_committers}" + return f"Repository: {self.name} | GHAS Status: {self.ghas_status} | Visibility: {self.visibility} | Last Pushed At: {self.pushed_at} | Active Committers: {self.active_committers}" def to_dict(self): return { @@ -52,7 +41,6 @@ def to_dict(self): "org": self.org, "ghas_status": self.ghas_status, "visibility": self.visibility, - "archived": self.archived, "pushed_at": self.pushed_at, "active_committers": self.active_committers, } diff --git a/tests/test_archived_repos.py b/tests/test_archived_repos.py index c23a6ad..ec5ce5e 100644 --- a/tests/test_archived_repos.py +++ b/tests/test_archived_repos.py @@ -11,36 +11,6 @@ class TestArchivedRepositoryFiltering(unittest.TestCase): - def test_repository_model_with_archived_field(self): - """Test that Repository model correctly handles archived field""" - # Test with archived=True - repo_archived = Repository( - name="archived-repo", - org="test-org", - ghas_status=False, - visibility="private", - pushed_at=datetime.now().isoformat(), - archived=True, - active_committers=["user1"], - ) - - 
self.assertTrue(repo_archived.get_archived()) - self.assertIn("Archived: True", str(repo_archived)) - self.assertTrue(repo_archived.to_dict()["archived"]) - - # Test with archived=False (default) - repo_active = Repository( - name="active-repo", - org="test-org", - ghas_status=True, - visibility="private", - pushed_at=datetime.now().isoformat(), - active_committers=["user1"], - ) - - self.assertFalse(repo_active.get_archived()) - self.assertIn("Archived: False", str(repo_active)) - self.assertFalse(repo_active.to_dict()["archived"]) @patch("github.requests.get") def test_get_ghas_status_filters_archived_repos(self, mock_get): @@ -87,9 +57,9 @@ def test_get_ghas_status_filters_archived_repos(self, mock_get): self.assertIn("another-active-repo", repo_names) self.assertNotIn("archived-repo", repo_names) - # Verify all returned repos have archived=False + # Verify all returned repos are non-archived (they wouldn't be returned otherwise) for repo in repos: - self.assertFalse(repo.get_archived()) + self.assertIsNotNone(repo.name) @patch("github.requests.get") def test_get_ghas_status_handles_missing_archived_field(self, mock_get): @@ -115,10 +85,9 @@ def test_get_ghas_status_handles_missing_archived_field(self, mock_get): # Call the function repos = get_ghas_status_for_repos("test-org", "fake-token") - # Verify repo is included (archived defaults to False) + # Verify repo is included (archived defaults to False when missing) self.assertEqual(len(repos), 1) self.assertEqual(repos[0].name, "repo-without-archived-field") - self.assertFalse(repos[0].get_archived()) @patch("github.requests.get") def test_get_ghas_status_filters_all_archived_repos(self, mock_get): @@ -154,22 +123,6 @@ def test_get_ghas_status_filters_all_archived_repos(self, mock_get): # Verify no repos are returned self.assertEqual(len(repos), 0) - def test_repository_model_backward_compatibility(self): - """Test that Repository model maintains backward compatibility""" - # Test creating repository without 
archived parameter (should default to False) - repo = Repository( - name="test-repo", - org="test-org", - ghas_status=True, - visibility="private", - pushed_at=datetime.now().isoformat(), - active_committers=["user1"], - ) - - self.assertFalse(repo.get_archived()) - self.assertIn("Archived: False", str(repo)) - self.assertFalse(repo.to_dict()["archived"]) - if __name__ == "__main__": suite = unittest.defaultTestLoader.loadTestsFromTestCase(