diff --git a/get_repo_structure/get_repo_pull_requests.py b/get_repo_structure/get_repo_pull_requests.py
new file mode 100644
index 00000000..6aa63ecc
--- /dev/null
+++ b/get_repo_structure/get_repo_pull_requests.py
@@ -0,0 +1,215 @@
+import requests
+import json
+import time
+import os
+
+TOKEN = "TOKEN"
+HEADERS = {"Authorization": f"token {TOKEN}"}
+
+# Languages to include
+LANGUAGES = ["Go", "Java", "JavaScript", "Python"]
+
+# How many repos per language to fetch
+TOP_N_REPOS = 10
+
+# Max PRs per repo
+MAX_PRS = 100
+OUTPUT_FILE = "pull_requests_dataset.json"
+
+
+def safe_get(d, *keys, default=None):
+    """Safely get nested dict values"""
+    for key in keys:
+        if d is None or key not in d:
+            return default
+        d = d[key]
+    return d
+
+
+def get_top_repos(language, top_n=TOP_N_REPOS):
+    """Fetch top repositories by stars for a given language"""
+    url = f"https://api.github.com/search/repositories?q=language:{language}&sort=stars&order=desc&per_page={top_n}"
+    r = requests.get(url, headers=HEADERS)
+    repos = []
+    if r.status_code == 200:
+        for item in r.json()["items"]:
+            repos.append({
+                "name": item["full_name"],
+                "language": item["language"],
+                "stars": item["stargazers_count"],
+                "license": safe_get(item, "license", "spdx_id"),
+                "url": item["html_url"]
+            })
+    else:
+        print(f"Error fetching repos for {language}: {r.status_code}")
+    return repos
+
+
+def get_pull_requests(repo_full_name, max_prs=MAX_PRS):
+    """Fetch latest PRs for a repository"""
+    prs = []
+    url = f"https://api.github.com/repos/{repo_full_name}/pulls?state=closed&per_page=100"
+    r = requests.get(url, headers=HEADERS)
+    if r.status_code != 200:
+        print(f"Error fetching PRs for {repo_full_name}: {r.status_code}")
+        return prs
+
+    pr_list = r.json()[:max_prs]
+    for pr in pr_list:
+        pr_data = {
+            "id": pr.get("number"),
+            "title": pr.get("title"),
+            "body": pr.get("body"),
+            "author": safe_get(pr, "user", "login", default="ghost"),
+            "state": pr.get("state"),
+            "created_at": pr.get("created_at"),
+            "merged_at": pr.get("merged_at"),
+            "url": pr.get("html_url"),
+            "commits": [],
+            "comments": []
+        }
+
+        # Commits
+        commits_r = requests.get(pr.get("commits_url"), headers=HEADERS)
+        if commits_r.status_code == 200:
+            for commit in commits_r.json():
+                pr_data["commits"].append({
+                    "sha": commit.get("sha"),
+                    "message": safe_get(commit, "commit", "message"),
+                    "author": safe_get(commit, "commit", "author", "name", default="unknown"),
+                    "url": commit.get("html_url")
+                })
+
+        # Review comments
+        comments_url = safe_get(pr, "_links", "review_comments", "href")
+        if comments_url:
+            comments_r = requests.get(comments_url, headers=HEADERS)
+            if comments_r.status_code == 200:
+                for comment in comments_r.json():
+                    pr_data["comments"].append({
+                        "user": safe_get(comment, "user", "login", default="ghost"),
+                        "body": comment.get("body"),
+                        "created_at": comment.get("created_at"),
+                        "url": comment.get("html_url"),
+                        "line": comment.get("line"),
+                        "position": comment.get("position"),
+                        "commit_id": comment.get("commit_id"),
+                        "type": "inline" if comment.get("path") else "general"
+                    })
+
+        prs.append(pr_data)
+        time.sleep(0.5)  # avoid spamming API
+    return prs
+
+
+def build_dataset(output_file=OUTPUT_FILE):
+    """Build dataset of PRs from top repos with incremental saving"""
+    # Load existing dataset if present
+    dataset = []
+    seen_repos = set()
+    if os.path.exists(output_file):
+        with open(output_file, "r", encoding="utf-8") as f:
+            dataset = json.load(f)
+        seen_repos = {repo["repo_name"] for repo in dataset}
+
+    for lang in LANGUAGES:
+        print(f"Fetching top repos for {lang} ...")
+        top_repos = get_top_repos(lang)
+        for repo in top_repos:
+            # Check if repo is already in dataset
+            existing_repo = next((r for r in dataset if r["repo_name"] == repo["name"]), None)
+
+            # Skip if it already has PRs with comments
+            if existing_repo and existing_repo.get("pull_requests"):
+                has_comments = any(pr["comments"] for pr in existing_repo["pull_requests"])
+                if has_comments:
+                    print(f"Skipping {repo['name']} (already has PRs with comments)")
+                    continue
+                else:
+                    print(f"Refetching {repo['name']} (previously empty or no comments)")
+
+            print(f"Fetching PRs for {repo['name']} ...")
+            prs = get_pull_requests(repo["name"])
+            repo_entry = {
+                "repo_name": repo["name"],
+                "language": repo["language"],
+                "stars": repo["stars"],
+                "license": repo["license"],
+                "url": repo["url"],
+                "pull_requests": prs
+            }
+
+            if existing_repo:
+                dataset[dataset.index(existing_repo)] = repo_entry
+            else:
+                dataset.append(repo_entry)
+
+            # Save progress immediately
+            with open(output_file, "w", encoding="utf-8") as f:
+                json.dump(dataset, f, indent=2, ensure_ascii=False)
+
+    print(f"Saved dataset with {len(dataset)} repos to {output_file}")
+
+
+def fetch_specific_repos(repo_list, output_file=OUTPUT_FILE):
+    """Fetch PRs for a list of specific repositories and update the dataset incrementally"""
+    # Load existing dataset if present
+    dataset = []
+    if os.path.exists(output_file):
+        with open(output_file, "r", encoding="utf-8") as f:
+            dataset = json.load(f)
+
+    for repo_full_name in repo_list:
+        existing_repo = next((r for r in dataset if r["repo_name"] == repo_full_name), None)
+
+        # Skip if already has PRs with comments
+        if existing_repo and any(pr["comments"] for pr in existing_repo.get("pull_requests", [])):
+            print(f"Skipping {repo_full_name} (already has PRs with comments)")
+            continue
+        else:
+            print(f"Refetching {repo_full_name} (previously empty or no comments)")
+
+        print(f"Fetching PRs for {repo_full_name} ...")
+        prs = get_pull_requests(repo_full_name)
+
+        # Fetch repo info via GitHub API
+        r = requests.get(f"https://api.github.com/repos/{repo_full_name}", headers=HEADERS)
+        if r.status_code != 200:
+            print(f"Error fetching repo info for {repo_full_name}: {r.status_code}")
+            continue
+        repo_data = r.json()
+
+        repo_entry = {
+            "repo_name": repo_full_name,
+            "language": repo_data.get("language"),
+            "stars": repo_data.get("stargazers_count"),
+            "license": safe_get(repo_data, "license", "spdx_id"),
+            "url": repo_data.get("html_url"),
+            "pull_requests": prs
+        }
+
+        if existing_repo:
+            dataset[dataset.index(existing_repo)] = repo_entry
+        else:
+            dataset.append(repo_entry)
+
+        # Save progress immediately
+        with open(output_file, "w", encoding="utf-8") as f:
+            json.dump(dataset, f, indent=2, ensure_ascii=False)
+
+    print(f"Updated dataset with {len(dataset)} repos to {output_file}")
+
+
+# ----------------------------
+# Run main dataset builder
+# ----------------------------
+build_dataset()
+
+# ----------------------------
+# Fetch specific Atlassian repos
+# ----------------------------
+SPECIFIC_REPOS = [
+    "atlassian/atlascode",
+    "atlassian/dc-app-performance-toolkit"
+]
+fetch_specific_repos(SPECIFIC_REPOS)