Add auto-merge workflow

xarical · harlanenciso112 · xarical · commit 59cfb4dcf3f7 · 2025-06-21T12:40:12.000-07:00
- Add toxicity-check.yml workflow with check_toxicity.py script for automatic content moderation, from Roshanjossey#108 (resolves Roshanjossey#27)
- Refactor toxicity-check.yml and check_toxicity.py to use the GitHub CLI and gemma2-9b-it served by Groq, respectively (per Roshanjossey#27 (comment))
- Rename toxicity-check.yml and check_toxicity.py to auto-pr-merge.yml and check_pr.py, respectively

Co-authored-by: harlanenciso112 <harsanenciso@gmail.com>
diff --git a/.github/workflows/auto-pr-merge.yml b/.github/workflows/auto-pr-merge.yml
@@ -0,0 +1,77 @@
+name: Auto-merge PRs
+on:
+  pull_request_target:
+    types: [opened, synchronize, reopened]
+    paths:
+      - "contributors/**" # Runs when at least one changed file is under contributors/ (other files may have changed too)
+
+jobs:
+  auto-merge:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write # this job merges pull requests via `gh pr merge`
+      pull-requests: write # this job merges and comments on pull requests
+
+    steps:
+      # pull_request_target gives this job a write-scoped token and access to
+      # secrets, so code coming from the pull request must never be executed.
+      # The default checkout is the trusted base branch; scripts/check_pr.py is
+      # run from this copy.
+      - name: Checkout trusted base branch
+        uses: actions/checkout@v4
+
+      # The pull request head is untrusted; it is checked out into ./pr and is
+      # only ever read as data.
+      - name: Checkout pull request head
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event.pull_request.head.sha }}
+          path: pr
+          fetch-depth: 2 # scripts/check_pr.py diffs HEAD^ against HEAD
+          persist-credentials: false
+
+      - name: Get a list of files changed in the pull request
+        run: |
+          # `gh pr view --json files` returns {"files":[{"path": ...}, ...]},
+          # so the paths live under .files[].path.
+          PR_FILES=$(gh pr view "$PR_URL" --json files --jq '.files[].path')
+          FILES_CHANGED=$(printf '%s\n' "$PR_FILES" | tr '\n' ' ')
+          echo "FILES_CHANGED=$FILES_CHANGED" >> "$GITHUB_ENV"
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          PR_URL: ${{ github.event.pull_request.html_url }}
+
+      - name: Check if PR modifies only contributors/{username}.html
+        id: only_contributors_check
+        uses: actions/github-script@v6
+        with:
+          script: |
+            const username = context.payload.pull_request.user.login;
+            const expected = `contributors/${username}.html`;
+            // Environment variable names are case-sensitive on the Linux runner,
+            // so the name must match the FILES_CHANGED exported above.
+            const filesChanged = (process.env.FILES_CHANGED || '').trim();
+            console.log(`Comparing "${filesChanged}" to "${expected}"`);
+            const onlyContributors = filesChanged === expected;
+            core.setOutput('only_contributors', onlyContributors);
+
+      - name: Set up Python
+        uses: actions/setup-python@v3
+        with:
+          python-version: "3.x"
+
+      - name: Install dependencies
+        run: pip install bs4 groq
+
+      # Run the base-branch copy of the checker with the PR checkout as the
+      # working directory, so its relative paths and git commands see the PR.
+      - name: Check PR contents
+        working-directory: pr
+        env:
+          GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
+        run: python "$GITHUB_WORKSPACE/scripts/check_pr.py"
+
+      # Merge only when every earlier step succeeded and the PR touches nothing
+      # but the author's own contributors/{username}.html.
+      - name: Auto-approve PR if not flagged
+        if: success() && steps.only_contributors_check.outputs.only_contributors == 'true'
+        run: gh pr merge --merge "${{ github.event.pull_request.html_url }}"
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      # Exactly one of the three steps below writes comment.txt.
+      - name: Create comment if not flagged
+        if: success() && steps.only_contributors_check.outputs.only_contributors == 'true'
+        run: echo "👏 You've successfully submitted a PR, and it's been automatically merged!" | tee comment.txt
+
+      - name: Create comment if flagged
+        if: failure()
+        run: echo "🚩 Your PR was flagged. Please review and appropriately modify your PR; if this flag is in error, wait for the maintainer to review it." | tee comment.txt
+
+      - name: Create comment if something other than contributors/{username}.html was modified
+        if: success() && steps.only_contributors_check.outputs.only_contributors != 'true'
+        run: |
+          # One printf argument per line: bash's `echo` would print "\n" literally.
+          printf '%s\n' \
+            "👏 You've successfully submitted a PR! It contains changes that require review by the maintainer before merging." \
+            "Files changed:" \
+            "$FILES_CHANGED" | tee comment.txt
+
+      # Skip posting when no step above produced comment.txt (hashFiles returns
+      # an empty string when nothing matches), instead of failing on a missing file.
+      - name: Post comment on PR
+        if: always() && hashFiles('comment.txt') != ''
+        uses: thollander/actions-comment-pull-request@v2
+        with:
+          filePath: comment.txt
diff --git a/scripts/check_pr.py b/scripts/check_pr.py
@@ -0,0 +1,61 @@
+import json
+import os
+import sys
+
+from bs4 import BeautifulSoup
+from groq import Groq
+
+
+client = Groq()
+MODEL = "gemma2-9b-it"
+SYSTEM_PROMPT = """\
+Determine whether or not the given string contains any offensive material. 
+Respond with true if the string contains any offensive material and false if it contains no offensive material.
+Respond in json format with a field \"reason\" set to an explanation and \"flag\" set to true or false.
+"""
+
+
+def text_is_toxic(text: str) -> bool:
+    """
+    Analyze toxicity of text using an LLM served by Groq
+    """
+    result = client.chat.completions.create(
+        model=MODEL,
+        messages=[
+            {
+                "role": "system",
+                "content": SYSTEM_PROMPT,
+            },
+            {
+                "role": "user",
+                "content": f"'''\n{text}\n'''", 
+            }
+        ],
+        temperature=0,
+        max_completion_tokens=128, # Limit output to ~512 characters
+        response_format={"type": "json_object"},
+    ).choices[0].message.content
+    print("[DEBUG] File content:", text.replace("\n", "\\n"))
+    print("[DEBUG] Model response:", result)
+    return json.loads(result)["flag"]
+
+
+def file_is_toxic(file_path: str) -> bool:
+    """
+    Analyze the file path and content for toxicity
+    """
+    with open(file_path) as file:
+        file_content = BeautifulSoup(
+            file.read(), # Read the HTML file
+            "html.parser",
+        ).get_text() # Extract the text from the HTML file
+    return text_is_toxic(file_path) or text_is_toxic(file_content)
+
+
+if __name__ == "__main__":
+    toxic = False
+    for file_path in os.popen("git diff --name-only HEAD^ HEAD").read().split(): # For each file in the diff,
+        if os.path.exists(file_path) and file_is_toxic(file_path): # Check it if it exists and is toxic
+            print(f"🚩 Flagged {file_path}")
+            toxic = True
+    sys.exit(1) if toxic else sys.exit(0) # Exit with a non-zero status code if toxic