From ac8053ca4bdd65079940ab85f5dcf99412f28cab Mon Sep 17 00:00:00 2001 From: Denis Averin <59285247+Denis-Averin@users.noreply.github.com> Date: Thu, 16 Oct 2025 16:04:09 +0700 Subject: [PATCH 1/6] Split into files --- scripts/check-urls.py | 149 ++++++++-------------------------------- scripts/curl_wrapper.py | 86 +++++++++++++++++++++++ scripts/url_checker.py | 82 ++++++++++++++++++++++ 3 files changed, 196 insertions(+), 121 deletions(-) create mode 100644 scripts/curl_wrapper.py create mode 100644 scripts/url_checker.py diff --git a/scripts/check-urls.py b/scripts/check-urls.py index 846b6ee..421745b 100644 --- a/scripts/check-urls.py +++ b/scripts/check-urls.py @@ -1,17 +1,18 @@ import contextlib import fileinput +import signal import os import re -import subprocess import sys import threading -import time import typing import urllib.parse -from queue import Queue, Empty from github_job_summary import JobSummary from subdomains import Subdomains +from curl_wrapper import EXIT_CODES as CURL_EXIT_CODES +from curl_wrapper import CurlWrapper +from url_checker import UrlChecker """ Read file names from stdin (feed from git ls-files) @@ -20,33 +21,16 @@ Check them with CURL """ -# To avoid 403 responses -USER_AGENT = "Googlebot/2.1 (+http://www.google.com/bot.html)" - -CONNECT_TIMEOUT_SEC = 5 -MAX_TIME_SEC = 10 JOIN_TIMEOUT_SEC = 120 - -class Curl: - """ - See: https://curl.se/libcurl/c/libcurl-errors.html - """ - - CURL_STDERR_HTTP_RE = re.compile(r"^curl: \(22\) The requested URL returned error: (?P\d+)") - OK = 0 - COULDNT_RESOLVE_HOST = 6 - HTTP_RETURNED_ERROR = 22 - - CURL_EXIT_CODES_AND_HTTP_CODES = { - "https://api.aspose.cloud/connect/token": (Curl.HTTP_RETURNED_ERROR, 400), - "https://api.aspose.cloud/v3.0": (Curl.HTTP_RETURNED_ERROR, 404), - "https://api.aspose.cloud/v4.0": (Curl.HTTP_RETURNED_ERROR, 404), - "https://api.aspose.cloud/v4.0/": (Curl.HTTP_RETURNED_ERROR, 404), - "https://id.aspose.cloud/connect/token": (Curl.HTTP_RETURNED_ERROR, 400), + "https://api.aspose.cloud/connect/token": (CURL_EXIT_CODES.HTTP_RETURNED_ERROR, 400), + "https://api.aspose.cloud/v3.0": (CURL_EXIT_CODES.HTTP_RETURNED_ERROR, 404), + "https://api.aspose.cloud/v4.0": (CURL_EXIT_CODES.HTTP_RETURNED_ERROR, 404), + "https://api.aspose.cloud/v4.0/": (CURL_EXIT_CODES.HTTP_RETURNED_ERROR, 404), + "https://id.aspose.cloud/connect/token": (CURL_EXIT_CODES.HTTP_RETURNED_ERROR, 400), # TODO: Temporary fix - "https://dashboard.aspose.cloud/applications": (Curl.HTTP_RETURNED_ERROR, 404), + "https://dashboard.aspose.cloud/applications": (CURL_EXIT_CODES.HTTP_RETURNED_ERROR, 404), } REGEX_TO_IGNORE: list[re.Pattern[str]] = [ @@ -170,59 +154,7 @@ def text_extractor(files: list[str]) -> typing.Generator[tuple[str, str], None, raise -class Task: - _proc: subprocess.Popen[bytes] - _stderr: str | None - - def __init__(self, url: str): - self.url = url - self._proc = subprocess.Popen( - [ - "curl", - "-sSf", - "--output", - "-", - "--connect-timeout", - str(CONNECT_TIMEOUT_SEC), - "--max-time", - str(MAX_TIME_SEC), - "--user-agent", - USER_AGENT, - self.url, - ], - stdout=open(os.devnull, "w"), - stderr=subprocess.PIPE, - ) - self._stderr = None - self._started = time.time() - - @property - def running(self) -> bool: - return self._proc.poll() is None - - @property - def ret_code(self) -> int: - assert not self.running - return self._proc.returncode - - @property - def stderr(self) -> str: - assert not self.running - if self._stderr is None: - self._stderr = self._proc.stderr.read().decode() - return self._stderr - - 
@property - def age(self) -> float: - return time.time() - self._started - - -def create_new_task(url: str) -> Task: - # print("Create task:", url) - return Task(url) - - -def process_finished_task(task: Task) -> None: +def process_finished_task(task) -> None: # print("Finish task:", task.url) expected_ret_code, expected_http_code = CURL_EXIT_CODES_AND_HTTP_CODES.get(task.url, (0, None)) if task.ret_code == 0 or task.ret_code == expected_ret_code: @@ -230,9 +162,9 @@ def process_finished_task(task: Task) -> None: JOB_SUMMARY.add_success(task.url) return - if task.ret_code == Curl.HTTP_RETURNED_ERROR and expected_http_code: + if task.ret_code == CURL_EXIT_CODES.HTTP_RETURNED_ERROR and expected_http_code: # Try parse stderr for HTTP code - match = Curl.CURL_STDERR_HTTP_RE.match(task.stderr) + match = CurlWrapper.CURL_STDERR_HTTP_RE.match(task.stderr) assert match, "Unexpected output: %s" % task.stderr http_code = int(match.groupdict()["http_code"]) if http_code == expected_http_code: @@ -247,56 +179,31 @@ def process_finished_task(task: Task) -> None: JOB_SUMMARY.add_error(f"Broken URL '{task.url}': {task.stderr}Files: {EXTRACTED_URLS_WITH_FILES[task.url]}") -WORKER_QUEUE: Queue[str | None] = Queue() - - -def url_checker(num_workers: int = 8) -> None: - next_report_age_sec = 5 - workers: list[Task | None] = [None for _ in range(num_workers)] - - queue_is_empty = False - - while not queue_is_empty or any(workers): - for i, task in enumerate(workers): - if task is None: - continue - if not task.running: - process_finished_task(task) - workers[i] = None - elif task.age > next_report_age_sec: - print("Long request: '%s' %.2fs" % (task.url, task.age)) - next_report_age_sec += 3 - - if not queue_is_empty: - for i in (i for (i, w) in enumerate(workers) if w is None): - # Avoid blocking forever if the queue is currently empty - try: - item = WORKER_QUEUE.get_nowait() - except Empty: - break - if item is None: - queue_is_empty = True - print("--- url queue is over ---") - break - url = item - workers[i] = create_new_task(url) - time.sleep(0.2) - print("Worker finished") - - JOB_SUMMARY = JobSummary(os.environ.get("GITHUB_STEP_SUMMARY", "step_summary.md")) JOB_SUMMARY.add_header("Test all URLs") def main(files: list[str]) -> int: - checker = threading.Thread(target=url_checker, daemon=True) + url_checker = UrlChecker( + on_finish=process_finished_task, + ) + + # Setup signal handlers for graceful shutdown + def _handle_signal(_sig: int, _frame: typing.Any) -> None: + url_checker.stop() + + with contextlib.suppress(Exception): + signal.signal(signal.SIGINT, _handle_signal) + signal.signal(signal.SIGTERM, _handle_signal) + + checker = threading.Thread(target=url_checker.run, daemon=True) checker.start() for filename, text in text_extractor(files): for url in url_extractor(text, filename): # print("In:", url) - WORKER_QUEUE.put_nowait(url) - WORKER_QUEUE.put_nowait(None) + url_checker.add_url(url) + url_checker.close() checker.join(timeout=JOIN_TIMEOUT_SEC) if checker.is_alive(): print( diff --git a/scripts/curl_wrapper.py b/scripts/curl_wrapper.py new file mode 100644 index 0000000..336a093 --- /dev/null +++ b/scripts/curl_wrapper.py @@ -0,0 +1,86 @@ +import contextlib +import os +import re +import subprocess +import time +from typing import Optional + +# To avoid 403 responses (default); caller may override per instance +DEFAULT_USER_AGENT = "Googlebot/2.1 (+http://www.google.com/bot.html)" + + +class EXIT_CODES: + OK = 0 + COULDNT_RESOLVE_HOST = 6 + HTTP_RETURNED_ERROR = 22 + + +class CurlWrapper: + 
""" + Encapsulates a single curl execution with timeouts and helpers. + See: https://curl.se/libcurl/c/libcurl-errors.html + """ + + CURL_STDERR_HTTP_RE = re.compile(r"^curl: \(22\) The requested URL returned error: (?P\d+)") + + def __init__( + self, + url: str, + *, + user_agent: str = DEFAULT_USER_AGENT, + connect_timeout: int = 5, + max_time: int = 10, + ) -> None: + self.url = url + self._stderr: Optional[str] = None + self._started = time.time() + self._proc = subprocess.Popen( + [ + "curl", + "-sSf", + "--output", + "-", + "--connect-timeout", + str(connect_timeout), + "--max-time", + str(max_time), + "--user-agent", + user_agent, + self.url, + ], + stdout=open(os.devnull, "w"), + stderr=subprocess.PIPE, + ) + + @property + def running(self) -> bool: + return self._proc.poll() is None + + @property + def ret_code(self) -> int: + assert not self.running + return self._proc.returncode + + @property + def stderr(self) -> str: + assert not self.running + if self._stderr is None: + assert self._proc.stderr is not None + self._stderr = self._proc.stderr.read().decode() + return self._stderr + + @property + def age(self) -> float: + return time.time() - self._started + + def terminate(self, timeout: float | None = None) -> None: + try: + self._proc.terminate() + if timeout is not None: + self._proc.wait(timeout=timeout) + except Exception: + pass + + def kill(self) -> None: + with contextlib.suppress(Exception): + self._proc.kill() diff --git a/scripts/url_checker.py b/scripts/url_checker.py new file mode 100644 index 0000000..559e0fe --- /dev/null +++ b/scripts/url_checker.py @@ -0,0 +1,82 @@ +import contextlib +import time +from queue import Queue, Empty +from typing import Callable, Optional + +from curl_wrapper import CurlWrapper + + +class UrlChecker: + def __init__( + self, + *, + num_workers: int = 8, + hard_kill_sec: int = 15, + on_finish: Optional[Callable[[CurlWrapper], None]] = None, + worker_factory: Optional[Callable[[str], CurlWrapper]] = None, + ) -> None: + self.num_workers = num_workers + self.hard_kill_sec = hard_kill_sec + self.on_finish = on_finish + self.worker_factory = worker_factory or (lambda url: CurlWrapper(url)) + + self.queue: Queue[str | None] = Queue() + self.workers: list[CurlWrapper | None] = [None for _ in range(self.num_workers)] + self.stop_event = False + self.next_report_age_sec = 5 + + def add_url(self, url: str) -> None: + self.queue.put_nowait(url) + + def close(self) -> None: + self.queue.put_nowait(None) + + def stop(self) -> None: + self.stop_event = True + with contextlib.suppress(Exception): + self.queue.put_nowait(None) + + def run(self) -> None: + queue_is_empty = False + while not queue_is_empty or any(self.workers): + # Graceful stop: cancel running curls + if self.stop_event: + queue_is_empty = True + for t in self.workers: + if t is not None and t.running: + t.terminate(timeout=1) + if t.running: + t.kill() + + # Tick workers + for i, task in enumerate(self.workers): + if task is None: + continue + if not task.running: + if self.on_finish is not None: + self.on_finish(task) + self.workers[i] = None + elif task.age > self.next_report_age_sec: + print("Long request: '%s' %.2fs" % (task.url, task.age)) + self.next_report_age_sec += 3 + if task.age > self.hard_kill_sec: + task.terminate(timeout=2) + if task.running: + task.kill() + print("Killed long request: '%s' %.2fs" % (task.url, task.age)) + + # Fill idle workers + if not queue_is_empty: + for i in (i for (i, w) in enumerate(self.workers) if w is None): + try: + item = 
self.queue.get_nowait() + except Empty: + break + if item is None: + queue_is_empty = True + print("--- url queue is over ---") + break + url = item + self.workers[i] = self.worker_factory(url) + time.sleep(0.2) + print("Worker finished") From 6a4f576a6982b6692adac9071f613a59159bf3c6 Mon Sep 17 00:00:00 2001 From: Denis Averin <59285247+Denis-Averin@users.noreply.github.com> Date: Thu, 16 Oct 2025 18:23:20 +0700 Subject: [PATCH 2/6] Refactoring --- scripts/check-urls.py | 52 +++++++++++--------------------- scripts/url_checker.py | 67 ++++++++++++++++++++++++++++++++++++++---- 2 files changed, 79 insertions(+), 40 deletions(-) diff --git a/scripts/check-urls.py b/scripts/check-urls.py index 421745b..96bde0b 100644 --- a/scripts/check-urls.py +++ b/scripts/check-urls.py @@ -11,7 +11,6 @@ from github_job_summary import JobSummary from subdomains import Subdomains from curl_wrapper import EXIT_CODES as CURL_EXIT_CODES -from curl_wrapper import CurlWrapper from url_checker import UrlChecker """ @@ -21,9 +20,9 @@ Check them with CURL """ -JOIN_TIMEOUT_SEC = 120 +JOIN_TIMEOUT_SEC: int = 120 -CURL_EXIT_CODES_AND_HTTP_CODES = { +CURL_EXIT_CODES_AND_HTTP_CODES: dict[str, tuple[int, int | None]] = { "https://api.aspose.cloud/connect/token": (CURL_EXIT_CODES.HTTP_RETURNED_ERROR, 400), "https://api.aspose.cloud/v3.0": (CURL_EXIT_CODES.HTTP_RETURNED_ERROR, 404), "https://api.aspose.cloud/v4.0": (CURL_EXIT_CODES.HTTP_RETURNED_ERROR, 404), @@ -45,7 +44,7 @@ ] ) -IGNORE_DOMAINS = Subdomains( +IGNORE_DOMAINS: Subdomains = Subdomains( [ ".android.com", ".apache.org", @@ -82,10 +81,10 @@ ] ) -URL_END_CHARS = r",#\)\"'<>\*\s\\" -URL_RE_PATTERN = r"(https*://[^{0}]+)[{0}]?".format(URL_END_CHARS) +URL_END_CHARS: str = r",#\)\"'<>\*\s\\" +URL_RE_PATTERN: str = r"(https*://[^{0}]+)[{0}]?".format(URL_END_CHARS) # print(URL_RE_PATTERN) -EXTRACT_URL_REGEX = re.compile(URL_RE_PATTERN, re.MULTILINE) +EXTRACT_URL_REGEX: re.Pattern[str] = re.compile(URL_RE_PATTERN, re.MULTILINE) # URL : [Files] EXTRACTED_URLS_WITH_FILES: dict[str, list[str]] = {k: [] for k in URLS_TO_IGNORE} @@ -129,7 +128,7 @@ def url_extractor(text: str, filename: str) -> typing.Generator[str, None, None] EXTRACTED_URLS_WITH_FILES[url].append(filename) -FILES_TO_IGNORE = frozenset( +FILES_TO_IGNORE: frozenset[str] = frozenset( [ ".jar", ".jar", @@ -154,38 +153,13 @@ def text_extractor(files: list[str]) -> typing.Generator[tuple[str, str], None, raise -def process_finished_task(task) -> None: - # print("Finish task:", task.url) - expected_ret_code, expected_http_code = CURL_EXIT_CODES_AND_HTTP_CODES.get(task.url, (0, None)) - if task.ret_code == 0 or task.ret_code == expected_ret_code: - print("OK:", "'%s' %.2fs" % (task.url, task.age)) - JOB_SUMMARY.add_success(task.url) - return - - if task.ret_code == CURL_EXIT_CODES.HTTP_RETURNED_ERROR and expected_http_code: - # Try parse stderr for HTTP code - match = CurlWrapper.CURL_STDERR_HTTP_RE.match(task.stderr) - assert match, "Unexpected output: %s" % task.stderr - http_code = int(match.groupdict()["http_code"]) - if http_code == expected_http_code: - print("OK HTTP:", "'%s' %.2fs" % (task.url, task.age)) - JOB_SUMMARY.add_success(task.url) - return - - print( - "Expected %d got %d for '%s': %s" % (expected_ret_code, task.ret_code, task.url, task.stderr), - file=sys.stderr, - ) - JOB_SUMMARY.add_error(f"Broken URL '{task.url}': {task.stderr}Files: {EXTRACTED_URLS_WITH_FILES[task.url]}") - - -JOB_SUMMARY = JobSummary(os.environ.get("GITHUB_STEP_SUMMARY", "step_summary.md")) +JOB_SUMMARY: JobSummary = 
JobSummary(os.environ.get("GITHUB_STEP_SUMMARY", "step_summary.md")) JOB_SUMMARY.add_header("Test all URLs") def main(files: list[str]) -> int: url_checker = UrlChecker( - on_finish=process_finished_task, + expectations=CURL_EXIT_CODES_AND_HTTP_CODES, ) # Setup signal handlers for graceful shutdown @@ -212,6 +186,14 @@ def _handle_signal(_sig: int, _frame: typing.Any) -> None: flush=True, ) + # Collect results and write summary + for res in url_checker.results: + if res.ok: + JOB_SUMMARY.add_success(res.url) + else: + files = EXTRACTED_URLS_WITH_FILES.get(res.url, []) + JOB_SUMMARY.add_error(f"Broken URL '{res.url}': {res.stderr}Files: {files}") + JOB_SUMMARY.finalize("Checked {total} failed **{failed}**\nGood={success}") if JOB_SUMMARY.has_errors: print(JOB_SUMMARY, file=sys.stderr, flush=True) diff --git a/scripts/url_checker.py b/scripts/url_checker.py index 559e0fe..22ef367 100644 --- a/scripts/url_checker.py +++ b/scripts/url_checker.py @@ -1,9 +1,23 @@ import contextlib +import sys import time +from dataclasses import dataclass from queue import Queue, Empty from typing import Callable, Optional -from curl_wrapper import CurlWrapper +from curl_wrapper import CurlWrapper, EXIT_CODES + + +@dataclass +class CheckResult: + url: str + ok: bool + ret_code: int + age: float + stderr: str + expected_ret_code: int + expected_http_code: int | None + http_code: int | None class UrlChecker: @@ -12,18 +26,19 @@ def __init__( *, num_workers: int = 8, hard_kill_sec: int = 15, - on_finish: Optional[Callable[[CurlWrapper], None]] = None, + expectations: dict[str, tuple[int, int | None]] | None = None, worker_factory: Optional[Callable[[str], CurlWrapper]] = None, ) -> None: self.num_workers = num_workers self.hard_kill_sec = hard_kill_sec - self.on_finish = on_finish + self.expectations = expectations or {} self.worker_factory = worker_factory or (lambda url: CurlWrapper(url)) self.queue: Queue[str | None] = Queue() self.workers: list[CurlWrapper | None] = [None for _ in range(self.num_workers)] self.stop_event = False self.next_report_age_sec = 5 + self.results: list[CheckResult] = [] def add_url(self, url: str) -> None: self.queue.put_nowait(url) @@ -53,8 +68,7 @@ def run(self) -> None: if task is None: continue if not task.running: - if self.on_finish is not None: - self.on_finish(task) + self._process_finished(task) self.workers[i] = None elif task.age > self.next_report_age_sec: print("Long request: '%s' %.2fs" % (task.url, task.age)) @@ -80,3 +94,46 @@ def run(self) -> None: self.workers[i] = self.worker_factory(url) time.sleep(0.2) print("Worker finished") + + def _process_finished(self, task: CurlWrapper) -> None: + expected_ret_code, expected_http_code = self.expectations.get(task.url, (0, None)) + + ok: bool = False + http_code_val: int | None = None + stderr_out: str = task.stderr + + # Fast path: exact expected ret code or success + if task.ret_code == 0 or task.ret_code == expected_ret_code: + print("OK:", "'%s' %.2fs" % (task.url, task.age)) + ok = True + stderr_out = "" + else: + # If curl reports HTTP error (22), attempt to parse HTTP code to compare + if task.ret_code == EXIT_CODES.HTTP_RETURNED_ERROR and expected_http_code: + match = CurlWrapper.CURL_STDERR_HTTP_RE.match(task.stderr) + assert match, "Unexpected output: %s" % task.stderr + http_code_val = int(match.groupdict()["http_code"]) + if http_code_val == expected_http_code: + print("OK HTTP:", "'%s' %.2fs" % (task.url, task.age)) + ok = True + + if not ok: + # Otherwise, report error + print( + "Expected %d got %d for '%s': 
%s" % (expected_ret_code, task.ret_code, task.url, task.stderr), + file=sys.stderr, + ) + + # Append exactly once + self.results.append( + CheckResult( + url=task.url, + ok=ok, + ret_code=task.ret_code, + age=task.age, + stderr=stderr_out, + expected_ret_code=expected_ret_code, + expected_http_code=expected_http_code, + http_code=http_code_val, + ) + ) From fed631aa86b9fa42d4d8f4031884318542138824 Mon Sep 17 00:00:00 2001 From: Denis Averin <59285247+Denis-Averin@users.noreply.github.com> Date: Thu, 16 Oct 2025 19:24:19 +0700 Subject: [PATCH 3/6] Using context --- scripts/check-urls.py | 47 +++++++++++++-------------------------- scripts/url_checker.py | 50 ++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 63 insertions(+), 34 deletions(-) diff --git a/scripts/check-urls.py b/scripts/check-urls.py index 96bde0b..cac15c1 100644 --- a/scripts/check-urls.py +++ b/scripts/check-urls.py @@ -20,7 +20,7 @@ Check them with CURL """ -JOIN_TIMEOUT_SEC: int = 120 +JOIN_TIMEOUT_SEC = 120 CURL_EXIT_CODES_AND_HTTP_CODES: dict[str, tuple[int, int | None]] = { "https://api.aspose.cloud/connect/token": (CURL_EXIT_CODES.HTTP_RETURNED_ERROR, 400), @@ -36,7 +36,7 @@ re.compile(r"^https://github\.com/(?P[^/]+)/(?P[^/]+)/(?:blob|issues)/\S+$"), ] -URLS_TO_IGNORE: frozenset[str] = frozenset( +URLS_TO_IGNORE = frozenset( [ "https://api.aspose.cloud", "https://www.aspose.cloud/404", @@ -44,7 +44,7 @@ ] ) -IGNORE_DOMAINS: Subdomains = Subdomains( +IGNORE_DOMAINS = Subdomains( [ ".android.com", ".apache.org", @@ -81,10 +81,10 @@ ] ) -URL_END_CHARS: str = r",#\)\"'<>\*\s\\" -URL_RE_PATTERN: str = r"(https*://[^{0}]+)[{0}]?".format(URL_END_CHARS) +URL_END_CHARS = r",#\)\"'<>\*\s\\" +URL_RE_PATTERN = r"(https*://[^{0}]+)[{0}]?".format(URL_END_CHARS) # print(URL_RE_PATTERN) -EXTRACT_URL_REGEX: re.Pattern[str] = re.compile(URL_RE_PATTERN, re.MULTILINE) +EXTRACT_URL_REGEX = re.compile(URL_RE_PATTERN, re.MULTILINE) # URL : [Files] EXTRACTED_URLS_WITH_FILES: dict[str, list[str]] = {k: [] for k in URLS_TO_IGNORE} @@ -128,7 +128,7 @@ def url_extractor(text: str, filename: str) -> typing.Generator[str, None, None] EXTRACTED_URLS_WITH_FILES[url].append(filename) -FILES_TO_IGNORE: frozenset[str] = frozenset( +FILES_TO_IGNORE = frozenset( [ ".jar", ".jar", @@ -153,7 +153,7 @@ def text_extractor(files: list[str]) -> typing.Generator[tuple[str, str], None, raise -JOB_SUMMARY: JobSummary = JobSummary(os.environ.get("GITHUB_STEP_SUMMARY", "step_summary.md")) +JOB_SUMMARY = JobSummary(os.environ.get("GITHUB_STEP_SUMMARY", "step_summary.md")) JOB_SUMMARY.add_header("Test all URLs") @@ -162,32 +162,15 @@ def main(files: list[str]) -> int: expectations=CURL_EXIT_CODES_AND_HTTP_CODES, ) - # Setup signal handlers for graceful shutdown - def _handle_signal(_sig: int, _frame: typing.Any) -> None: - url_checker.stop() - - with contextlib.suppress(Exception): - signal.signal(signal.SIGINT, _handle_signal) - signal.signal(signal.SIGTERM, _handle_signal) - - checker = threading.Thread(target=url_checker.run, daemon=True) - checker.start() - - for filename, text in text_extractor(files): - for url in url_extractor(text, filename): - # print("In:", url) - url_checker.add_url(url) - url_checker.close() - checker.join(timeout=JOIN_TIMEOUT_SEC) - if checker.is_alive(): - print( - f"URL checker did not finish within {JOIN_TIMEOUT_SEC}s; exiting early.", - file=sys.stderr, - flush=True, - ) + with url_checker.start() as checker: + for filename, text in text_extractor(files): + for url in url_extractor(text, filename): + checker.add_url(url) + 
checker.wait(JOIN_TIMEOUT_SEC) + results = url_checker.results # Collect results and write summary - for res in url_checker.results: + for res in results: if res.ok: JOB_SUMMARY.add_success(res.url) else: diff --git a/scripts/url_checker.py b/scripts/url_checker.py index 22ef367..91cde7a 100644 --- a/scripts/url_checker.py +++ b/scripts/url_checker.py @@ -3,7 +3,9 @@ import time from dataclasses import dataclass from queue import Queue, Empty -from typing import Callable, Optional +from typing import Callable, Optional, Iterable +from types import TracebackType +import threading from curl_wrapper import CurlWrapper, EXIT_CODES @@ -39,12 +41,16 @@ def __init__( self.stop_event = False self.next_report_age_sec = 5 self.results: list[CheckResult] = [] + self._thread: threading.Thread | None = None + self._closed: bool = False def add_url(self, url: str) -> None: self.queue.put_nowait(url) def close(self) -> None: - self.queue.put_nowait(None) + if not self._closed: + self._closed = True + self.queue.put_nowait(None) def stop(self) -> None: self.stop_event = True @@ -95,6 +101,46 @@ def run(self) -> None: time.sleep(0.2) print("Worker finished") + # Context management and user-friendly API + def start(self) -> "UrlChecker": + if self._thread is not None: + return self + self._thread = threading.Thread(target=self.run, daemon=True) + self._thread.start() + return self + + def __enter__(self) -> "UrlChecker": + return self.start() + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc: BaseException | None, + tb: TracebackType | None, + ) -> None: + # Ensure we signal end of input and wait for completion + self.close() + self.wait() + + def wait(self, join_timeout_sec: float | None = None) -> None: + # Ensure end-of-input signaled before waiting + self.close() + t = self._thread + if t is None: + return + if join_timeout_sec is not None: + t.join(timeout=join_timeout_sec) + if t.is_alive(): + # Try to stop gracefully and inform user + self.stop() + print( + f"URL checker did not finish within {join_timeout_sec}s; exiting early.", + file=sys.stderr, + flush=True, + ) + else: + t.join() + def _process_finished(self, task: CurlWrapper) -> None: expected_ret_code, expected_http_code = self.expectations.get(task.url, (0, None)) From 5351dd28d5e27b406ade5844d929b0369909a548 Mon Sep 17 00:00:00 2001 From: Denis Averin <59285247+Denis-Averin@users.noreply.github.com> Date: Thu, 16 Oct 2025 19:37:38 +0700 Subject: [PATCH 4/6] Renames --- scripts/check-urls.py | 24 +++++++++++------------- scripts/curl_wrapper.py | 7 +++++-- scripts/url_checker.py | 22 +++++++++++----------- 3 files changed, 27 insertions(+), 26 deletions(-) diff --git a/scripts/check-urls.py b/scripts/check-urls.py index cac15c1..00beed9 100644 --- a/scripts/check-urls.py +++ b/scripts/check-urls.py @@ -1,16 +1,14 @@ import contextlib import fileinput -import signal import os import re import sys -import threading import typing import urllib.parse from github_job_summary import JobSummary from subdomains import Subdomains -from curl_wrapper import EXIT_CODES as CURL_EXIT_CODES +from curl_wrapper import CurlExitCodes from url_checker import UrlChecker """ @@ -22,14 +20,14 @@ JOIN_TIMEOUT_SEC = 120 -CURL_EXIT_CODES_AND_HTTP_CODES: dict[str, tuple[int, int | None]] = { - "https://api.aspose.cloud/connect/token": (CURL_EXIT_CODES.HTTP_RETURNED_ERROR, 400), - "https://api.aspose.cloud/v3.0": (CURL_EXIT_CODES.HTTP_RETURNED_ERROR, 404), - "https://api.aspose.cloud/v4.0": (CURL_EXIT_CODES.HTTP_RETURNED_ERROR, 404), - 
"https://api.aspose.cloud/v4.0/": (CURL_EXIT_CODES.HTTP_RETURNED_ERROR, 404), - "https://id.aspose.cloud/connect/token": (CURL_EXIT_CODES.HTTP_RETURNED_ERROR, 400), +EXIT_CODE_EXPECTATIONS: dict[str, tuple[int, int | None]] = { + "https://api.aspose.cloud/connect/token": (CurlExitCodes.HTTP_RETURNED_ERROR, 400), + "https://api.aspose.cloud/v3.0": (CurlExitCodes.HTTP_RETURNED_ERROR, 404), + "https://api.aspose.cloud/v4.0": (CurlExitCodes.HTTP_RETURNED_ERROR, 404), + "https://api.aspose.cloud/v4.0/": (CurlExitCodes.HTTP_RETURNED_ERROR, 404), + "https://id.aspose.cloud/connect/token": (CurlExitCodes.HTTP_RETURNED_ERROR, 400), # TODO: Temporary fix - "https://dashboard.aspose.cloud/applications": (CURL_EXIT_CODES.HTTP_RETURNED_ERROR, 404), + "https://dashboard.aspose.cloud/applications": (CurlExitCodes.HTTP_RETURNED_ERROR, 404), } REGEX_TO_IGNORE: list[re.Pattern[str]] = [ @@ -159,7 +157,7 @@ def text_extractor(files: list[str]) -> typing.Generator[tuple[str, str], None, def main(files: list[str]) -> int: url_checker = UrlChecker( - expectations=CURL_EXIT_CODES_AND_HTTP_CODES, + expectations=EXIT_CODE_EXPECTATIONS, ) with url_checker.start() as checker: @@ -174,8 +172,8 @@ def main(files: list[str]) -> int: if res.ok: JOB_SUMMARY.add_success(res.url) else: - files = EXTRACTED_URLS_WITH_FILES.get(res.url, []) - JOB_SUMMARY.add_error(f"Broken URL '{res.url}': {res.stderr}Files: {files}") + src_files = EXTRACTED_URLS_WITH_FILES.get(res.url, []) + JOB_SUMMARY.add_error(f"Broken URL '{res.url}': {res.stderr}Files: {src_files}") JOB_SUMMARY.finalize("Checked {total} failed **{failed}**\nGood={success}") if JOB_SUMMARY.has_errors: diff --git a/scripts/curl_wrapper.py b/scripts/curl_wrapper.py index 336a093..a11734e 100644 --- a/scripts/curl_wrapper.py +++ b/scripts/curl_wrapper.py @@ -9,7 +9,11 @@ DEFAULT_USER_AGENT = "Googlebot/2.1 (+http://www.google.com/bot.html)" -class EXIT_CODES: +class CurlExitCodes: + """ + See: https://curl.se/libcurl/c/libcurl-errors.html + """ + OK = 0 COULDNT_RESOLVE_HOST = 6 HTTP_RETURNED_ERROR = 22 @@ -18,7 +22,6 @@ class EXIT_CODES: class CurlWrapper: """ Encapsulates a single curl execution with timeouts and helpers. 
- See: https://curl.se/libcurl/c/libcurl-errors.html """ CURL_STDERR_HTTP_RE = re.compile(r"^curl: \(22\) The requested URL returned error: (?P\d+)") diff --git a/scripts/url_checker.py b/scripts/url_checker.py index 91cde7a..697a453 100644 --- a/scripts/url_checker.py +++ b/scripts/url_checker.py @@ -1,13 +1,13 @@ import contextlib import sys +import threading import time from dataclasses import dataclass from queue import Queue, Empty -from typing import Callable, Optional, Iterable from types import TracebackType -import threading +from typing import Callable, Optional -from curl_wrapper import CurlWrapper, EXIT_CODES +from curl_wrapper import CurlWrapper, CurlExitCodes @dataclass @@ -47,17 +47,17 @@ def __init__( def add_url(self, url: str) -> None: self.queue.put_nowait(url) - def close(self) -> None: + def _close(self) -> None: if not self._closed: self._closed = True self.queue.put_nowait(None) - def stop(self) -> None: + def _stop(self) -> None: self.stop_event = True with contextlib.suppress(Exception): self.queue.put_nowait(None) - def run(self) -> None: + def _run(self) -> None: queue_is_empty = False while not queue_is_empty or any(self.workers): # Graceful stop: cancel running curls @@ -105,7 +105,7 @@ def run(self) -> None: def start(self) -> "UrlChecker": if self._thread is not None: return self - self._thread = threading.Thread(target=self.run, daemon=True) + self._thread = threading.Thread(target=self._run, daemon=True) self._thread.start() return self @@ -119,12 +119,12 @@ def __exit__( tb: TracebackType | None, ) -> None: # Ensure we signal end of input and wait for completion - self.close() + self._close() self.wait() def wait(self, join_timeout_sec: float | None = None) -> None: # Ensure end-of-input signaled before waiting - self.close() + self._close() t = self._thread if t is None: return @@ -132,7 +132,7 @@ def wait(self, join_timeout_sec: float | None = None) -> None: t.join(timeout=join_timeout_sec) if t.is_alive(): # Try to stop gracefully and inform user - self.stop() + self._stop() print( f"URL checker did not finish within {join_timeout_sec}s; exiting early.", file=sys.stderr, @@ -155,7 +155,7 @@ def _process_finished(self, task: CurlWrapper) -> None: stderr_out = "" else: # If curl reports HTTP error (22), attempt to parse HTTP code to compare - if task.ret_code == EXIT_CODES.HTTP_RETURNED_ERROR and expected_http_code: + if task.ret_code == CurlExitCodes.HTTP_RETURNED_ERROR and expected_http_code: match = CurlWrapper.CURL_STDERR_HTTP_RE.match(task.stderr) assert match, "Unexpected output: %s" % task.stderr http_code_val = int(match.groupdict()["http_code"]) From a5e34049189653b89bedc8433497e34e1dec1636 Mon Sep 17 00:00:00 2001 From: Denis Averin Date: Thu, 30 Oct 2025 18:46:49 +0700 Subject: [PATCH 5/6] Code cleanup --- scripts/check_all_urls.sh | 3 --- 1 file changed, 3 deletions(-) diff --git a/scripts/check_all_urls.sh b/scripts/check_all_urls.sh index c7b58e1..79f9f59 100755 --- a/scripts/check_all_urls.sh +++ b/scripts/check_all_urls.sh @@ -5,9 +5,6 @@ set -euo pipefail SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" ROOT_DIR="$( cd "${SCRIPT_DIR}/.." 
&> /dev/null && pwd )" -check_file () { - echo "$1" -} pushd "${ROOT_DIR}" git ls-files --recurse-submodules --exclude-standard --full-name | grep -v 'package-lock.json$' | python "${SCRIPT_DIR}/check-urls.py" popd From 8601d4d770917176aa1872f0188dc8938b0f7b65 Mon Sep 17 00:00:00 2001 From: Denis Averin Date: Thu, 30 Oct 2025 23:03:49 +0700 Subject: [PATCH 6/6] Added max-redirects --- scripts/curl_wrapper.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/scripts/curl_wrapper.py b/scripts/curl_wrapper.py index a11734e..95d01e7 100644 --- a/scripts/curl_wrapper.py +++ b/scripts/curl_wrapper.py @@ -33,6 +33,7 @@ def __init__( user_agent: str = DEFAULT_USER_AGENT, connect_timeout: int = 5, max_time: int = 10, + max_redirects: int = 3, ) -> None: self.url = url self._stderr: Optional[str] = None @@ -41,14 +42,20 @@ def __init__( [ "curl", "-sSf", + "-L", # follow redirects + "--max-redirs", + f"{max_redirects}", # limit number of redirects + # "--proto", "=https", # (optional) only allow https for the initial URL + "--proto-redir", + "=all,https", # only allow https after redirects; http will fail "--output", - "-", + "-", # discard body "--connect-timeout", - str(connect_timeout), + f"{connect_timeout}", "--max-time", - str(max_time), + f"{max_time}", "--user-agent", - user_agent, + f"{user_agent}", self.url, ], stdout=open(os.devnull, "w"),