From ac8053ca4bdd65079940ab85f5dcf99412f28cab Mon Sep 17 00:00:00 2001 From: Denis Averin <59285247+Denis-Averin@users.noreply.github.com> Date: Thu, 16 Oct 2025 16:04:09 +0700 Subject: [PATCH 1/6] Split into files --- scripts/check-urls.py | 149 ++++++++-------------------------------- scripts/curl_wrapper.py | 86 +++++++++++++++++++++++ scripts/url_checker.py | 82 ++++++++++++++++++++++ 3 files changed, 196 insertions(+), 121 deletions(-) create mode 100644 scripts/curl_wrapper.py create mode 100644 scripts/url_checker.py diff --git a/scripts/check-urls.py b/scripts/check-urls.py index 846b6ee..421745b 100644 --- a/scripts/check-urls.py +++ b/scripts/check-urls.py @@ -1,17 +1,18 @@ import contextlib import fileinput +import signal import os import re -import subprocess import sys import threading -import time import typing import urllib.parse -from queue import Queue, Empty from github_job_summary import JobSummary from subdomains import Subdomains +from curl_wrapper import EXIT_CODES as CURL_EXIT_CODES +from curl_wrapper import CurlWrapper +from url_checker import UrlChecker """ Read file names from stdin (feed from git ls-files) @@ -20,33 +21,16 @@ Check them with CURL """ -# To avoid 403 responses -USER_AGENT = "Googlebot/2.1 (+http://www.google.com/bot.html)" - -CONNECT_TIMEOUT_SEC = 5 -MAX_TIME_SEC = 10 JOIN_TIMEOUT_SEC = 120 - -class Curl: - """ - See: https://curl.se/libcurl/c/libcurl-errors.html - """ - - CURL_STDERR_HTTP_RE = re.compile(r"^curl: \(22\) The requested URL returned error: (?P\d+)") - OK = 0 - COULDNT_RESOLVE_HOST = 6 - HTTP_RETURNED_ERROR = 22 - - CURL_EXIT_CODES_AND_HTTP_CODES = { - "https://api.aspose.cloud/connect/token": (Curl.HTTP_RETURNED_ERROR, 400), - "https://api.aspose.cloud/v3.0": (Curl.HTTP_RETURNED_ERROR, 404), - "https://api.aspose.cloud/v4.0": (Curl.HTTP_RETURNED_ERROR, 404), - "https://api.aspose.cloud/v4.0/": (Curl.HTTP_RETURNED_ERROR, 404), - "https://id.aspose.cloud/connect/token": (Curl.HTTP_RETURNED_ERROR, 400), + "https://api.aspose.cloud/connect/token": (CURL_EXIT_CODES.HTTP_RETURNED_ERROR, 400), + "https://api.aspose.cloud/v3.0": (CURL_EXIT_CODES.HTTP_RETURNED_ERROR, 404), + "https://api.aspose.cloud/v4.0": (CURL_EXIT_CODES.HTTP_RETURNED_ERROR, 404), + "https://api.aspose.cloud/v4.0/": (CURL_EXIT_CODES.HTTP_RETURNED_ERROR, 404), + "https://id.aspose.cloud/connect/token": (CURL_EXIT_CODES.HTTP_RETURNED_ERROR, 400), # TODO: Temporary fix - "https://dashboard.aspose.cloud/applications": (Curl.HTTP_RETURNED_ERROR, 404), + "https://dashboard.aspose.cloud/applications": (CURL_EXIT_CODES.HTTP_RETURNED_ERROR, 404), } REGEX_TO_IGNORE: list[re.Pattern[str]] = [ @@ -170,59 +154,7 @@ def text_extractor(files: list[str]) -> typing.Generator[tuple[str, str], None, raise -class Task: - _proc: subprocess.Popen[bytes] - _stderr: str | None - - def __init__(self, url: str): - self.url = url - self._proc = subprocess.Popen( - [ - "curl", - "-sSf", - "--output", - "-", - "--connect-timeout", - str(CONNECT_TIMEOUT_SEC), - "--max-time", - str(MAX_TIME_SEC), - "--user-agent", - USER_AGENT, - self.url, - ], - stdout=open(os.devnull, "w"), - stderr=subprocess.PIPE, - ) - self._stderr = None - self._started = time.time() - - @property - def running(self) -> bool: - return self._proc.poll() is None - - @property - def ret_code(self) -> int: - assert not self.running - return self._proc.returncode - - @property - def stderr(self) -> str: - assert not self.running - if self._stderr is None: - self._stderr = self._proc.stderr.read().decode() - return self._stderr - - 
@property - def age(self) -> float: - return time.time() - self._started - - -def create_new_task(url: str) -> Task: - # print("Create task:", url) - return Task(url) - - -def process_finished_task(task: Task) -> None: +def process_finished_task(task) -> None: # print("Finish task:", task.url) expected_ret_code, expected_http_code = CURL_EXIT_CODES_AND_HTTP_CODES.get(task.url, (0, None)) if task.ret_code == 0 or task.ret_code == expected_ret_code: @@ -230,9 +162,9 @@ def process_finished_task(task: Task) -> None: JOB_SUMMARY.add_success(task.url) return - if task.ret_code == Curl.HTTP_RETURNED_ERROR and expected_http_code: + if task.ret_code == CURL_EXIT_CODES.HTTP_RETURNED_ERROR and expected_http_code: # Try parse stderr for HTTP code - match = Curl.CURL_STDERR_HTTP_RE.match(task.stderr) + match = CurlWrapper.CURL_STDERR_HTTP_RE.match(task.stderr) assert match, "Unexpected output: %s" % task.stderr http_code = int(match.groupdict()["http_code"]) if http_code == expected_http_code: @@ -247,56 +179,31 @@ def process_finished_task(task: Task) -> None: JOB_SUMMARY.add_error(f"Broken URL '{task.url}': {task.stderr}Files: {EXTRACTED_URLS_WITH_FILES[task.url]}") -WORKER_QUEUE: Queue[str | None] = Queue() - - -def url_checker(num_workers: int = 8) -> None: - next_report_age_sec = 5 - workers: list[Task | None] = [None for _ in range(num_workers)] - - queue_is_empty = False - - while not queue_is_empty or any(workers): - for i, task in enumerate(workers): - if task is None: - continue - if not task.running: - process_finished_task(task) - workers[i] = None - elif task.age > next_report_age_sec: - print("Long request: '%s' %.2fs" % (task.url, task.age)) - next_report_age_sec += 3 - - if not queue_is_empty: - for i in (i for (i, w) in enumerate(workers) if w is None): - # Avoid blocking forever if the queue is currently empty - try: - item = WORKER_QUEUE.get_nowait() - except Empty: - break - if item is None: - queue_is_empty = True - print("--- url queue is over ---") - break - url = item - workers[i] = create_new_task(url) - time.sleep(0.2) - print("Worker finished") - - JOB_SUMMARY = JobSummary(os.environ.get("GITHUB_STEP_SUMMARY", "step_summary.md")) JOB_SUMMARY.add_header("Test all URLs") def main(files: list[str]) -> int: - checker = threading.Thread(target=url_checker, daemon=True) + url_checker = UrlChecker( + on_finish=process_finished_task, + ) + + # Setup signal handlers for graceful shutdown + def _handle_signal(_sig: int, _frame: typing.Any) -> None: + url_checker.stop() + + with contextlib.suppress(Exception): + signal.signal(signal.SIGINT, _handle_signal) + signal.signal(signal.SIGTERM, _handle_signal) + + checker = threading.Thread(target=url_checker.run, daemon=True) checker.start() for filename, text in text_extractor(files): for url in url_extractor(text, filename): # print("In:", url) - WORKER_QUEUE.put_nowait(url) - WORKER_QUEUE.put_nowait(None) + url_checker.add_url(url) + url_checker.close() checker.join(timeout=JOIN_TIMEOUT_SEC) if checker.is_alive(): print( diff --git a/scripts/curl_wrapper.py b/scripts/curl_wrapper.py new file mode 100644 index 0000000..336a093 --- /dev/null +++ b/scripts/curl_wrapper.py @@ -0,0 +1,86 @@ +import contextlib +import os +import re +import subprocess +import time +from typing import Optional + +# To avoid 403 responses (default); caller may override per instance +DEFAULT_USER_AGENT = "Googlebot/2.1 (+http://www.google.com/bot.html)" + + +class EXIT_CODES: + OK = 0 + COULDNT_RESOLVE_HOST = 6 + HTTP_RETURNED_ERROR = 22 + + +class CurlWrapper: + 
""" + Encapsulates a single curl execution with timeouts and helpers. + See: https://curl.se/libcurl/c/libcurl-errors.html + """ + + CURL_STDERR_HTTP_RE = re.compile(r"^curl: \(22\) The requested URL returned error: (?P\d+)") + + def __init__( + self, + url: str, + *, + user_agent: str = DEFAULT_USER_AGENT, + connect_timeout: int = 5, + max_time: int = 10, + ) -> None: + self.url = url + self._stderr: Optional[str] = None + self._started = time.time() + self._proc = subprocess.Popen( + [ + "curl", + "-sSf", + "--output", + "-", + "--connect-timeout", + str(connect_timeout), + "--max-time", + str(max_time), + "--user-agent", + user_agent, + self.url, + ], + stdout=open(os.devnull, "w"), + stderr=subprocess.PIPE, + ) + + @property + def running(self) -> bool: + return self._proc.poll() is None + + @property + def ret_code(self) -> int: + assert not self.running + return self._proc.returncode + + @property + def stderr(self) -> str: + assert not self.running + if self._stderr is None: + assert self._proc.stderr is not None + self._stderr = self._proc.stderr.read().decode() + return self._stderr + + @property + def age(self) -> float: + return time.time() - self._started + + def terminate(self, timeout: float | None = None) -> None: + try: + self._proc.terminate() + if timeout is not None: + self._proc.wait(timeout=timeout) + except Exception: + pass + + def kill(self) -> None: + with contextlib.suppress(Exception): + self._proc.kill() diff --git a/scripts/url_checker.py b/scripts/url_checker.py new file mode 100644 index 0000000..559e0fe --- /dev/null +++ b/scripts/url_checker.py @@ -0,0 +1,82 @@ +import contextlib +import time +from queue import Queue, Empty +from typing import Callable, Optional + +from curl_wrapper import CurlWrapper + + +class UrlChecker: + def __init__( + self, + *, + num_workers: int = 8, + hard_kill_sec: int = 15, + on_finish: Optional[Callable[[CurlWrapper], None]] = None, + worker_factory: Optional[Callable[[str], CurlWrapper]] = None, + ) -> None: + self.num_workers = num_workers + self.hard_kill_sec = hard_kill_sec + self.on_finish = on_finish + self.worker_factory = worker_factory or (lambda url: CurlWrapper(url)) + + self.queue: Queue[str | None] = Queue() + self.workers: list[CurlWrapper | None] = [None for _ in range(self.num_workers)] + self.stop_event = False + self.next_report_age_sec = 5 + + def add_url(self, url: str) -> None: + self.queue.put_nowait(url) + + def close(self) -> None: + self.queue.put_nowait(None) + + def stop(self) -> None: + self.stop_event = True + with contextlib.suppress(Exception): + self.queue.put_nowait(None) + + def run(self) -> None: + queue_is_empty = False + while not queue_is_empty or any(self.workers): + # Graceful stop: cancel running curls + if self.stop_event: + queue_is_empty = True + for t in self.workers: + if t is not None and t.running: + t.terminate(timeout=1) + if t.running: + t.kill() + + # Tick workers + for i, task in enumerate(self.workers): + if task is None: + continue + if not task.running: + if self.on_finish is not None: + self.on_finish(task) + self.workers[i] = None + elif task.age > self.next_report_age_sec: + print("Long request: '%s' %.2fs" % (task.url, task.age)) + self.next_report_age_sec += 3 + if task.age > self.hard_kill_sec: + task.terminate(timeout=2) + if task.running: + task.kill() + print("Killed long request: '%s' %.2fs" % (task.url, task.age)) + + # Fill idle workers + if not queue_is_empty: + for i in (i for (i, w) in enumerate(self.workers) if w is None): + try: + item = 
self.queue.get_nowait() + except Empty: + break + if item is None: + queue_is_empty = True + print("--- url queue is over ---") + break + url = item + self.workers[i] = self.worker_factory(url) + time.sleep(0.2) + print("Worker finished") From 6a4f576a6982b6692adac9071f613a59159bf3c6 Mon Sep 17 00:00:00 2001 From: Denis Averin <59285247+Denis-Averin@users.noreply.github.com> Date: Thu, 16 Oct 2025 18:23:20 +0700 Subject: [PATCH 2/6] Refactoring --- scripts/check-urls.py | 52 +++++++++++--------------------- scripts/url_checker.py | 67 ++++++++++++++++++++++++++++++++++++++---- 2 files changed, 79 insertions(+), 40 deletions(-) diff --git a/scripts/check-urls.py b/scripts/check-urls.py index 421745b..96bde0b 100644 --- a/scripts/check-urls.py +++ b/scripts/check-urls.py @@ -11,7 +11,6 @@ from github_job_summary import JobSummary from subdomains import Subdomains from curl_wrapper import EXIT_CODES as CURL_EXIT_CODES -from curl_wrapper import CurlWrapper from url_checker import UrlChecker """ @@ -21,9 +20,9 @@ Check them with CURL """ -JOIN_TIMEOUT_SEC = 120 +JOIN_TIMEOUT_SEC: int = 120 -CURL_EXIT_CODES_AND_HTTP_CODES = { +CURL_EXIT_CODES_AND_HTTP_CODES: dict[str, tuple[int, int | None]] = { "https://api.aspose.cloud/connect/token": (CURL_EXIT_CODES.HTTP_RETURNED_ERROR, 400), "https://api.aspose.cloud/v3.0": (CURL_EXIT_CODES.HTTP_RETURNED_ERROR, 404), "https://api.aspose.cloud/v4.0": (CURL_EXIT_CODES.HTTP_RETURNED_ERROR, 404), @@ -45,7 +44,7 @@ ] ) -IGNORE_DOMAINS = Subdomains( +IGNORE_DOMAINS: Subdomains = Subdomains( [ ".android.com", ".apache.org", @@ -82,10 +81,10 @@ ] ) -URL_END_CHARS = r",#\)\"'<>\*\s\\" -URL_RE_PATTERN = r"(https*://[^{0}]+)[{0}]?".format(URL_END_CHARS) +URL_END_CHARS: str = r",#\)\"'<>\*\s\\" +URL_RE_PATTERN: str = r"(https*://[^{0}]+)[{0}]?".format(URL_END_CHARS) # print(URL_RE_PATTERN) -EXTRACT_URL_REGEX = re.compile(URL_RE_PATTERN, re.MULTILINE) +EXTRACT_URL_REGEX: re.Pattern[str] = re.compile(URL_RE_PATTERN, re.MULTILINE) # URL : [Files] EXTRACTED_URLS_WITH_FILES: dict[str, list[str]] = {k: [] for k in URLS_TO_IGNORE} @@ -129,7 +128,7 @@ def url_extractor(text: str, filename: str) -> typing.Generator[str, None, None] EXTRACTED_URLS_WITH_FILES[url].append(filename) -FILES_TO_IGNORE = frozenset( +FILES_TO_IGNORE: frozenset[str] = frozenset( [ ".jar", ".jar", @@ -154,38 +153,13 @@ def text_extractor(files: list[str]) -> typing.Generator[tuple[str, str], None, raise -def process_finished_task(task) -> None: - # print("Finish task:", task.url) - expected_ret_code, expected_http_code = CURL_EXIT_CODES_AND_HTTP_CODES.get(task.url, (0, None)) - if task.ret_code == 0 or task.ret_code == expected_ret_code: - print("OK:", "'%s' %.2fs" % (task.url, task.age)) - JOB_SUMMARY.add_success(task.url) - return - - if task.ret_code == CURL_EXIT_CODES.HTTP_RETURNED_ERROR and expected_http_code: - # Try parse stderr for HTTP code - match = CurlWrapper.CURL_STDERR_HTTP_RE.match(task.stderr) - assert match, "Unexpected output: %s" % task.stderr - http_code = int(match.groupdict()["http_code"]) - if http_code == expected_http_code: - print("OK HTTP:", "'%s' %.2fs" % (task.url, task.age)) - JOB_SUMMARY.add_success(task.url) - return - - print( - "Expected %d got %d for '%s': %s" % (expected_ret_code, task.ret_code, task.url, task.stderr), - file=sys.stderr, - ) - JOB_SUMMARY.add_error(f"Broken URL '{task.url}': {task.stderr}Files: {EXTRACTED_URLS_WITH_FILES[task.url]}") - - -JOB_SUMMARY = JobSummary(os.environ.get("GITHUB_STEP_SUMMARY", "step_summary.md")) +JOB_SUMMARY: JobSummary = 
JobSummary(os.environ.get("GITHUB_STEP_SUMMARY", "step_summary.md")) JOB_SUMMARY.add_header("Test all URLs") def main(files: list[str]) -> int: url_checker = UrlChecker( - on_finish=process_finished_task, + expectations=CURL_EXIT_CODES_AND_HTTP_CODES, ) # Setup signal handlers for graceful shutdown @@ -212,6 +186,14 @@ def _handle_signal(_sig: int, _frame: typing.Any) -> None: flush=True, ) + # Collect results and write summary + for res in url_checker.results: + if res.ok: + JOB_SUMMARY.add_success(res.url) + else: + files = EXTRACTED_URLS_WITH_FILES.get(res.url, []) + JOB_SUMMARY.add_error(f"Broken URL '{res.url}': {res.stderr}Files: {files}") + JOB_SUMMARY.finalize("Checked {total} failed **{failed}**\nGood={success}") if JOB_SUMMARY.has_errors: print(JOB_SUMMARY, file=sys.stderr, flush=True) diff --git a/scripts/url_checker.py b/scripts/url_checker.py index 559e0fe..22ef367 100644 --- a/scripts/url_checker.py +++ b/scripts/url_checker.py @@ -1,9 +1,23 @@ import contextlib +import sys import time +from dataclasses import dataclass from queue import Queue, Empty from typing import Callable, Optional -from curl_wrapper import CurlWrapper +from curl_wrapper import CurlWrapper, EXIT_CODES + + +@dataclass +class CheckResult: + url: str + ok: bool + ret_code: int + age: float + stderr: str + expected_ret_code: int + expected_http_code: int | None + http_code: int | None class UrlChecker: @@ -12,18 +26,19 @@ def __init__( *, num_workers: int = 8, hard_kill_sec: int = 15, - on_finish: Optional[Callable[[CurlWrapper], None]] = None, + expectations: dict[str, tuple[int, int | None]] | None = None, worker_factory: Optional[Callable[[str], CurlWrapper]] = None, ) -> None: self.num_workers = num_workers self.hard_kill_sec = hard_kill_sec - self.on_finish = on_finish + self.expectations = expectations or {} self.worker_factory = worker_factory or (lambda url: CurlWrapper(url)) self.queue: Queue[str | None] = Queue() self.workers: list[CurlWrapper | None] = [None for _ in range(self.num_workers)] self.stop_event = False self.next_report_age_sec = 5 + self.results: list[CheckResult] = [] def add_url(self, url: str) -> None: self.queue.put_nowait(url) @@ -53,8 +68,7 @@ def run(self) -> None: if task is None: continue if not task.running: - if self.on_finish is not None: - self.on_finish(task) + self._process_finished(task) self.workers[i] = None elif task.age > self.next_report_age_sec: print("Long request: '%s' %.2fs" % (task.url, task.age)) @@ -80,3 +94,46 @@ def run(self) -> None: self.workers[i] = self.worker_factory(url) time.sleep(0.2) print("Worker finished") + + def _process_finished(self, task: CurlWrapper) -> None: + expected_ret_code, expected_http_code = self.expectations.get(task.url, (0, None)) + + ok: bool = False + http_code_val: int | None = None + stderr_out: str = task.stderr + + # Fast path: exact expected ret code or success + if task.ret_code == 0 or task.ret_code == expected_ret_code: + print("OK:", "'%s' %.2fs" % (task.url, task.age)) + ok = True + stderr_out = "" + else: + # If curl reports HTTP error (22), attempt to parse HTTP code to compare + if task.ret_code == EXIT_CODES.HTTP_RETURNED_ERROR and expected_http_code: + match = CurlWrapper.CURL_STDERR_HTTP_RE.match(task.stderr) + assert match, "Unexpected output: %s" % task.stderr + http_code_val = int(match.groupdict()["http_code"]) + if http_code_val == expected_http_code: + print("OK HTTP:", "'%s' %.2fs" % (task.url, task.age)) + ok = True + + if not ok: + # Otherwise, report error + print( + "Expected %d got %d for '%s': 
%s" % (expected_ret_code, task.ret_code, task.url, task.stderr), + file=sys.stderr, + ) + + # Append exactly once + self.results.append( + CheckResult( + url=task.url, + ok=ok, + ret_code=task.ret_code, + age=task.age, + stderr=stderr_out, + expected_ret_code=expected_ret_code, + expected_http_code=expected_http_code, + http_code=http_code_val, + ) + ) From fed631aa86b9fa42d4d8f4031884318542138824 Mon Sep 17 00:00:00 2001 From: Denis Averin <59285247+Denis-Averin@users.noreply.github.com> Date: Thu, 16 Oct 2025 19:24:19 +0700 Subject: [PATCH 3/6] Using context --- scripts/check-urls.py | 47 +++++++++++++-------------------------- scripts/url_checker.py | 50 ++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 63 insertions(+), 34 deletions(-) diff --git a/scripts/check-urls.py b/scripts/check-urls.py index 96bde0b..cac15c1 100644 --- a/scripts/check-urls.py +++ b/scripts/check-urls.py @@ -20,7 +20,7 @@ Check them with CURL """ -JOIN_TIMEOUT_SEC: int = 120 +JOIN_TIMEOUT_SEC = 120 CURL_EXIT_CODES_AND_HTTP_CODES: dict[str, tuple[int, int | None]] = { "https://api.aspose.cloud/connect/token": (CURL_EXIT_CODES.HTTP_RETURNED_ERROR, 400), @@ -36,7 +36,7 @@ re.compile(r"^https://github\.com/(?P[^/]+)/(?P[^/]+)/(?:blob|issues)/\S+$"), ] -URLS_TO_IGNORE: frozenset[str] = frozenset( +URLS_TO_IGNORE = frozenset( [ "https://api.aspose.cloud", "https://www.aspose.cloud/404", @@ -44,7 +44,7 @@ ] ) -IGNORE_DOMAINS: Subdomains = Subdomains( +IGNORE_DOMAINS = Subdomains( [ ".android.com", ".apache.org", @@ -81,10 +81,10 @@ ] ) -URL_END_CHARS: str = r",#\)\"'<>\*\s\\" -URL_RE_PATTERN: str = r"(https*://[^{0}]+)[{0}]?".format(URL_END_CHARS) +URL_END_CHARS = r",#\)\"'<>\*\s\\" +URL_RE_PATTERN = r"(https*://[^{0}]+)[{0}]?".format(URL_END_CHARS) # print(URL_RE_PATTERN) -EXTRACT_URL_REGEX: re.Pattern[str] = re.compile(URL_RE_PATTERN, re.MULTILINE) +EXTRACT_URL_REGEX = re.compile(URL_RE_PATTERN, re.MULTILINE) # URL : [Files] EXTRACTED_URLS_WITH_FILES: dict[str, list[str]] = {k: [] for k in URLS_TO_IGNORE} @@ -128,7 +128,7 @@ def url_extractor(text: str, filename: str) -> typing.Generator[str, None, None] EXTRACTED_URLS_WITH_FILES[url].append(filename) -FILES_TO_IGNORE: frozenset[str] = frozenset( +FILES_TO_IGNORE = frozenset( [ ".jar", ".jar", @@ -153,7 +153,7 @@ def text_extractor(files: list[str]) -> typing.Generator[tuple[str, str], None, raise -JOB_SUMMARY: JobSummary = JobSummary(os.environ.get("GITHUB_STEP_SUMMARY", "step_summary.md")) +JOB_SUMMARY = JobSummary(os.environ.get("GITHUB_STEP_SUMMARY", "step_summary.md")) JOB_SUMMARY.add_header("Test all URLs") @@ -162,32 +162,15 @@ def main(files: list[str]) -> int: expectations=CURL_EXIT_CODES_AND_HTTP_CODES, ) - # Setup signal handlers for graceful shutdown - def _handle_signal(_sig: int, _frame: typing.Any) -> None: - url_checker.stop() - - with contextlib.suppress(Exception): - signal.signal(signal.SIGINT, _handle_signal) - signal.signal(signal.SIGTERM, _handle_signal) - - checker = threading.Thread(target=url_checker.run, daemon=True) - checker.start() - - for filename, text in text_extractor(files): - for url in url_extractor(text, filename): - # print("In:", url) - url_checker.add_url(url) - url_checker.close() - checker.join(timeout=JOIN_TIMEOUT_SEC) - if checker.is_alive(): - print( - f"URL checker did not finish within {JOIN_TIMEOUT_SEC}s; exiting early.", - file=sys.stderr, - flush=True, - ) + with url_checker.start() as checker: + for filename, text in text_extractor(files): + for url in url_extractor(text, filename): + checker.add_url(url) + 
checker.wait(JOIN_TIMEOUT_SEC) + results = url_checker.results # Collect results and write summary - for res in url_checker.results: + for res in results: if res.ok: JOB_SUMMARY.add_success(res.url) else: diff --git a/scripts/url_checker.py b/scripts/url_checker.py index 22ef367..91cde7a 100644 --- a/scripts/url_checker.py +++ b/scripts/url_checker.py @@ -3,7 +3,9 @@ import time from dataclasses import dataclass from queue import Queue, Empty -from typing import Callable, Optional +from typing import Callable, Optional, Iterable +from types import TracebackType +import threading from curl_wrapper import CurlWrapper, EXIT_CODES @@ -39,12 +41,16 @@ def __init__( self.stop_event = False self.next_report_age_sec = 5 self.results: list[CheckResult] = [] + self._thread: threading.Thread | None = None + self._closed: bool = False def add_url(self, url: str) -> None: self.queue.put_nowait(url) def close(self) -> None: - self.queue.put_nowait(None) + if not self._closed: + self._closed = True + self.queue.put_nowait(None) def stop(self) -> None: self.stop_event = True @@ -95,6 +101,46 @@ def run(self) -> None: time.sleep(0.2) print("Worker finished") + # Context management and user-friendly API + def start(self) -> "UrlChecker": + if self._thread is not None: + return self + self._thread = threading.Thread(target=self.run, daemon=True) + self._thread.start() + return self + + def __enter__(self) -> "UrlChecker": + return self.start() + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc: BaseException | None, + tb: TracebackType | None, + ) -> None: + # Ensure we signal end of input and wait for completion + self.close() + self.wait() + + def wait(self, join_timeout_sec: float | None = None) -> None: + # Ensure end-of-input signaled before waiting + self.close() + t = self._thread + if t is None: + return + if join_timeout_sec is not None: + t.join(timeout=join_timeout_sec) + if t.is_alive(): + # Try to stop gracefully and inform user + self.stop() + print( + f"URL checker did not finish within {join_timeout_sec}s; exiting early.", + file=sys.stderr, + flush=True, + ) + else: + t.join() + def _process_finished(self, task: CurlWrapper) -> None: expected_ret_code, expected_http_code = self.expectations.get(task.url, (0, None)) From 5351dd28d5e27b406ade5844d929b0369909a548 Mon Sep 17 00:00:00 2001 From: Denis Averin <59285247+Denis-Averin@users.noreply.github.com> Date: Thu, 16 Oct 2025 19:37:38 +0700 Subject: [PATCH 4/6] Renames --- scripts/check-urls.py | 24 +++++++++++------------- scripts/curl_wrapper.py | 7 +++++-- scripts/url_checker.py | 22 +++++++++++----------- 3 files changed, 27 insertions(+), 26 deletions(-) diff --git a/scripts/check-urls.py b/scripts/check-urls.py index cac15c1..00beed9 100644 --- a/scripts/check-urls.py +++ b/scripts/check-urls.py @@ -1,16 +1,14 @@ import contextlib import fileinput -import signal import os import re import sys -import threading import typing import urllib.parse from github_job_summary import JobSummary from subdomains import Subdomains -from curl_wrapper import EXIT_CODES as CURL_EXIT_CODES +from curl_wrapper import CurlExitCodes from url_checker import UrlChecker """ @@ -22,14 +20,14 @@ JOIN_TIMEOUT_SEC = 120 -CURL_EXIT_CODES_AND_HTTP_CODES: dict[str, tuple[int, int | None]] = { - "https://api.aspose.cloud/connect/token": (CURL_EXIT_CODES.HTTP_RETURNED_ERROR, 400), - "https://api.aspose.cloud/v3.0": (CURL_EXIT_CODES.HTTP_RETURNED_ERROR, 404), - "https://api.aspose.cloud/v4.0": (CURL_EXIT_CODES.HTTP_RETURNED_ERROR, 404), - 
"https://api.aspose.cloud/v4.0/": (CURL_EXIT_CODES.HTTP_RETURNED_ERROR, 404), - "https://id.aspose.cloud/connect/token": (CURL_EXIT_CODES.HTTP_RETURNED_ERROR, 400), +EXIT_CODE_EXPECTATIONS: dict[str, tuple[int, int | None]] = { + "https://api.aspose.cloud/connect/token": (CurlExitCodes.HTTP_RETURNED_ERROR, 400), + "https://api.aspose.cloud/v3.0": (CurlExitCodes.HTTP_RETURNED_ERROR, 404), + "https://api.aspose.cloud/v4.0": (CurlExitCodes.HTTP_RETURNED_ERROR, 404), + "https://api.aspose.cloud/v4.0/": (CurlExitCodes.HTTP_RETURNED_ERROR, 404), + "https://id.aspose.cloud/connect/token": (CurlExitCodes.HTTP_RETURNED_ERROR, 400), # TODO: Temporary fix - "https://dashboard.aspose.cloud/applications": (CURL_EXIT_CODES.HTTP_RETURNED_ERROR, 404), + "https://dashboard.aspose.cloud/applications": (CurlExitCodes.HTTP_RETURNED_ERROR, 404), } REGEX_TO_IGNORE: list[re.Pattern[str]] = [ @@ -159,7 +157,7 @@ def text_extractor(files: list[str]) -> typing.Generator[tuple[str, str], None, def main(files: list[str]) -> int: url_checker = UrlChecker( - expectations=CURL_EXIT_CODES_AND_HTTP_CODES, + expectations=EXIT_CODE_EXPECTATIONS, ) with url_checker.start() as checker: @@ -174,8 +172,8 @@ def main(files: list[str]) -> int: if res.ok: JOB_SUMMARY.add_success(res.url) else: - files = EXTRACTED_URLS_WITH_FILES.get(res.url, []) - JOB_SUMMARY.add_error(f"Broken URL '{res.url}': {res.stderr}Files: {files}") + src_files = EXTRACTED_URLS_WITH_FILES.get(res.url, []) + JOB_SUMMARY.add_error(f"Broken URL '{res.url}': {res.stderr}Files: {src_files}") JOB_SUMMARY.finalize("Checked {total} failed **{failed}**\nGood={success}") if JOB_SUMMARY.has_errors: diff --git a/scripts/curl_wrapper.py b/scripts/curl_wrapper.py index 336a093..a11734e 100644 --- a/scripts/curl_wrapper.py +++ b/scripts/curl_wrapper.py @@ -9,7 +9,11 @@ DEFAULT_USER_AGENT = "Googlebot/2.1 (+http://www.google.com/bot.html)" -class EXIT_CODES: +class CurlExitCodes: + """ + See: https://curl.se/libcurl/c/libcurl-errors.html + """ + OK = 0 COULDNT_RESOLVE_HOST = 6 HTTP_RETURNED_ERROR = 22 @@ -18,7 +22,6 @@ class EXIT_CODES: class CurlWrapper: """ Encapsulates a single curl execution with timeouts and helpers. 
- See: https://curl.se/libcurl/c/libcurl-errors.html """ CURL_STDERR_HTTP_RE = re.compile(r"^curl: \(22\) The requested URL returned error: (?P\d+)") diff --git a/scripts/url_checker.py b/scripts/url_checker.py index 91cde7a..697a453 100644 --- a/scripts/url_checker.py +++ b/scripts/url_checker.py @@ -1,13 +1,13 @@ import contextlib import sys +import threading import time from dataclasses import dataclass from queue import Queue, Empty -from typing import Callable, Optional, Iterable from types import TracebackType -import threading +from typing import Callable, Optional -from curl_wrapper import CurlWrapper, EXIT_CODES +from curl_wrapper import CurlWrapper, CurlExitCodes @dataclass @@ -47,17 +47,17 @@ def __init__( def add_url(self, url: str) -> None: self.queue.put_nowait(url) - def close(self) -> None: + def _close(self) -> None: if not self._closed: self._closed = True self.queue.put_nowait(None) - def stop(self) -> None: + def _stop(self) -> None: self.stop_event = True with contextlib.suppress(Exception): self.queue.put_nowait(None) - def run(self) -> None: + def _run(self) -> None: queue_is_empty = False while not queue_is_empty or any(self.workers): # Graceful stop: cancel running curls @@ -105,7 +105,7 @@ def run(self) -> None: def start(self) -> "UrlChecker": if self._thread is not None: return self - self._thread = threading.Thread(target=self.run, daemon=True) + self._thread = threading.Thread(target=self._run, daemon=True) self._thread.start() return self @@ -119,12 +119,12 @@ def __exit__( tb: TracebackType | None, ) -> None: # Ensure we signal end of input and wait for completion - self.close() + self._close() self.wait() def wait(self, join_timeout_sec: float | None = None) -> None: # Ensure end-of-input signaled before waiting - self.close() + self._close() t = self._thread if t is None: return @@ -132,7 +132,7 @@ def wait(self, join_timeout_sec: float | None = None) -> None: t.join(timeout=join_timeout_sec) if t.is_alive(): # Try to stop gracefully and inform user - self.stop() + self._stop() print( f"URL checker did not finish within {join_timeout_sec}s; exiting early.", file=sys.stderr, @@ -155,7 +155,7 @@ def _process_finished(self, task: CurlWrapper) -> None: stderr_out = "" else: # If curl reports HTTP error (22), attempt to parse HTTP code to compare - if task.ret_code == EXIT_CODES.HTTP_RETURNED_ERROR and expected_http_code: + if task.ret_code == CurlExitCodes.HTTP_RETURNED_ERROR and expected_http_code: match = CurlWrapper.CURL_STDERR_HTTP_RE.match(task.stderr) assert match, "Unexpected output: %s" % task.stderr http_code_val = int(match.groupdict()["http_code"]) From a5e34049189653b89bedc8433497e34e1dec1636 Mon Sep 17 00:00:00 2001 From: Denis Averin Date: Thu, 30 Oct 2025 18:46:49 +0700 Subject: [PATCH 5/6] Code cleanup --- scripts/check_all_urls.sh | 3 --- 1 file changed, 3 deletions(-) diff --git a/scripts/check_all_urls.sh b/scripts/check_all_urls.sh index c7b58e1..79f9f59 100755 --- a/scripts/check_all_urls.sh +++ b/scripts/check_all_urls.sh @@ -5,9 +5,6 @@ set -euo pipefail SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" ROOT_DIR="$( cd "${SCRIPT_DIR}/.." 
&> /dev/null && pwd )" -check_file () { - echo "$1" -} pushd "${ROOT_DIR}" git ls-files --recurse-submodules --exclude-standard --full-name | grep -v 'package-lock.json$' | python "${SCRIPT_DIR}/check-urls.py" popd From 8601d4d770917176aa1872f0188dc8938b0f7b65 Mon Sep 17 00:00:00 2001 From: Denis Averin Date: Thu, 30 Oct 2025 23:03:49 +0700 Subject: [PATCH 6/6] Added max-redirects --- scripts/curl_wrapper.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/scripts/curl_wrapper.py b/scripts/curl_wrapper.py index a11734e..95d01e7 100644 --- a/scripts/curl_wrapper.py +++ b/scripts/curl_wrapper.py @@ -33,6 +33,7 @@ def __init__( user_agent: str = DEFAULT_USER_AGENT, connect_timeout: int = 5, max_time: int = 10, + max_redirects: int = 3, ) -> None: self.url = url self._stderr: Optional[str] = None @@ -41,14 +42,20 @@ def __init__( [ "curl", "-sSf", + "-L", # follow redirects + "--max-redirs", + f"{max_redirects}", # limit number of redirects + # "--proto", "=https", # (optional) only allow https for the initial URL + "--proto-redir", + "=all,https", # only allow https after redirects; http will fail "--output", - "-", + "-", # discard body "--connect-timeout", - str(connect_timeout), + f"{connect_timeout}", "--max-time", - str(max_time), + f"{max_time}", "--user-agent", - user_agent, + f"{user_agent}", self.url, ], stdout=open(os.devnull, "w"),