From 9f6826f8bf0e6dd366e8021af32530dadafee199 Mon Sep 17 00:00:00 2001 From: Alexander Moses Date: Mon, 17 Nov 2025 08:34:09 +0000 Subject: [PATCH 1/2] Bug 1387418: Handle empty push resultset from hg This patch parses the hg push resultset and handles cases where pushes are not present by raising an appropriate error and logging the error correctly. --- tests/etl/test_push_loader.py | 16 ++++++++++++ treeherder/etl/push_loader.py | 46 ++++++++++++++++++++++++++++++++++- 2 files changed, 61 insertions(+), 1 deletion(-) diff --git a/tests/etl/test_push_loader.py b/tests/etl/test_push_loader.py index a6ce96df0ab..bdbe1608314 100644 --- a/tests/etl/test_push_loader.py +++ b/tests/etl/test_push_loader.py @@ -11,6 +11,7 @@ HgPushTransformer, PulsePushError, PushLoader, + ResultsetFetchError, ) from treeherder.model.models import Push @@ -218,3 +219,18 @@ def test_ingest_github_push_comma_separated_branches( "https://firefox-ci-tc.services.mozilla.com", ) assert Push.objects.count() == expected_pushes + + +def test_fetch_push_raises_on_empty_pushes(monkeypatch): + """Test that a ResultsetFetchError is raised when fetch_json returns a dict without 'pushes'""" + monkeypatch.setattr("treeherder.etl.push_loader.fetch_json", lambda url: {}) + transformer = HgPushTransformer( + { + "payload": { + "repo_url": "https://hg.mozilla.org/try", + "pushlog_pushes": [{"push_full_json_url": "http://example"}], + } + } + ) + with pytest.raises(ResultsetFetchError): + transformer.fetch_push("http://example", repository="try") diff --git a/treeherder/etl/push_loader.py b/treeherder/etl/push_loader.py index 05e3a67a5ae..08b2407c3ca 100644 --- a/treeherder/etl/push_loader.py +++ b/treeherder/etl/push_loader.py @@ -261,7 +261,45 @@ def fetch_push(self, url, repository, sha=None): logger.debug("fetching for %s %s", repository, url) # there will only ever be one, with this url - push = list(fetch_json(url)["pushes"].values())[0] + try: + data = fetch_json(url) + except Exception as e: + logger.exception("Failed fetching push JSON from %s for %s: %s", url, repository, e) + try: + newrelic.agent.record_custom_event( + "resultset_fetch_failure", + {"url": url, "repository": repository, "error": str(e)}, + ) + except Exception: + # NewRelic failures should not block ingestion + logger.debug("NewRelic event failed for fetch error") + raise ResultsetFetchError(f"Failed to fecth JSON from {url}: {e}") from e + + pushes = None + if isinstance(data, dict): + pushes = data.get("pushes") + + if not pushes or not isinstance(pushes, dict): + # Log deg info and raise error + data_keys = list(data.keys()) if isinstance(data, dict) else None + logger.warning( + f"Malformed or empty push JSON from {url} for {repository}: data-keys={data_keys}" + ) + try: + newrelic.agent.record_custom_event( + "resultset_fetch_malformed", + {"url": url, "repository": repository, "data_keys": data_keys}, + ) + except Exception: + logger.debug("NewRelic event failed for malformed data") + raise ResultsetFetchError(f"Malformed or empty 'pushes' for {url}") + + # Safely get the first push + try: + push = next(iter(pushes.values())) + except Exception as e: + logger.exception("Failed to extract first push from pushes for {respository} {url}:{e}") + raise ResultsetFetchError(f"Unable to extract push from 'pushes' for {url}: {e}") from e commits = [] # we only want to ingest the last 200 commits for each push, @@ -285,3 +323,9 @@ def fetch_push(self, url, repository, sha=None): class PulsePushError(ValueError): pass + + +class ResultsetFetchError(Exception): + """Raised when fetching or parsing a push resultset fails.""" + + pass From e3d3da75c5872301e5d5c0bf774340bb76a64bed Mon Sep 17 00:00:00 2001 From: Alexander Moses Date: Wed, 19 Nov 2025 07:13:21 +0000 Subject: [PATCH 2/2] Bug 1387418: Address review comments Made the below changes to push_loader.py * Used an f-string in line 267 to log exception * Corrected comment where we log data in warning * Used `NewRelic event failed for malformed Hg push data` in place of `NewRelic event failed for malformed data` * Inserted f-string declaration and corrected typo in it too * Renamed `ResultsetFetchError` to `HgPushFetchError` * Renamed `resultset_fetch_failure` to `hg_push_fetch_failure` * Corrected typo from `fecht` to `fetch` Made the below change to test_push_loader.py * Replace `ResultsetFetchError` with `HgPushFetchError` --- tests/etl/test_push_loader.py | 6 +++--- treeherder/etl/push_loader.py | 22 ++++++++++++---------- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/tests/etl/test_push_loader.py b/tests/etl/test_push_loader.py index bdbe1608314..e51a19c5b5d 100644 --- a/tests/etl/test_push_loader.py +++ b/tests/etl/test_push_loader.py @@ -8,10 +8,10 @@ from treeherder.etl.push_loader import ( GithubPullRequestTransformer, GithubPushTransformer, + HgPushFetchError, HgPushTransformer, PulsePushError, PushLoader, - ResultsetFetchError, ) from treeherder.model.models import Push @@ -222,7 +222,7 @@ def test_ingest_github_push_comma_separated_branches( def test_fetch_push_raises_on_empty_pushes(monkeypatch): - """Test that a ResultsetFetchError is raised when fetch_json returns a dict without 'pushes'""" + """Test that a HgPushFetchError is raised when fetch_json returns a dict without 'pushes'""" monkeypatch.setattr("treeherder.etl.push_loader.fetch_json", lambda url: {}) transformer = HgPushTransformer( { @@ -232,5 +232,5 @@ def test_fetch_push_raises_on_empty_pushes(monkeypatch): } } ) - with pytest.raises(ResultsetFetchError): + with pytest.raises(HgPushFetchError): transformer.fetch_push("http://example", repository="try") diff --git a/treeherder/etl/push_loader.py b/treeherder/etl/push_loader.py index 08b2407c3ca..168c73d08a0 100644 --- a/treeherder/etl/push_loader.py +++ b/treeherder/etl/push_loader.py @@ -264,42 +264,44 @@ def fetch_push(self, url, repository, sha=None): try: data = fetch_json(url) except Exception as e: - logger.exception("Failed fetching push JSON from %s for %s: %s", url, repository, e) + logger.exception(f"Failed fetching push JSON from {url} for {repository}: {e}") try: newrelic.agent.record_custom_event( - "resultset_fetch_failure", + "hg_push_fetch_failure", {"url": url, "repository": repository, "error": str(e)}, ) except Exception: # NewRelic failures should not block ingestion logger.debug("NewRelic event failed for fetch error") - raise ResultsetFetchError(f"Failed to fecth JSON from {url}: {e}") from e + raise HgPushFetchError(f"Failed to fetch JSON from {url}: {e}") from e pushes = None if isinstance(data, dict): pushes = data.get("pushes") if not pushes or not isinstance(pushes, dict): - # Log deg info and raise error + # Log data in warning and raise error data_keys = list(data.keys()) if isinstance(data, dict) else None logger.warning( f"Malformed or empty push JSON from {url} for {repository}: data-keys={data_keys}" ) try: newrelic.agent.record_custom_event( - "resultset_fetch_malformed", + "hg_push_fetch_malformed", {"url": url, "repository": repository, "data_keys": data_keys}, ) except Exception: - logger.debug("NewRelic event failed for malformed data") - raise ResultsetFetchError(f"Malformed or empty 'pushes' for {url}") + logger.debug("NewRelic event failed for malformed Hg push data") + raise HgPushFetchError(f"Malformed or empty 'pushes' for {url}") # Safely get the first push try: push = next(iter(pushes.values())) except Exception as e: - logger.exception("Failed to extract first push from pushes for {respository} {url}:{e}") - raise ResultsetFetchError(f"Unable to extract push from 'pushes' for {url}: {e}") from e + logger.exception( + f"Failed to extract first push from pushes for {repository} {url}: {e}" + ) + raise HgPushFetchError(f"Unable to extract push from 'pushes' for {url}: {e}") from e commits = [] # we only want to ingest the last 200 commits for each push, @@ -325,7 +327,7 @@ class PulsePushError(ValueError): pass -class ResultsetFetchError(Exception): +class HgPushFetchError(Exception): """Raised when fetching or parsing a push resultset fails.""" pass