Skip to content

Commit e0350cf

Browse files
authored
ref(similarity): Add option to skip processed and chosen projects in backfill (#75084)
Allow projects that have already been backfilled, and projects whose IDs are explicitly listed in skip_project_ids, to be skipped during the backfill.
1 parent f9a20be commit e0350cf

File tree

4 files changed

+215
-3
lines changed

4 files changed

+215
-3
lines changed

src/sentry/api/endpoints/project_backfill_similar_issues_embeddings_records.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ def post(self, request: Request, project) -> Response:
3434
last_processed_id = None
3535
only_delete = False
3636
enable_ingestion = False
37+
skip_processed_projects = False
38+
skip_project_ids = None
3739

3840
if request.data.get("last_processed_id"):
3941
last_processed_id = int(request.data["last_processed_id"])
@@ -42,12 +44,20 @@ def post(self, request: Request, project) -> Response:
4244
only_delete = True
4345

4446
if request.data.get("enable_ingestion"):
45-
enable_ingestion = request.data["enable_ingestion"] == "true"
47+
enable_ingestion = True
48+
49+
if request.data.get("skip_processed_projects"):
50+
skip_processed_projects = True
51+
52+
if request.data.get("skip_project_ids"):
53+
skip_project_ids = request.data["skip_project_ids"]
4654

4755
backfill_seer_grouping_records_for_project.delay(
4856
current_project_id=project.id,
4957
last_processed_group_id_input=last_processed_id,
5058
only_delete=only_delete,
5159
enable_ingestion=enable_ingestion,
60+
skip_processed_projects=skip_processed_projects,
61+
skip_project_ids=skip_project_ids,
5262
)
5363
return Response(status=204)

src/sentry/tasks/embeddings_grouping/backfill_seer_grouping_records_for_project.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@ def backfill_seer_grouping_records_for_project(
4545
last_processed_project_index_input: int | None = None,
4646
only_delete: bool = False,
4747
enable_ingestion: bool = False,
48+
skip_processed_projects: bool = False,
49+
skip_project_ids: list[int] | None = None,
4850
*args: Any,
4951
**kwargs: Any,
5052
) -> None:
@@ -99,6 +101,30 @@ def backfill_seer_grouping_records_for_project(
99101
)
100102
return
101103

104+
is_project_processed = (
105+
skip_processed_projects
106+
and project.get_option("sentry:similarity_backfill_completed") is not None
107+
)
108+
is_project_skipped = skip_project_ids and project.id in skip_project_ids
109+
if is_project_processed or is_project_skipped:
110+
logger.info(
111+
"backfill_seer_grouping_records.project_skipped",
112+
extra={
113+
"project_id": current_project_id,
114+
"project_already_processed": is_project_processed,
115+
"project_manually_skipped": is_project_skipped,
116+
},
117+
)
118+
call_next_backfill(
119+
last_processed_group_id=None,
120+
project_id=current_project_id,
121+
last_processed_project_index=last_processed_project_index,
122+
cohort=cohort,
123+
only_delete=only_delete,
124+
enable_ingestion=enable_ingestion,
125+
)
126+
return
127+
102128
if only_delete:
103129
delete_seer_grouping_records(current_project_id)
104130
logger.info(

tests/sentry/api/endpoints/test_project_backfill_similar_issues_embeddings_records.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ def test_post_success_no_last_processed_id(
5151
last_processed_group_id_input=None,
5252
only_delete=False,
5353
enable_ingestion=False,
54+
skip_processed_projects=False,
55+
skip_project_ids=None,
5456
)
5557

5658
@patch(
@@ -68,6 +70,8 @@ def test_post_success_no_last_processed_id_single_org(
6870
last_processed_group_id_input=None,
6971
only_delete=False,
7072
enable_ingestion=False,
73+
skip_processed_projects=False,
74+
skip_project_ids=None,
7175
)
7276

7377
@patch(
@@ -88,6 +92,8 @@ def test_post_success_last_processed_id(
8892
last_processed_group_id_input=8,
8993
only_delete=False,
9094
enable_ingestion=False,
95+
skip_processed_projects=False,
96+
skip_project_ids=None,
9197
)
9298

9399
@patch(
@@ -110,6 +116,8 @@ def test_post_success_only_delete(
110116
last_processed_group_id_input=8,
111117
only_delete=True,
112118
enable_ingestion=False,
119+
skip_processed_projects=False,
120+
skip_project_ids=None,
113121
)
114122

115123
@patch(
@@ -132,4 +140,54 @@ def test_post_success_enable_ingestion(
132140
last_processed_group_id_input=8,
133141
only_delete=False,
134142
enable_ingestion=True,
143+
skip_processed_projects=False,
144+
skip_project_ids=None,
145+
)
146+
147+
@patch(
148+
"sentry.api.endpoints.project_backfill_similar_issues_embeddings_records.is_active_superuser",
149+
return_value=True,
150+
)
151+
@patch(
152+
"sentry.api.endpoints.project_backfill_similar_issues_embeddings_records.backfill_seer_grouping_records_for_project.delay"
153+
)
154+
@with_feature("projects:similarity-embeddings-backfill")
155+
def test_post_success_skip_processed_projects(
156+
self, mock_backfill_seer_grouping_records, mock_is_active_superuser
157+
):
158+
response = self.client.post(
159+
self.url, data={"last_processed_id": "8", "skip_processed_projects": "true"}
160+
)
161+
assert response.status_code == 204, response.content
162+
mock_backfill_seer_grouping_records.assert_called_with(
163+
current_project_id=self.project.id,
164+
last_processed_group_id_input=8,
165+
only_delete=False,
166+
enable_ingestion=False,
167+
skip_processed_projects=True,
168+
skip_project_ids=None,
169+
)
170+
171+
@patch(
172+
"sentry.api.endpoints.project_backfill_similar_issues_embeddings_records.is_active_superuser",
173+
return_value=True,
174+
)
175+
@patch(
176+
"sentry.api.endpoints.project_backfill_similar_issues_embeddings_records.backfill_seer_grouping_records_for_project.delay"
177+
)
178+
@with_feature("projects:similarity-embeddings-backfill")
179+
def test_post_success_skip_project_ids(
180+
self, mock_backfill_seer_grouping_records, mock_is_active_superuser
181+
):
182+
response = self.client.post(
183+
self.url, data={"last_processed_id": "8", "skip_project_ids": [1]}
184+
)
185+
assert response.status_code == 204, response.content
186+
mock_backfill_seer_grouping_records.assert_called_with(
187+
current_project_id=self.project.id,
188+
last_processed_group_id_input=8,
189+
only_delete=False,
190+
enable_ingestion=False,
191+
skip_processed_projects=False,
192+
skip_project_ids=[1],
135193
)

tests/sentry/tasks/test_backfill_seer_grouping_records.py

Lines changed: 120 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import copy
2+
import time
23
from collections.abc import Mapping
34
from datetime import UTC, datetime, timedelta
45
from random import choice
@@ -1519,10 +1520,9 @@ def test_backfill_seer_grouping_records_enable_ingestion(
15191520
assert self.project.get_option("sentry:similarity_backfill_completed") is not None
15201521

15211522
@with_feature("projects:similarity-embeddings-backfill")
1522-
@patch("sentry.tasks.embeddings_grouping.utils.logger")
15231523
@patch("sentry.tasks.embeddings_grouping.utils.post_bulk_grouping_records")
15241524
def test_backfill_seer_grouping_records_no_enable_ingestion(
1525-
self, mock_post_bulk_grouping_records, mock_logger
1525+
self, mock_post_bulk_grouping_records
15261526
):
15271527
"""
15281528
Test that when the enable_ingestion flag is False, the project option is not set.
@@ -1540,3 +1540,121 @@ def test_backfill_seer_grouping_records_no_enable_ingestion(
15401540
}
15411541

15421542
assert self.project.get_option("sentry:similarity_backfill_completed") is None
1543+
1544+
@with_feature("projects:similarity-embeddings-backfill")
1545+
@patch("sentry.tasks.embeddings_grouping.backfill_seer_grouping_records_for_project.logger")
1546+
def test_backfill_seer_grouping_records_skip_project_already_processed(self, mock_logger):
1547+
"""
1548+
Test that projects that have a backfill completed project option are skipped when passed
1549+
the skip_processed_projects flag.
1550+
"""
1551+
self.project.update_option("sentry:similarity_backfill_completed", int(time.time()))
1552+
with TaskRunner():
1553+
backfill_seer_grouping_records_for_project(
1554+
self.project.id, None, skip_processed_projects=True
1555+
)
1556+
1557+
expected_call_args_list = [
1558+
call(
1559+
"backfill_seer_grouping_records",
1560+
extra={
1561+
"current_project_id": self.project.id,
1562+
"last_processed_group_id": None,
1563+
"cohort": None,
1564+
"last_processed_project_index": None,
1565+
"only_delete": False,
1566+
},
1567+
),
1568+
call(
1569+
"backfill_seer_grouping_records.project_skipped",
1570+
extra={
1571+
"project_id": self.project.id,
1572+
"project_already_processed": True,
1573+
"project_manually_skipped": None,
1574+
},
1575+
),
1576+
call("backfill finished, no cohort", extra={"project_id": self.project.id}),
1577+
]
1578+
assert mock_logger.info.call_args_list == expected_call_args_list
1579+
1580+
@with_feature("projects:similarity-embeddings-backfill")
1581+
@patch("sentry.tasks.embeddings_grouping.backfill_seer_grouping_records_for_project.logger")
1582+
@patch("sentry.tasks.embeddings_grouping.utils.post_bulk_grouping_records")
1583+
def test_backfill_seer_grouping_records_reprocess_project_already_processed(
1584+
self, mock_post_bulk_grouping_records, mock_logger
1585+
):
1586+
"""
1587+
Test that projects that have a backfill completed project option are not skipped when not
1588+
passed the skip_processed_projects flag.
1589+
"""
1590+
mock_post_bulk_grouping_records.return_value = {"success": True, "groups_with_neighbor": {}}
1591+
self.project.update_option("sentry:similarity_backfill_completed", int(time.time()))
1592+
with TaskRunner():
1593+
backfill_seer_grouping_records_for_project(self.project.id, None)
1594+
1595+
last_group_id = sorted(
1596+
[group.id for group in Group.objects.filter(project_id=self.project.id)]
1597+
)[0]
1598+
expected_call_args_list = [
1599+
call(
1600+
"backfill_seer_grouping_records",
1601+
extra={
1602+
"current_project_id": self.project.id,
1603+
"last_processed_group_id": None,
1604+
"cohort": None,
1605+
"last_processed_project_index": None,
1606+
"only_delete": False,
1607+
},
1608+
),
1609+
call("about to call next backfill", extra={"project_id": self.project.id}),
1610+
call(
1611+
"calling next backfill task",
1612+
extra={"project_id": self.project.id, "last_processed_group_id": last_group_id},
1613+
),
1614+
call(
1615+
"backfill_seer_grouping_records",
1616+
extra={
1617+
"current_project_id": self.project.id,
1618+
"last_processed_group_id": last_group_id,
1619+
"cohort": None,
1620+
"last_processed_project_index": 0,
1621+
"only_delete": False,
1622+
},
1623+
),
1624+
call("backfill finished, no cohort", extra={"project_id": self.project.id}),
1625+
]
1626+
assert mock_logger.info.call_args_list == expected_call_args_list
1627+
1628+
@with_feature("projects:similarity-embeddings-backfill")
1629+
@patch("sentry.tasks.embeddings_grouping.backfill_seer_grouping_records_for_project.logger")
1630+
def test_backfill_seer_grouping_records_manually_skip_project(self, mock_logger):
1631+
"""
1632+
Test that project ids that are included in the skip_project_ids field are skipped.
1633+
"""
1634+
with TaskRunner():
1635+
backfill_seer_grouping_records_for_project(
1636+
self.project.id, None, skip_project_ids=[self.project.id]
1637+
)
1638+
1639+
expected_call_args_list = [
1640+
call(
1641+
"backfill_seer_grouping_records",
1642+
extra={
1643+
"current_project_id": self.project.id,
1644+
"last_processed_group_id": None,
1645+
"cohort": None,
1646+
"last_processed_project_index": None,
1647+
"only_delete": False,
1648+
},
1649+
),
1650+
call(
1651+
"backfill_seer_grouping_records.project_skipped",
1652+
extra={
1653+
"project_id": self.project.id,
1654+
"project_already_processed": False,
1655+
"project_manually_skipped": True,
1656+
},
1657+
),
1658+
call("backfill finished, no cohort", extra={"project_id": self.project.id}),
1659+
]
1660+
assert mock_logger.info.call_args_list == expected_call_args_list

0 commit comments

Comments
 (0)