Commit 93b2bfd
Pause crawls instead of stopping when quotas are reached
- Backend implementation with new crawl pause states: paused_storage_quota_reached, paused_time_quota_reached, paused_org_readonly
- Send an email to all org admins when a crawl is auto-paused
- Frontend updates

Partially dependent on crawler changes introduced in webrecorder/browsertrix-crawler#919
1 parent 72e6332 commit 93b2bfd
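
The three new auto-pause states record why a crawl was paused, not just that it was. As a rough illustration of how an operator might choose among them, here is a minimal sketch; the select_auto_pause_state helper and the org-level checks it calls are assumptions for illustration, not part of this commit:

    from typing import Optional

    def select_auto_pause_state(org) -> Optional[str]:
        """Pick the auto-pause state for an org, or None if no limit is hit.

        Hypothetical helper: readOnly, storage_quota_reached, and
        time_quota_reached are assumed org attributes/checks.
        """
        if org.readOnly:
            return "paused_org_readonly"
        if org.storage_quota_reached():
            return "paused_storage_quota_reached"
        if org.time_quota_reached():
            return "paused_time_quota_reached"
        return None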

File tree

19 files changed: +553, -118 lines changed


backend/btrixcloud/basecrawls.py

Lines changed: 2 additions & 1 deletion
@@ -430,7 +430,7 @@ async def delete_failed_crawl_files(self, crawl_id: str, oid: UUID):
         """Delete crawl files for failed crawl"""
         crawl = await self.get_base_crawl(crawl_id)
         org = await self.orgs.get_org_by_id(oid)
-        await self._delete_crawl_files(crawl, org)
+        deleted_file_size = await self._delete_crawl_files(crawl, org)
         await self.crawls.find_one_and_update(
             {"_id": crawl_id, "oid": oid},
             {
@@ -441,6 +441,7 @@ async def delete_failed_crawl_files(self, crawl_id: str, oid: UUID):
                 }
             },
         )
+        await self.orgs.inc_org_bytes_stored(oid, -deleted_file_size, "crawl")
 
     async def delete_all_crawl_qa_files(self, crawl_id: str, org: Organization):
         """Delete files for all qa runs in a crawl"""

backend/btrixcloud/crawls.py

Lines changed: 85 additions & 13 deletions
@@ -9,6 +9,7 @@
 import urllib.parse
 from datetime import datetime
 from uuid import UUID
+import asyncio
 
 from typing import (
     Annotated,
@@ -79,6 +80,8 @@
     MatchCrawlQueueResponse,
     CrawlLogLine,
     TagsResponse,
+    TYPE_AUTO_PAUSED_STATES,
+    UserRole,
 )
 
 
@@ -93,7 +96,12 @@ class CrawlOps(BaseCrawlOps):
 
     crawl_manager: CrawlManager
 
-    def __init__(self, crawl_manager: CrawlManager, log_ops: CrawlLogOps, *args):
+    def __init__(
+        self,
+        crawl_manager: CrawlManager,
+        log_ops: CrawlLogOps,
+        *args,
+    ):
         super().__init__(*args)
         self.crawl_manager = crawl_manager
         self.log_ops = log_ops
@@ -357,12 +365,12 @@ async def get_active_crawls(self, oid: UUID, limit: int) -> list[str]:
         res_list = await res.to_list()
         return [res["_id"] for res in res_list]
 
-    async def get_active_crawls_size(self, oid: UUID) -> int:
-        """get size of all active (running, waiting, paused) crawls"""
+    async def get_active_crawls_pending_size(self, oid: UUID) -> int:
+        """get pending size of all active (running, waiting, paused) crawls"""
         cursor = self.crawls.aggregate(
             [
                 {"$match": {"state": {"$in": RUNNING_AND_WAITING_STATES}, "oid": oid}},
-                {"$group": {"_id": None, "totalSum": {"$sum": "$stats.size"}}},
+                {"$group": {"_id": None, "totalSum": {"$sum": "$pendingSize"}}},
             ]
         )
         results = await cursor.to_list(length=1)
@@ -647,14 +655,16 @@ async def update_crawl_state_if_allowed(
         return res is not None
 
     async def update_running_crawl_stats(
-        self, crawl_id: str, is_qa: bool, stats: CrawlStats
+        self, crawl_id: str, is_qa: bool, stats: CrawlStats, pending_size: int
     ) -> bool:
         """update running crawl stats"""
         prefix = "" if not is_qa else "qa."
         query = {"_id": crawl_id, "type": "crawl", f"{prefix}state": "running"}
-        res = await self.crawls.find_one_and_update(
-            query, {"$set": {f"{prefix}stats": stats.dict()}}
-        )
+        update: dict[str, dict | int] = {f"{prefix}stats": stats.dict()}
+        if not is_qa:
+            update["pendingSize"] = pending_size
+
+        res = await self.crawls.find_one_and_update(query, {"$set": update})
         return res is not None
 
     async def inc_crawl_exec_time(
@@ -812,7 +822,11 @@ async def get_crawl_stats(
         return crawls_data
 
     async def pause_crawl(
-        self, crawl_id: str, org: Organization, pause: bool
+        self,
+        crawl_id: str,
+        org: Organization,
+        pause: bool,
+        paused_at: Optional[datetime] = None,
     ) -> Dict[str, bool]:
         """pause or resume a crawl temporarily"""
         crawl = await self.get_base_crawl(crawl_id, org)
@@ -821,10 +835,13 @@ async def pause_crawl(
 
         result = None
 
-        if pause:
+        if pause and not paused_at:
             paused_at = dt_now()
-        else:
-            paused_at = None
+
+        if not pause:
+            # If unpausing, unset autoPausedEmailsSent so that we will send
+            # emails again if quota is reached
+            await self.set_auto_paused_emails_sent(crawl_id, org, False)
 
         try:
             result = await self.crawl_manager.pause_resume_crawl(
@@ -1195,6 +1212,57 @@ async def get_crawl_logs(
             qa_run_id=qa_run_id,
         )
 
+    async def notify_org_admins_of_auto_paused_crawl(
+        self,
+        paused_reason: TYPE_AUTO_PAUSED_STATES,
+        crawl_id: str,
+        cid: UUID,
+        org: Organization,
+    ):
+        """Send email to all org admins about automatically paused crawl"""
+        if await self.get_auto_paused_emails_sent(crawl_id, org):
+            return
+
+        users = await self.orgs.get_users_for_org(org, UserRole.OWNER)
+        workflow = await self.crawl_configs.get_crawl_config_out(cid, org)
+
+        await asyncio.gather(
+            *[
+                self.user_manager.email.send_crawl_auto_paused(
+                    user.name,
+                    user.email,
+                    paused_reason,
+                    workflow.lastCrawlPausedExpiry,
+                    cid,
+                    org,
+                )
+                for user in users
+            ]
+        )
+
+        await self.set_auto_paused_emails_sent(crawl_id, org)
+
+    async def set_auto_paused_emails_sent(
+        self, crawl_id: str, org: Organization, emails_sent: bool = True
+    ):
+        """Set if auto-paused emails already sent"""
+        await self.crawls.find_one_and_update(
+            {"_id": crawl_id, "oid": org.id, "type": "crawl"},
+            {"$set": {"autoPausedEmailsSent": emails_sent}},
+        )
+
+    async def get_auto_paused_emails_sent(
+        self, crawl_id: str, org: Organization
+    ) -> bool:
+        """Return whether auto-paused emails already sent for crawl"""
+        res = await self.crawls.find_one(
+            {"_id": crawl_id, "oid": org.id, "type": "crawl"},
+            projection=["autoPausedEmailsSent"],
+        )
+        if res:
+            return res.get("autoPausedEmailsSent", False)
+        return False
+
 
 # ============================================================================
 async def recompute_crawl_file_count_and_size(crawls, crawl_id: str):
@@ -1217,7 +1285,11 @@ async def recompute_crawl_file_count_and_size(crawls, crawl_id: str):
 # ============================================================================
 # pylint: disable=too-many-arguments, too-many-locals, too-many-statements
 def init_crawls_api(
-    crawl_manager: CrawlManager, crawl_log_ops: CrawlLogOps, app, user_dep, *args
+    crawl_manager: CrawlManager,
+    crawl_log_ops: CrawlLogOps,
+    app,
+    user_dep,
+    *args,
 ):
     """API for crawl management, including crawl done callback"""
     # pylint: disable=invalid-name, duplicate-code
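
The notification helpers added above are designed to be called from the operator once a crawl enters an auto-paused state; that call site is outside this file's diff. A rough sketch of the intended usage, assuming crawl_ops, crawl_id, crawl, org, and a state drawn from AUTO_PAUSED_STATES (defined in models.py below) are in scope:

    # Hypothetical call site (not in this diff): email org admins once a
    # crawl transitions into an auto-paused state. The sent-flag guard in
    # notify_org_admins_of_auto_paused_crawl makes repeated calls no-ops,
    # and pause_crawl(pause=False) resets the flag when the crawl resumes.
    if state in AUTO_PAUSED_STATES:
        await crawl_ops.notify_org_admins_of_auto_paused_crawl(
            paused_reason=state,
            crawl_id=crawl_id,
            cid=crawl.cid,
            org=org,
        )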

backend/btrixcloud/emailsender.py

Lines changed: 29 additions & 0 deletions
@@ -20,6 +20,7 @@
     Organization,
     InvitePending,
     Subscription,
+    TYPE_AUTO_PAUSED_STATES,
 )
 from .utils import is_bool, get_origin
 
@@ -250,3 +251,31 @@ async def send_subscription_trial_ending_soon(
             behavior_on_trial_end=behavior_on_trial_end,
             support_email=self.support_email,
         )
+
+    async def send_crawl_auto_paused(
+        self,
+        user_name: str,
+        receiver_email: str,
+        paused_reason: TYPE_AUTO_PAUSED_STATES,
+        paused_expiry: datetime,
+        cid: UUID,
+        org: Organization,
+        headers=None,
+    ):
+        """Send email indicating crawl was paused due to quota or disabled crawling"""
+
+        origin = get_origin(headers)
+        org_url = f"{origin}/orgs/{org.slug}"
+        workflow_url = f"{org_url}/workflows/{cid}/latest"
+
+        await self._send_encrypted(
+            receiver_email,
+            "crawlAutoPaused",
+            org_name=org.name,
+            user_name=user_name,
+            paused_reason=paused_reason,
+            paused_expiry=paused_expiry.isoformat(),
+            org_url=org_url,
+            workflow_url=workflow_url,
+            support_email=self.support_email,
+        )

backend/btrixcloud/models.py

Lines changed: 29 additions & 6 deletions
@@ -237,10 +237,30 @@ class UserOrgInfoOut(BaseModel):
 ]
 RUNNING_STATES = get_args(TYPE_RUNNING_STATES)
 
-TYPE_WAITING_STATES = Literal[
-    "starting", "waiting_capacity", "waiting_org_limit", "paused"
+TYPE_MANUALLY_PAUSED_STATES = Literal["paused"]
+
+TYPE_AUTO_PAUSED_STATES = Literal[
+    "paused_storage_quota_reached",
+    "paused_time_quota_reached",
+    "paused_org_readonly",
+]
+AUTO_PAUSED_STATES = get_args(TYPE_AUTO_PAUSED_STATES)
+
+TYPE_PAUSED_STATES = Literal[
+    TYPE_MANUALLY_PAUSED_STATES,
+    TYPE_AUTO_PAUSED_STATES,
+]
+PAUSED_STATES = get_args(TYPE_PAUSED_STATES)
+
+TYPE_WAITING_NOT_PAUSED_STATES = Literal[
+    "starting",
+    "waiting_capacity",
+    "waiting_org_limit",
 ]
-WAITING_STATES = get_args(TYPE_WAITING_STATES)
+WAITING_NOT_PAUSED_STATES = get_args(TYPE_WAITING_NOT_PAUSED_STATES)
+
+TYPE_WAITING_STATES = Literal[TYPE_PAUSED_STATES, TYPE_WAITING_NOT_PAUSED_STATES]
+WAITING_STATES = [*PAUSED_STATES, *WAITING_NOT_PAUSED_STATES]
 
 TYPE_FAILED_STATES = Literal[
     "canceled",
@@ -260,7 +280,7 @@ class UserOrgInfoOut(BaseModel):
     "stopped_org_readonly",
 ]
 SUCCESSFUL_STATES = get_args(TYPE_SUCCESSFUL_STATES)
-SUCCESSFUL_AND_PAUSED_STATES = ["paused", *SUCCESSFUL_STATES]
+SUCCESSFUL_AND_PAUSED_STATES = [*PAUSED_STATES, *SUCCESSFUL_STATES]
 
 TYPE_RUNNING_AND_WAITING_STATES = Literal[TYPE_WAITING_STATES, TYPE_RUNNING_STATES]
 RUNNING_AND_WAITING_STATES = [*WAITING_STATES, *RUNNING_STATES]
@@ -284,8 +304,6 @@ class CrawlStats(BaseModel):
     done: int = 0
     size: int = 0
 
-    profile_update: Optional[str] = ""
-
 
 # ============================================================================
 
@@ -887,6 +905,7 @@ class CrawlOut(BaseMongoModel):
 
     fileSize: int = 0
     fileCount: int = 0
+    pendingSize: int = 0
 
     tags: Optional[List[str]] = []
 
@@ -1071,6 +1090,10 @@ class Crawl(BaseCrawl, CrawlConfigCore):
     qa: Optional[QARun] = None
     qaFinished: Optional[Dict[str, QARun]] = {}
 
+    pendingSize: int = 0
+
+    autoPausedEmailsSent: bool = False
+
 
 # ============================================================================
 class CrawlCompleteIn(BaseModel):
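
Because nested Literal types flatten (PEP 586), the runtime tuples derived with get_args stay in sync with the composed aliases. A quick self-contained check, with values copied from the definitions above:

    from typing import Literal, get_args

    TYPE_MANUALLY_PAUSED_STATES = Literal["paused"]
    TYPE_AUTO_PAUSED_STATES = Literal[
        "paused_storage_quota_reached",
        "paused_time_quota_reached",
        "paused_org_readonly",
    ]
    # Nested Literals flatten, so this is a Literal of all four strings
    TYPE_PAUSED_STATES = Literal[TYPE_MANUALLY_PAUSED_STATES, TYPE_AUTO_PAUSED_STATES]

    assert get_args(TYPE_PAUSED_STATES) == (
        "paused",
        "paused_storage_quota_reached",
        "paused_time_quota_reached",
        "paused_org_readonly",
    )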
