Skip to content

Commit 2e66b9e

Browse files
authored
Limit concurrent context count (PLAYWRIGHT_MAX_CONTEXTS setting) (#95)
1 parent c32d720 commit 2e66b9e

File tree

5 files changed

+134
-98
lines changed

5 files changed

+134
-98
lines changed

README.md

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -60,9 +60,7 @@ Also, be sure to [install the `asyncio`-based Twisted reactor](https://docs.scra
6060
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
6161
```
6262

63-
## Settings
64-
65-
`scrapy-playwright` accepts the following settings:
63+
## Supported settings
6664

6765
* `PLAYWRIGHT_BROWSER_TYPE` (type `str`, default `chromium`)
6866
The browser type to be launched, e.g. `chromium`, `firefox`, `webkit`.
@@ -93,11 +91,16 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
9391
}
9492
```
9593

96-
See the section on [Multiple browser contexts](#multiple-browser-contexts)
97-
for more information.
94+
See the section on [browser contexts](#browser-contexts) for more information.
9895

9996
See also the docs for [`Browser.new_context`](https://playwright.dev/python/docs/api/class-browser#browser-new-context).
10097

98+
* `PLAYWRIGHT_MAX_CONTEXTS` (type `Optional[int]`, default `None`)
99+
100+
Maximum number of concurrent Playwright contexts allowed. If unset or `None`,
101+
no limit is enforced. See the [Maximum concurrent context count](#maximum-concurrent-context-count)
102+
section for more information.
103+
101104
* `PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT` (type `Optional[float]`, default `None`)
102105

103106
The timeout used when requesting pages by Playwright. If `None` or unset,
@@ -260,7 +263,7 @@ class AwesomeSpiderWithPage(scrapy.Spider):
260263
Scrapy request workflow (Scheduler, Middlewares, etc).
261264

262265

263-
## Multiple browser contexts
266+
## Browser contexts
264267

265268
Multiple [browser contexts](https://playwright.dev/python/docs/browser-contexts)
266269
to be launched at startup can be defined via the `PLAYWRIGHT_CONTEXTS` [setting](#supported-settings).
@@ -327,6 +330,7 @@ def parse(self, response):
327330
yield scrapy.Request(
328331
url="https://example.org",
329332
callback=self.parse_in_new_context,
333+
errback=self.close_context_on_error,
330334
meta={"playwright": True, "playwright_context": "new", "playwright_include_page": True},
331335
)
332336

@@ -336,8 +340,21 @@ async def parse_in_new_context(self, response):
336340
await page.context.close() # close the context
337341
await page.close()
338342
return {"title": title}
343+
344+
async def close_context_on_error(self, failure):
345+
page = failure.request.meta["playwright_page"]
346+
await page.context.close()
339347
```
340348

349+
### Maximum concurrent context count
350+
351+
Specify a value for the `PLAYWRIGHT_MAX_CONTEXTS` setting to limit the number
352+
of concurrent contexts. This setting should be used with caution: it's possible
353+
to block the whole crawl if contexts are not closed after they are no longer
354+
used (refer to the section above to dynamically close contexts). Make sure to
355+
define an errback to still be able to close the context even if there are
356+
errors with a request.
357+
341358

342359
## Proxy support
343360

examples/contexts.py

Lines changed: 38 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,25 @@
1+
from pathlib import Path
2+
13
from scrapy import Spider, Request
2-
from scrapy.crawler import CrawlerProcess
34

45

56
class MultipleContextsSpider(Spider):
67
"""Handle multiple browser contexts"""
78

89
name = "contexts"
910
custom_settings = {
11+
"TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
12+
"DOWNLOAD_HANDLERS": {
13+
"https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
14+
# "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
15+
},
16+
"PLAYWRIGHT_MAX_CONTEXTS": 6,
1017
"PLAYWRIGHT_CONTEXTS": {
1118
"first": {
1219
"storage_state": {
1320
"cookies": [
1421
{
15-
"url": "https://httpbin.org/headers",
22+
"url": "https://example.org",
1623
"name": "context",
1724
"value": "first",
1825
},
@@ -23,47 +30,43 @@ class MultipleContextsSpider(Spider):
2330
"storage_state": {
2431
"cookies": [
2532
{
26-
"url": "https://httpbin.org/headers",
33+
"url": "https://example.org",
2734
"name": "context",
2835
"value": "second",
2936
},
3037
],
3138
},
3239
},
40+
"persistent": {
41+
"user_data_dir": str(Path.home() / "playwright-persistent-context"),
42+
"java_script_enabled": False,
43+
},
3344
},
3445
}
3546

3647
def start_requests(self):
3748
# using existing contexts
38-
yield Request(
39-
url="https://httpbin.org/headers",
40-
meta={
41-
"playwright": True,
42-
"playwright_context": "first",
43-
"playwright_include_page": True,
44-
},
45-
dont_filter=True,
46-
)
47-
yield Request(
48-
url="https://httpbin.org/headers",
49-
meta={
50-
"playwright": True,
51-
"playwright_context": "second",
52-
"playwright_include_page": True,
53-
},
54-
dont_filter=True,
55-
)
49+
for ctx_name in self.custom_settings["PLAYWRIGHT_CONTEXTS"].keys():
50+
yield Request(
51+
url="https://example.org",
52+
meta={
53+
"playwright": True,
54+
"playwright_context": ctx_name,
55+
"playwright_include_page": True,
56+
},
57+
dont_filter=True,
58+
)
5659
# create a new context
5760
yield Request(
58-
url="https://httpbin.org/headers",
61+
url="https://example.org",
5962
meta={
6063
"playwright": True,
6164
"playwright_context": "third",
6265
"playwright_context_kwargs": {
6366
"storage_state": {
6467
"cookies": [
6568
{
66-
"url": "https://httpbin.org/headers",
69+
"url": "https://example.org",
6770
"name": "context",
6871
"value": "third",
6972
},
@@ -76,10 +79,21 @@ def start_requests(self):
7679
)
7780
# default context
7881
yield Request(
79-
url="https://httpbin.org/headers",
82+
url="https://example.org",
8083
meta={"playwright": True, "playwright_include_page": True},
8184
dont_filter=True,
8285
)
86+
# each request on a different context
87+
for i in range(20):
88+
yield Request(
89+
url=f"https://example.org?foo={i}",
90+
meta={
91+
"playwright": True,
92+
"playwright_context": f"context-{i}",
93+
"playwright_include_page": True,
94+
},
95+
dont_filter=True,
96+
)
8397

8498
async def parse(self, response):
8599
page = response.meta["playwright_page"]
@@ -91,17 +105,3 @@ async def parse(self, response):
91105
"context": context_name,
92106
"cookies": storage_state["cookies"],
93107
}
94-
95-
96-
if __name__ == "__main__":
97-
process = CrawlerProcess(
98-
settings={
99-
"TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
100-
"DOWNLOAD_HANDLERS": {
101-
"https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
102-
# "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
103-
},
104-
}
105-
)
106-
process.crawl(MultipleContextsSpider)
107-
process.start()

examples/persistent_context.py

Lines changed: 0 additions & 39 deletions
This file was deleted.

scrapy_playwright/handler.py

Lines changed: 29 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -64,13 +64,22 @@ def __init__(self, crawler: Crawler) -> None:
6464
crawler.signals.connect(self._engine_started, signals.engine_started)
6565
self.stats = crawler.stats
6666

67-
self.browser_launch_lock = asyncio.Lock()
68-
self.context_launch_lock = asyncio.Lock()
67+
# browser
6968
self.browser_type_name = settings.get("PLAYWRIGHT_BROWSER_TYPE") or DEFAULT_BROWSER_TYPE
69+
self.browser_launch_lock = asyncio.Lock()
70+
self.launch_options: dict = settings.getdict("PLAYWRIGHT_LAUNCH_OPTIONS") or {}
71+
72+
# contexts
7073
self.max_pages_per_context: int = settings.getint(
7174
"PLAYWRIGHT_MAX_PAGES_PER_CONTEXT"
7275
) or settings.getint("CONCURRENT_REQUESTS")
73-
self.launch_options: dict = settings.getdict("PLAYWRIGHT_LAUNCH_OPTIONS") or {}
76+
self.context_launch_lock = asyncio.Lock()
77+
self.contexts: Dict[str, BrowserContextWrapper] = {}
78+
self.context_kwargs: dict = settings.getdict("PLAYWRIGHT_CONTEXTS")
79+
if settings.getint("PLAYWRIGHT_MAX_CONTEXTS"):
80+
self.context_semaphore = asyncio.Semaphore(
81+
value=settings.getint("PLAYWRIGHT_MAX_CONTEXTS")
82+
)
7483

7584
self.default_navigation_timeout: Optional[float] = None
7685
if "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT" in settings:
@@ -79,7 +88,7 @@ def __init__(self, crawler: Crawler) -> None:
7988
settings.get("PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT")
8089
)
8190

82-
# header-related settings
91+
# headers
8392
if "PLAYWRIGHT_PROCESS_REQUEST_HEADERS" in settings:
8493
if settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"] is None:
8594
self.process_request_headers = None
@@ -99,10 +108,6 @@ def __init__(self, crawler: Crawler) -> None:
99108
else:
100109
self.process_request_headers = use_scrapy_headers
101110

102-
# context-related settings
103-
self.contexts: Dict[str, BrowserContextWrapper] = {}
104-
self.context_kwargs: dict = settings.getdict("PLAYWRIGHT_CONTEXTS")
105-
106111
self.abort_request: Optional[Callable[[PlaywrightRequest], Union[Awaitable, bool]]] = None
107112
if settings.get("PLAYWRIGHT_ABORT_REQUEST"):
108113
self.abort_request = load_object(settings["PLAYWRIGHT_ABORT_REQUEST"])
@@ -123,13 +128,13 @@ async def _launch(self) -> None:
123128
self.browser_type: BrowserType = getattr(self.playwright, self.browser_type_name)
124129
if self.context_kwargs:
125130
logger.info(f"Launching {len(self.context_kwargs)} startup context(s)")
126-
contexts = await asyncio.gather(
131+
await asyncio.gather(
127132
*[
128133
self._create_browser_context(name=name, context_kwargs=kwargs)
129134
for name, kwargs in self.context_kwargs.items()
130135
]
131136
)
132-
self.contexts = dict(zip(self.context_kwargs.keys(), contexts))
137+
self._set_max_concurrent_context_count()
133138
logger.info("Startup context(s) launched")
134139
self.stats.set_value("playwright/page_count", self._get_total_page_count())
135140

@@ -144,6 +149,8 @@ async def _create_browser_context(
144149
self, name: str, context_kwargs: Optional[dict]
145150
) -> BrowserContextWrapper:
146151
"""Create a new context, also launching a browser if necessary."""
152+
if hasattr(self, "context_semaphore"):
153+
await self.context_semaphore.acquire()
147154
context_kwargs = context_kwargs or {}
148155
if context_kwargs.get(PERSISTENT_CONTEXT_PATH_KEY):
149156
context = await self.browser_type.launch_persistent_context(**context_kwargs)
@@ -159,11 +166,13 @@ async def _create_browser_context(
159166
self.stats.inc_value("playwright/context_count")
160167
if self.default_navigation_timeout is not None:
161168
context.set_default_navigation_timeout(self.default_navigation_timeout)
162-
return BrowserContextWrapper(
169+
self.contexts[name] = BrowserContextWrapper(
163170
context=context,
164171
semaphore=asyncio.Semaphore(value=self.max_pages_per_context),
165172
persistent=persistent,
166173
)
174+
self._set_max_concurrent_context_count()
175+
return self.contexts[name]
167176

168177
async def _create_page(self, request: Request) -> Page:
169178
"""Create a new page in a context, also creating a new context if necessary."""
@@ -173,7 +182,7 @@ async def _create_page(self, request: Request) -> Page:
173182
async with self.context_launch_lock:
174183
context = self.contexts.get(context_name)
175184
if context is None:
176-
context = self.contexts[context_name] = await self._create_browser_context(
185+
context = await self._create_browser_context(
177186
name=context_name, context_kwargs=request.meta.get("playwright_context_kwargs")
178187
)
179188

@@ -208,6 +217,11 @@ def _set_max_concurrent_page_count(self):
208217
if current_max_count is None or count > current_max_count:
209218
self.stats.set_value("playwright/page_count/max_concurrent", count)
210219

220+
def _set_max_concurrent_context_count(self):
221+
current_max_count = self.stats.get_value("playwright/context_count/max_concurrent")
222+
if current_max_count is None or len(self.contexts) > current_max_count:
223+
self.stats.set_value("playwright/context_count/max_concurrent", len(self.contexts))
224+
211225
@inlineCallbacks
212226
def close(self) -> Deferred:
213227
logger.info("Closing download handler")
@@ -355,9 +369,10 @@ def close_page_callback() -> None:
355369

356370
def _make_close_browser_context_callback(self, name: str, persistent: bool) -> Callable:
357371
def close_browser_context_callback() -> None:
372+
self.contexts.pop(name, None)
373+
if hasattr(self, "context_semaphore"):
374+
self.context_semaphore.release()
358375
logger.debug(f"Browser context closed: '{name}' (persistent={persistent})")
359-
if name in self.contexts:
360-
self.contexts.pop(name)
361376

362377
return close_browser_context_callback
363378

0 commit comments

Comments
 (0)