
Commit c32d720

Persistent contexts (#94)
1 parent c53786c commit c32d720

File tree

6 files changed: +237 -92 lines changed

README.md

Lines changed: 23 additions & 5 deletions

@@ -78,18 +78,25 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
 It should be a mapping of (name, keyword arguments). For instance:
 ```python
 {
-    "first": {
+    "foobar": {
         "context_arg1": "value",
         "context_arg2": "value",
     },
-    "second": {
+    "default": {
+        "context_arg1": "value",
+        "context_arg2": "value",
+    },
+    "persistent": {
+        "user_data_dir": "/path/to/dir",  # will be a persistent context
         "context_arg1": "value",
     },
 }
 ```
-A default context (called `default`) is created if no contexts are defined,
-this will be used by all requests which do not explicitly specify a context.
-See the docs for [`Browser.new_context`](https://playwright.dev/python/docs/api/class-browser#browser-new-context).
+
+See the section on [Multiple browser contexts](#multiple-browser-contexts)
+for more information.
+
+See also the docs for [`Browser.new_context`](https://playwright.dev/python/docs/api/class-browser#browser-new-context).
 
 * `PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT` (type `Optional[float]`, default `None`)
 
@@ -269,6 +276,17 @@ yield scrapy.Request(
 )
 ```
 
+### Default context
+
+If a request does not explicitly indicate a context via the `playwright_context`
+meta key, it falls back to using a general context called `default`. This `default`
+context can also be customized on startup via the `PLAYWRIGHT_CONTEXTS` setting.
+
+### Persistent contexts
+
+Pass a value for the `user_data_dir` keyword argument to launch a context as
+**persistent** (see [`BrowserType.launch_persistent_context`](https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch-persistent-context)).
+
 ### Creating a context during a crawl
 
 If the context specified in the `playwright_context` meta key does not exist, it will be created.
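
For reference, a minimal spider sketch (not part of this commit; the spider name and URLs are illustrative) combining the behaviours described in the README changes above: a request without a `playwright_context` meta key is handled by the `default` context, while a request naming a context defined with `user_data_dir` in `PLAYWRIGHT_CONTEXTS` goes through a persistent context.

```python
import scrapy


class ContextsSpider(scrapy.Spider):
    """Illustrative only: one request on the implicit "default" context,
    one on a persistent context defined via PLAYWRIGHT_CONTEXTS."""

    name = "contexts"
    custom_settings = {
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "DOWNLOAD_HANDLERS": {
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        "PLAYWRIGHT_CONTEXTS": {
            "persistent": {"user_data_dir": "/path/to/dir"},  # persistent context
        },
    }

    def start_requests(self):
        # no "playwright_context" meta key: falls back to the "default" context
        yield scrapy.Request("https://example.org", meta={"playwright": True})
        # explicitly target the persistent context defined above
        yield scrapy.Request(
            "https://example.com",
            meta={"playwright": True, "playwright_context": "persistent"},
        )

    def parse(self, response):
        yield {"url": response.url, "context": response.meta.get("playwright_context")}
```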

examples/persistent_context.py

Lines changed: 39 additions & 0 deletions (new file)

```python
from pathlib import Path

from scrapy import Spider, Request


class PersistentContextSpider(Spider):
    """Use a persistent browser context"""

    name = "persistent_context"
    custom_settings = {
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "DOWNLOAD_HANDLERS": {
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            # "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        "PLAYWRIGHT_CONTEXTS": {
            "foobar": {
                "user_data_dir": str(Path.home() / "playwright-persistent-context"),
                "java_script_enabled": False,
                "extra_http_headers": {"Asdf": "Qwerty"},
                "user_agent": "foobar",
            }
        },
        "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": None,
    }

    def start_requests(self):
        yield Request(
            url="https://httpbin.org/get",
            meta={"playwright": True, "playwright_context": "foobar"},
        )

    def parse(self, response):
        content = response.css("pre::text").get()
        print(content)
        return {
            "url": response.url,
            "context": response.meta["playwright_context"],
        }
```
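
This example spider is self-contained, so it can presumably be run standalone with `scrapy runspider examples/persistent_context.py`; the `playwright-persistent-context` profile directory under the user's home is created on first run and reused afterwards.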

scrapy_playwright/handler.py

Lines changed: 105 additions & 61 deletions

@@ -2,12 +2,15 @@
 import logging
 import warnings
 from contextlib import suppress
+from dataclasses import dataclass
 from ipaddress import ip_address
 from time import time
 from typing import Awaitable, Callable, Dict, Generator, Optional, Tuple, Type, TypeVar, Union
 
 from playwright.async_api import (
+    Browser,
     BrowserContext,
+    BrowserType,
     Page,
     PlaywrightContextManager,
     Request as PlaywrightRequest,
@@ -41,32 +44,48 @@
 logger = logging.getLogger("scrapy-playwright")
 
 
+DEFAULT_BROWSER_TYPE = "chromium"
+DEFAULT_CONTEXT_NAME = "default"
+PERSISTENT_CONTEXT_PATH_KEY = "user_data_dir"
+
+
+@dataclass
+class BrowserContextWrapper:
+    context: BrowserContext
+    semaphore: asyncio.Semaphore
+    persistent: bool
+
+
 class ScrapyPlaywrightDownloadHandler(HTTPDownloadHandler):
     def __init__(self, crawler: Crawler) -> None:
-        super().__init__(settings=crawler.settings, crawler=crawler)
+        settings = crawler.settings
+        super().__init__(settings=settings, crawler=crawler)
         verify_installed_reactor("twisted.internet.asyncioreactor.AsyncioSelectorReactor")
         crawler.signals.connect(self._engine_started, signals.engine_started)
         self.stats = crawler.stats
 
-        self.browser_type: str = crawler.settings.get("PLAYWRIGHT_BROWSER_TYPE") or "chromium"
-        self.max_pages_per_context: int = crawler.settings.getint(
+        self.browser_launch_lock = asyncio.Lock()
+        self.context_launch_lock = asyncio.Lock()
+        self.browser_type_name = settings.get("PLAYWRIGHT_BROWSER_TYPE") or DEFAULT_BROWSER_TYPE
+        self.max_pages_per_context: int = settings.getint(
             "PLAYWRIGHT_MAX_PAGES_PER_CONTEXT"
-        ) or crawler.settings.getint("CONCURRENT_REQUESTS")
-        self.launch_options: dict = crawler.settings.getdict("PLAYWRIGHT_LAUNCH_OPTIONS") or {}
+        ) or settings.getint("CONCURRENT_REQUESTS")
+        self.launch_options: dict = settings.getdict("PLAYWRIGHT_LAUNCH_OPTIONS") or {}
 
         self.default_navigation_timeout: Optional[float] = None
-        if "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT" in crawler.settings:
+        if "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT" in settings:
             with suppress(TypeError, ValueError):
                 self.default_navigation_timeout = float(
-                    crawler.settings.get("PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT")
+                    settings.get("PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT")
                 )
 
-        if "PLAYWRIGHT_PROCESS_REQUEST_HEADERS" in crawler.settings:
-            if crawler.settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"] is None:
-                self.process_request_headers = None  # use headers from the Playwright request
+        # header-related settings
+        if "PLAYWRIGHT_PROCESS_REQUEST_HEADERS" in settings:
+            if settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"] is None:
+                self.process_request_headers = None
             else:
                 self.process_request_headers = load_object(
-                    crawler.settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"]
+                    settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"]
                 )
                 if self.process_request_headers is use_playwright_headers:
                     warnings.warn(
@@ -80,70 +99,94 @@ def __init__(self, crawler: Crawler) -> None:
         else:
            self.process_request_headers = use_scrapy_headers
 
-        self.context_kwargs: dict = crawler.settings.getdict("PLAYWRIGHT_CONTEXTS")
-        self.contexts: Dict[str, BrowserContext] = {}
-        self.context_semaphores: Dict[str, asyncio.Semaphore] = {}
+        # context-related settings
+        self.contexts: Dict[str, BrowserContextWrapper] = {}
+        self.context_kwargs: dict = settings.getdict("PLAYWRIGHT_CONTEXTS")
 
         self.abort_request: Optional[Callable[[PlaywrightRequest], Union[Awaitable, bool]]] = None
-        if crawler.settings.get("PLAYWRIGHT_ABORT_REQUEST"):
-            self.abort_request = load_object(crawler.settings["PLAYWRIGHT_ABORT_REQUEST"])
+        if settings.get("PLAYWRIGHT_ABORT_REQUEST"):
+            self.abort_request = load_object(settings["PLAYWRIGHT_ABORT_REQUEST"])
 
     @classmethod
     def from_crawler(cls: Type[PlaywrightHandler], crawler: Crawler) -> PlaywrightHandler:
         return cls(crawler)
 
     def _engine_started(self) -> Deferred:
         """Launch the browser. Use the engine_started signal as it supports returning deferreds."""
-        return deferred_from_coro(self._launch_browser())
+        return deferred_from_coro(self._launch())
 
-    async def _launch_browser(self) -> None:
+    async def _launch(self) -> None:
+        """Launch Playwright manager and configured startup context(s)."""
+        logger.info("Starting download handler")
         self.playwright_context_manager = PlaywrightContextManager()
         self.playwright = await self.playwright_context_manager.start()
-        logger.info("Launching browser")
-        browser_launcher = getattr(self.playwright, self.browser_type).launch
-        self.browser = await browser_launcher(**self.launch_options)
-        logger.info(f"Browser {self.browser_type} launched")
-        contexts = await asyncio.gather(
-            *[
-                self._create_browser_context(name, kwargs)
-                for name, kwargs in self.context_kwargs.items()
-            ]
-        )
-        self.contexts = dict(zip(self.context_kwargs.keys(), contexts))
-        self.context_semaphores = {
-            name: asyncio.Semaphore(value=self.max_pages_per_context) for name in self.contexts
-        }
-
-    async def _create_browser_context(self, name: str, context_kwargs: dict) -> BrowserContext:
-        context = await self.browser.new_context(**context_kwargs)
-        context.on("close", self._make_close_browser_context_callback(name))
-        logger.debug("Browser context started: '%s'", name)
+        self.browser_type: BrowserType = getattr(self.playwright, self.browser_type_name)
+        if self.context_kwargs:
+            logger.info(f"Launching {len(self.context_kwargs)} startup context(s)")
+            contexts = await asyncio.gather(
+                *[
+                    self._create_browser_context(name=name, context_kwargs=kwargs)
+                    for name, kwargs in self.context_kwargs.items()
+                ]
+            )
+            self.contexts = dict(zip(self.context_kwargs.keys(), contexts))
+            logger.info("Startup context(s) launched")
+        self.stats.set_value("playwright/page_count", self._get_total_page_count())
+
+    async def _maybe_launch_browser(self) -> None:
+        async with self.browser_launch_lock:
+            if not hasattr(self, "browser"):
+                logger.info(f"Launching browser {self.browser_type.name}")
+                self.browser: Browser = await self.browser_type.launch(**self.launch_options)
+                logger.info(f"Browser {self.browser_type.name} launched")
+
+    async def _create_browser_context(
+        self, name: str, context_kwargs: Optional[dict]
+    ) -> BrowserContextWrapper:
+        """Create a new context, also launching a browser if necessary."""
+        context_kwargs = context_kwargs or {}
+        if context_kwargs.get(PERSISTENT_CONTEXT_PATH_KEY):
+            context = await self.browser_type.launch_persistent_context(**context_kwargs)
+            persistent = True
+            self.stats.inc_value("playwright/context_count/persistent")
+        else:
+            await self._maybe_launch_browser()
+            context = await self.browser.new_context(**context_kwargs)
+            persistent = False
+            self.stats.inc_value("playwright/context_count/non-persistent")
+        context.on("close", self._make_close_browser_context_callback(name, persistent))
+        logger.debug(f"Browser context started: '{name}' (persistent={persistent})")
         self.stats.inc_value("playwright/context_count")
         if self.default_navigation_timeout is not None:
             context.set_default_navigation_timeout(self.default_navigation_timeout)
-        return context
+        return BrowserContextWrapper(
+            context=context,
+            semaphore=asyncio.Semaphore(value=self.max_pages_per_context),
+            persistent=persistent,
+        )
 
     async def _create_page(self, request: Request) -> Page:
         """Create a new page in a context, also creating a new context if necessary."""
-        context_name = request.meta.setdefault("playwright_context", "default")
-        context = self.contexts.get(context_name)
-        if context is None:
-            context_kwargs = request.meta.get("playwright_context_kwargs") or {}
-            context = await self._create_browser_context(context_name, context_kwargs)
-            self.contexts[context_name] = context
-            self.context_semaphores[context_name] = asyncio.Semaphore(
-                value=self.max_pages_per_context
-            )
+        context_name = request.meta.setdefault("playwright_context", DEFAULT_CONTEXT_NAME)
+        # this block needs to be locked because several attempts to launch a context
+        # with the same name could happen at the same time from different requests
+        async with self.context_launch_lock:
+            context = self.contexts.get(context_name)
+            if context is None:
+                context = self.contexts[context_name] = await self._create_browser_context(
+                    name=context_name, context_kwargs=request.meta.get("playwright_context_kwargs")
+                )
 
-        await self.context_semaphores[context_name].acquire()
-        page = await context.new_page()
+        await context.semaphore.acquire()
+        page = await context.context.new_page()
         self.stats.inc_value("playwright/page_count")
         logger.debug(
             "[Context=%s] New page created, page count is %i (%i for all contexts)",
             context_name,
-            len(context.pages),
+            len(context.context.pages),
            self._get_total_page_count(),
         )
+        self._set_max_concurrent_page_count()
        if self.default_navigation_timeout is not None:
            page.set_default_navigation_timeout(self.default_navigation_timeout)
 
@@ -157,21 +200,24 @@ async def _create_page(self, request: Request) -> Page:
         return page
 
     def _get_total_page_count(self):
-        count = sum([len(context.pages) for context in self.contexts.values()])
+        return sum([len(ctx.context.pages) for ctx in self.contexts.values()])
+
+    def _set_max_concurrent_page_count(self):
+        count = self._get_total_page_count()
         current_max_count = self.stats.get_value("playwright/page_count/max_concurrent")
         if current_max_count is None or count > current_max_count:
             self.stats.set_value("playwright/page_count/max_concurrent", count)
-        return count
 
     @inlineCallbacks
     def close(self) -> Deferred:
+        logger.info("Closing download handler")
         yield super().close()
         yield deferred_from_coro(self._close())
 
     async def _close(self) -> None:
+        await asyncio.gather(*[ctx.context.close() for ctx in self.contexts.values()])
         self.contexts.clear()
-        self.context_semaphores.clear()
-        if getattr(self, "browser", None):
+        if hasattr(self, "browser"):
             logger.info("Closing browser")
             await self.browser.close()
         await self.playwright_context_manager.__aexit__()
@@ -302,18 +348,16 @@ def _increment_response_stats(self, response: PlaywrightResponse) -> None:
 
     def _make_close_page_callback(self, context_name: str) -> Callable:
         def close_page_callback() -> None:
-            if context_name in self.context_semaphores:
-                self.context_semaphores[context_name].release()
+            if context_name in self.contexts:
+                self.contexts[context_name].semaphore.release()
 
         return close_page_callback
 
-    def _make_close_browser_context_callback(self, name: str) -> Callable:
+    def _make_close_browser_context_callback(self, name: str, persistent: bool) -> Callable:
         def close_browser_context_callback() -> None:
-            logger.debug("Browser context closed: '%s'", name)
+            logger.debug(f"Browser context closed: '{name}' (persistent={persistent})")
             if name in self.contexts:
                 self.contexts.pop(name)
-            if name in self.context_semaphores:
-                self.context_semaphores.pop(name)
 
         return close_browser_context_callback
 
@@ -334,7 +378,7 @@ async def _request_handler(route: Route, playwright_request: PlaywrightRequest)
             if self.process_request_headers is not None:
                 overrides["headers"] = await _maybe_await(
                     self.process_request_headers(
-                        self.browser_type, playwright_request, scrapy_headers
+                        self.browser_type_name, playwright_request, scrapy_headers
                     )
                 )
                 # the request that reaches the callback should contain the final headers
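
To summarize the branching that `_create_browser_context` introduces above, here is a standalone sketch (not part of the patch; the function names, profile path, and URL are illustrative) using the Playwright API directly: when the context kwargs contain `user_data_dir`, the context is launched straight from the `BrowserType` via `launch_persistent_context`, otherwise a shared `Browser` is launched first and `Browser.new_context` is used.

```python
import asyncio

from playwright.async_api import async_playwright


async def create_context(playwright, context_kwargs: dict):
    # Mirrors the persistent/non-persistent branch in _create_browser_context:
    # a "user_data_dir" key selects BrowserType.launch_persistent_context,
    # which does not require a separately launched Browser object.
    browser_type = playwright.chromium
    if context_kwargs.get("user_data_dir"):
        return await browser_type.launch_persistent_context(**context_kwargs)
    browser = await browser_type.launch()
    return await browser.new_context(**context_kwargs)


async def main():
    async with async_playwright() as pw:
        context = await create_context(pw, {"user_data_dir": "/tmp/pw-profile"})
        page = await context.new_page()
        await page.goto("https://example.org")
        await context.close()


asyncio.run(main())
```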

tests/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -11,7 +11,7 @@ async def make_handler(settings_dict: dict):
     crawler = get_crawler(settings_dict=settings_dict)
     handler = ScrapyPlaywrightDownloadHandler(crawler=crawler)
     try:
-        await handler._launch_browser()
+        await handler._launch()
     except:  # noqa (E722), pylint: disable=bare-except
         pass
     else:
