Skip to content

Commit 5b254e4

Browse files
authored
Configuration class (#248)
* Config class
* Do not delete context kwargs after init
* Fix attribute path
1 parent 1e7b2f6 commit 5b254e4

File tree

5 files changed: 102 additions and 107 deletions.

scrapy_playwright/handler.py

Lines changed: 62 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from scrapy.http import Request, Response
2424
from scrapy.http.headers import Headers
2525
from scrapy.responsetypes import responsetypes
26+
from scrapy.settings import Settings
2627
from scrapy.utils.defer import deferred_from_coro
2728
from scrapy.utils.misc import load_object
2829
from scrapy.utils.reactor import verify_installed_reactor
@@ -60,57 +61,68 @@ class BrowserContextWrapper:
6061
persistent: bool
6162

6263

64+
@dataclass
class Config:
    """Playwright-related options for the download handler, read from Scrapy settings.

    Attributes:
        cdp_url: Value of ``PLAYWRIGHT_CDP_URL``.
        cdp_kwargs: ``PLAYWRIGHT_CDP_KWARGS`` with any ``endpoint_url`` key removed.
        browser_type_name: ``PLAYWRIGHT_BROWSER_TYPE``, or ``DEFAULT_BROWSER_TYPE``
            when that setting is falsy.
        launch_options: ``PLAYWRIGHT_LAUNCH_OPTIONS``.
        max_pages_per_context: ``PLAYWRIGHT_MAX_PAGES_PER_CONTEXT``, falling back to
            ``CONCURRENT_REQUESTS`` when the former is zero or unset.
        max_contexts: ``PLAYWRIGHT_MAX_CONTEXTS``, or ``None`` when it is zero or unset.
        startup_context_kwargs: ``PLAYWRIGHT_CONTEXTS``.
        navigation_timeout: ``PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT`` as a float, or
            ``None`` when the setting is absent or cannot be converted.
    """

    cdp_url: Optional[str]
    cdp_kwargs: dict
    browser_type_name: str
    launch_options: dict
    max_pages_per_context: int
    max_contexts: Optional[int]
    startup_context_kwargs: dict
    navigation_timeout: Optional[float] = None

    @classmethod
    def from_settings(cls, settings: Settings) -> "Config":
        """Build a ``Config`` from the given Scrapy settings object.

        Logs a warning when both a CDP URL and launch options are configured.
        """
        endpoint = settings.get("PLAYWRIGHT_CDP_URL")
        connect_kwargs = settings.getdict("PLAYWRIGHT_CDP_KWARGS") or {}
        browser_name = settings.get("PLAYWRIGHT_BROWSER_TYPE") or DEFAULT_BROWSER_TYPE
        browser_launch_options = settings.getdict("PLAYWRIGHT_LAUNCH_OPTIONS") or {}
        pages_limit = settings.getint("PLAYWRIGHT_MAX_PAGES_PER_CONTEXT")
        contexts_limit = settings.getint("PLAYWRIGHT_MAX_CONTEXTS") or None
        startup_contexts = settings.getdict("PLAYWRIGHT_CONTEXTS")

        # The endpoint is carried by cdp_url, so it must not also appear in the kwargs.
        connect_kwargs.pop("endpoint_url", None)

        if not pages_limit:
            pages_limit = settings.getint("CONCURRENT_REQUESTS")

        if endpoint and browser_launch_options:
            logger.warning("PLAYWRIGHT_CDP_URL is set, ignoring PLAYWRIGHT_LAUNCH_OPTIONS")

        timeout = None
        if "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT" in settings:
            # A value float() rejects (e.g. None) leaves the timeout as None.
            with suppress(TypeError, ValueError):
                timeout = float(settings["PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT"])

        return cls(
            cdp_url=endpoint,
            cdp_kwargs=connect_kwargs,
            browser_type_name=browser_name,
            launch_options=browser_launch_options,
            max_pages_per_context=pages_limit,
            max_contexts=contexts_limit,
            startup_context_kwargs=startup_contexts,
            navigation_timeout=timeout,
        )
95+
96+
6397
class ScrapyPlaywrightDownloadHandler(HTTPDownloadHandler):
6498
def __init__(self, crawler: Crawler) -> None:
65-
settings = crawler.settings
66-
super().__init__(settings=settings, crawler=crawler)
99+
super().__init__(settings=crawler.settings, crawler=crawler)
67100
verify_installed_reactor("twisted.internet.asyncioreactor.AsyncioSelectorReactor")
68101
crawler.signals.connect(self._engine_started, signals.engine_started)
69102
self.stats = crawler.stats
70103

71-
# browser
72-
self.browser_cdp_url = settings.get("PLAYWRIGHT_CDP_URL")
73-
self.browser_cdp_kwargs = settings.get("PLAYWRIGHT_CDP_KWARGS") or {}
74-
self.browser_cdp_kwargs.pop("endpoint_url", None)
75-
self.browser_type_name = settings.get("PLAYWRIGHT_BROWSER_TYPE") or DEFAULT_BROWSER_TYPE
76-
self.browser_launch_lock = asyncio.Lock()
77-
self.launch_options: dict = settings.getdict("PLAYWRIGHT_LAUNCH_OPTIONS") or {}
78-
if self.browser_cdp_url and self.launch_options:
79-
logger.warning("PLAYWRIGHT_CDP_URL is set, ignoring PLAYWRIGHT_LAUNCH_OPTIONS")
104+
self.config = Config.from_settings(crawler.settings)
80105

81-
# contexts
82-
self.max_pages_per_context: int = settings.getint(
83-
"PLAYWRIGHT_MAX_PAGES_PER_CONTEXT"
84-
) or settings.getint("CONCURRENT_REQUESTS")
106+
self.browser_launch_lock = asyncio.Lock()
85107
self.context_launch_lock = asyncio.Lock()
86108
self.context_wrappers: Dict[str, BrowserContextWrapper] = {}
87-
self.startup_context_kwargs: dict = settings.getdict("PLAYWRIGHT_CONTEXTS")
88-
if settings.getint("PLAYWRIGHT_MAX_CONTEXTS"):
89-
self.context_semaphore = asyncio.Semaphore(
90-
value=settings.getint("PLAYWRIGHT_MAX_CONTEXTS")
91-
)
92-
93-
self.default_navigation_timeout: Optional[float] = None
94-
if "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT" in settings:
95-
with suppress(TypeError, ValueError):
96-
self.default_navigation_timeout = float(
97-
settings.get("PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT")
98-
)
109+
if self.config.max_contexts:
110+
self.context_semaphore = asyncio.Semaphore(value=self.config.max_contexts)
99111

100112
# headers
101-
if "PLAYWRIGHT_PROCESS_REQUEST_HEADERS" in settings:
102-
if settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"] is None:
113+
if "PLAYWRIGHT_PROCESS_REQUEST_HEADERS" in crawler.settings:
114+
if crawler.settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"] is None:
103115
self.process_request_headers = None
104116
else:
105117
self.process_request_headers = load_object(
106-
settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"]
118+
crawler.settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"]
107119
)
108120
else:
109121
self.process_request_headers = use_scrapy_headers
110122

111123
self.abort_request: Optional[Callable[[PlaywrightRequest], Union[Awaitable, bool]]] = None
112-
if settings.get("PLAYWRIGHT_ABORT_REQUEST"):
113-
self.abort_request = load_object(settings["PLAYWRIGHT_ABORT_REQUEST"])
124+
if crawler.settings.get("PLAYWRIGHT_ABORT_REQUEST"):
125+
self.abort_request = load_object(crawler.settings["PLAYWRIGHT_ABORT_REQUEST"])
114126

115127
@classmethod
116128
def from_crawler(cls: Type[PlaywrightHandler], crawler: Crawler) -> PlaywrightHandler:
@@ -125,35 +137,34 @@ async def _launch(self) -> None:
125137
logger.info("Starting download handler")
126138
self.playwright_context_manager = PlaywrightContextManager()
127139
self.playwright = await self.playwright_context_manager.start()
128-
self.browser_type: BrowserType = getattr(self.playwright, self.browser_type_name)
129-
if self.startup_context_kwargs:
130-
logger.info("Launching %i startup context(s)", len(self.startup_context_kwargs))
140+
self.browser_type: BrowserType = getattr(self.playwright, self.config.browser_type_name)
141+
if self.config.startup_context_kwargs:
142+
logger.info("Launching %i startup context(s)", len(self.config.startup_context_kwargs))
131143
await asyncio.gather(
132144
*[
133145
self._create_browser_context(name=name, context_kwargs=kwargs)
134-
for name, kwargs in self.startup_context_kwargs.items()
146+
for name, kwargs in self.config.startup_context_kwargs.items()
135147
]
136148
)
137149
self._set_max_concurrent_context_count()
138150
logger.info("Startup context(s) launched")
139151
self.stats.set_value("playwright/page_count", self._get_total_page_count())
140-
del self.startup_context_kwargs
141152

142153
async def _maybe_launch_browser(self) -> None:
143154
async with self.browser_launch_lock:
144155
if not hasattr(self, "browser"):
145156
logger.info("Launching browser %s", self.browser_type.name)
146-
self.browser = await self.browser_type.launch(**self.launch_options)
157+
self.browser = await self.browser_type.launch(**self.config.launch_options)
147158
logger.info("Browser %s launched", self.browser_type.name)
148159

149160
async def _maybe_connect_devtools(self) -> None:
150161
async with self.browser_launch_lock:
151162
if not hasattr(self, "browser"):
152-
logger.info("Connecting using CDP: %s", self.browser_cdp_url)
163+
logger.info("Connecting using CDP: %s", self.config.cdp_url)
153164
self.browser = await self.browser_type.connect_over_cdp(
154-
self.browser_cdp_url, **self.browser_cdp_kwargs
165+
self.config.cdp_url, **self.config.cdp_kwargs
155166
)
156-
logger.info("Connected using CDP: %s", self.browser_cdp_url)
167+
logger.info("Connected using CDP: %s", self.config.cdp_url)
157168

158169
async def _create_browser_context(
159170
self,
@@ -171,7 +182,7 @@ async def _create_browser_context(
171182
context = await self.browser_type.launch_persistent_context(**context_kwargs)
172183
persistent = True
173184
remote = False
174-
elif self.browser_cdp_url:
185+
elif self.config.cdp_url:
175186
await self._maybe_connect_devtools()
176187
context = await self.browser.new_context(**context_kwargs)
177188
persistent = False
@@ -200,11 +211,11 @@ async def _create_browser_context(
200211
"remote": remote,
201212
},
202213
)
203-
if self.default_navigation_timeout is not None:
204-
context.set_default_navigation_timeout(self.default_navigation_timeout)
214+
if self.config.navigation_timeout is not None:
215+
context.set_default_navigation_timeout(self.config.navigation_timeout)
205216
self.context_wrappers[name] = BrowserContextWrapper(
206217
context=context,
207-
semaphore=asyncio.Semaphore(value=self.max_pages_per_context),
218+
semaphore=asyncio.Semaphore(value=self.config.max_pages_per_context),
208219
persistent=persistent,
209220
)
210221
self._set_max_concurrent_context_count()
@@ -243,8 +254,8 @@ async def _create_page(self, request: Request, spider: Spider) -> Page:
243254
},
244255
)
245256
self._set_max_concurrent_page_count()
246-
if self.default_navigation_timeout is not None:
247-
page.set_default_navigation_timeout(self.default_navigation_timeout)
257+
if self.config.navigation_timeout is not None:
258+
page.set_default_navigation_timeout(self.config.navigation_timeout)
248259

249260
page.on("close", self._make_close_page_callback(context_name))
250261
page.on("crash", self._make_close_page_callback(context_name))
@@ -444,9 +455,9 @@ async def _handle_download(dwnld: Download) -> None:
444455
response = await page.goto(url=request.url, **page_goto_kwargs)
445456
except PlaywrightError as err:
446457
if not (
447-
self.browser_type_name in ("firefox", "webkit")
458+
self.config.browser_type_name in ("firefox", "webkit")
448459
and "Download is starting" in err.message
449-
or self.browser_type_name == "chromium"
460+
or self.config.browser_type_name == "chromium"
450461
and "net::ERR_ABORTED" in err.message
451462
):
452463
raise
@@ -480,7 +491,7 @@ async def _apply_page_methods(self, page: Page, request: Request, spider: Spider
480491
)
481492
else:
482493
pm.result = await _maybe_await(method(*pm.args, **pm.kwargs))
483-
await page.wait_for_load_state(timeout=self.default_navigation_timeout)
494+
await page.wait_for_load_state(timeout=self.config.navigation_timeout)
484495
else:
485496
logger.warning(
486497
"Ignoring %r: expected PageMethod, got %r",
@@ -577,7 +588,7 @@ async def _request_handler(route: Route, playwright_request: PlaywrightRequest)
577588
else:
578589
overrides["headers"] = final_headers = await _maybe_await(
579590
self.process_request_headers(
580-
self.browser_type_name, playwright_request, headers
591+
self.config.browser_type_name, playwright_request, headers
581592
)
582593
)
583594

tests/tests_asyncio/test_browser_contexts.py

Lines changed: 0 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -15,27 +15,6 @@
1515

1616

1717
class MixinTestCaseMultipleContexts:
18-
@pytest.mark.asyncio
19-
async def test_contexts_max_settings(self):
20-
settings = {
21-
"PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
22-
"PLAYWRIGHT_MAX_PAGES_PER_CONTEXT": 1234,
23-
}
24-
async with make_handler(settings) as handler:
25-
assert handler.max_pages_per_context == 1234
26-
27-
settings = {"PLAYWRIGHT_BROWSER_TYPE": self.browser_type, "CONCURRENT_REQUESTS": 9876}
28-
async with make_handler(settings) as handler:
29-
assert handler.max_pages_per_context == 9876
30-
31-
settings = {"PLAYWRIGHT_BROWSER_TYPE": self.browser_type, "PLAYWRIGHT_MAX_CONTEXTS": None}
32-
async with make_handler(settings) as handler:
33-
assert not hasattr(handler, "context_semaphore")
34-
35-
settings = {"PLAYWRIGHT_BROWSER_TYPE": self.browser_type, "PLAYWRIGHT_MAX_CONTEXTS": 1234}
36-
async with make_handler(settings) as handler:
37-
assert handler.context_semaphore._value == 1234
38-
3918
@pytest.mark.asyncio
4019
async def test_context_kwargs(self):
4120
settings_dict = {

tests/tests_asyncio/test_playwright_requests.py

Lines changed: 0 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -68,41 +68,6 @@ async def test_post_request(self):
6868
assert_correct_response(resp, req)
6969
assert "Request body: foo=bar" in resp.text
7070

71-
@pytest.mark.asyncio
72-
async def test_timeout_value(self):
73-
settings_dict = {
74-
"PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
75-
}
76-
async with make_handler(settings_dict) as handler:
77-
assert handler.default_navigation_timeout is None
78-
79-
settings_dict = {
80-
"PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
81-
"PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": None,
82-
}
83-
async with make_handler(settings_dict) as handler:
84-
assert handler.default_navigation_timeout is None
85-
86-
settings_dict = {
87-
"PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
88-
"PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 0,
89-
}
90-
async with make_handler(settings_dict) as handler:
91-
assert handler.default_navigation_timeout == 0
92-
93-
settings_dict = {
94-
"PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
95-
"PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 123,
96-
}
97-
async with make_handler(settings_dict) as handler:
98-
assert handler.default_navigation_timeout == 123
99-
settings_dict = {
100-
"PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
101-
"PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 0.5,
102-
}
103-
async with make_handler(settings_dict) as handler:
104-
assert handler.default_navigation_timeout == 0.5
105-
10671
@pytest.mark.asyncio
10772
async def test_timeout_error(self):
10873
settings_dict = {
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
from unittest import IsolatedAsyncioTestCase
2+
3+
from scrapy.settings import Settings
4+
5+
from scrapy_playwright.handler import Config
6+
7+
from tests import make_handler
8+
9+
10+
class TestSettings(IsolatedAsyncioTestCase):
    """Checks how Playwright-related Scrapy settings end up in the configuration."""

    async def test_settings_timeout_value(self):
        """The timeout is None when unset or None, otherwise the configured number."""
        assert Config.from_settings(Settings({})).navigation_timeout is None

        explicit_none = Config.from_settings(
            Settings({"PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": None})
        )
        assert explicit_none.navigation_timeout is None

        for value in (0, 123, 0.5):
            parsed = Config.from_settings(
                Settings({"PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": value})
            )
            assert parsed.navigation_timeout == value

    async def test_max_pages_per_context(self):
        """The page limit comes from its own setting, else from CONCURRENT_REQUESTS."""
        cases = (
            ("PLAYWRIGHT_MAX_PAGES_PER_CONTEXT", 1234),
            ("CONCURRENT_REQUESTS", 9876),
        )
        for setting_name, expected in cases:
            parsed = Config.from_settings(Settings({setting_name: expected}))
            assert parsed.max_pages_per_context == expected

    async def test_max_contexts(self):
        """A context semaphore exists only when a maximum context count is set."""
        async with make_handler({"PLAYWRIGHT_MAX_CONTEXTS": None}) as unlimited:
            assert not hasattr(unlimited, "context_semaphore")

        async with make_handler({"PLAYWRIGHT_MAX_CONTEXTS": 1234}) as limited:
            assert limited.context_semaphore._value == 1234

tests/tests_asyncio/test_utils.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from playwright.async_api import Error as PlaywrightError
77
from scrapy import Spider
88
from scrapy.http.headers import Headers
9+
910
from scrapy_playwright._utils import (
1011
_NAVIGATION_ERROR_MSG,
1112
_encode_body,

0 commit comments

Comments
 (0)