2323from scrapy .http import Request , Response
2424from scrapy .http .headers import Headers
2525from scrapy .responsetypes import responsetypes
26+ from scrapy .settings import Settings
2627from scrapy .utils .defer import deferred_from_coro
2728from scrapy .utils .misc import load_object
2829from scrapy .utils .reactor import verify_installed_reactor
@@ -60,57 +61,68 @@ class BrowserContextWrapper:
6061 persistent : bool
6162
6263
64+ @dataclass
65+ class Config :
66+ cdp_url : Optional [str ]
67+ cdp_kwargs : dict
68+ browser_type_name : str
69+ launch_options : dict
70+ max_pages_per_context : int
71+ max_contexts : Optional [int ]
72+ startup_context_kwargs : dict
73+ navigation_timeout : Optional [float ] = None
74+
75+ @classmethod
76+ def from_settings (cls , settings : Settings ) -> "Config" :
77+ cfg = cls (
78+ cdp_url = settings .get ("PLAYWRIGHT_CDP_URL" ),
79+ cdp_kwargs = settings .getdict ("PLAYWRIGHT_CDP_KWARGS" ) or {},
80+ browser_type_name = settings .get ("PLAYWRIGHT_BROWSER_TYPE" ) or DEFAULT_BROWSER_TYPE ,
81+ launch_options = settings .getdict ("PLAYWRIGHT_LAUNCH_OPTIONS" ) or {},
82+ max_pages_per_context = settings .getint ("PLAYWRIGHT_MAX_PAGES_PER_CONTEXT" ),
83+ max_contexts = settings .getint ("PLAYWRIGHT_MAX_CONTEXTS" ) or None ,
84+ startup_context_kwargs = settings .getdict ("PLAYWRIGHT_CONTEXTS" ),
85+ )
86+ cfg .cdp_kwargs .pop ("endpoint_url" , None )
87+ if not cfg .max_pages_per_context :
88+ cfg .max_pages_per_context = settings .getint ("CONCURRENT_REQUESTS" )
89+ if cfg .cdp_url and cfg .launch_options :
90+ logger .warning ("PLAYWRIGHT_CDP_URL is set, ignoring PLAYWRIGHT_LAUNCH_OPTIONS" )
91+ if "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT" in settings :
92+ with suppress (TypeError , ValueError ):
93+ cfg .navigation_timeout = float (settings ["PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT" ])
94+ return cfg
95+
96+
6397class ScrapyPlaywrightDownloadHandler (HTTPDownloadHandler ):
6498 def __init__ (self , crawler : Crawler ) -> None :
65- settings = crawler .settings
66- super ().__init__ (settings = settings , crawler = crawler )
99+ super ().__init__ (settings = crawler .settings , crawler = crawler )
67100 verify_installed_reactor ("twisted.internet.asyncioreactor.AsyncioSelectorReactor" )
68101 crawler .signals .connect (self ._engine_started , signals .engine_started )
69102 self .stats = crawler .stats
70103
71- # browser
72- self .browser_cdp_url = settings .get ("PLAYWRIGHT_CDP_URL" )
73- self .browser_cdp_kwargs = settings .get ("PLAYWRIGHT_CDP_KWARGS" ) or {}
74- self .browser_cdp_kwargs .pop ("endpoint_url" , None )
75- self .browser_type_name = settings .get ("PLAYWRIGHT_BROWSER_TYPE" ) or DEFAULT_BROWSER_TYPE
76- self .browser_launch_lock = asyncio .Lock ()
77- self .launch_options : dict = settings .getdict ("PLAYWRIGHT_LAUNCH_OPTIONS" ) or {}
78- if self .browser_cdp_url and self .launch_options :
79- logger .warning ("PLAYWRIGHT_CDP_URL is set, ignoring PLAYWRIGHT_LAUNCH_OPTIONS" )
104+ self .config = Config .from_settings (crawler .settings )
80105
81- # contexts
82- self .max_pages_per_context : int = settings .getint (
83- "PLAYWRIGHT_MAX_PAGES_PER_CONTEXT"
84- ) or settings .getint ("CONCURRENT_REQUESTS" )
106+ self .browser_launch_lock = asyncio .Lock ()
85107 self .context_launch_lock = asyncio .Lock ()
86108 self .context_wrappers : Dict [str , BrowserContextWrapper ] = {}
87- self .startup_context_kwargs : dict = settings .getdict ("PLAYWRIGHT_CONTEXTS" )
88- if settings .getint ("PLAYWRIGHT_MAX_CONTEXTS" ):
89- self .context_semaphore = asyncio .Semaphore (
90- value = settings .getint ("PLAYWRIGHT_MAX_CONTEXTS" )
91- )
92-
93- self .default_navigation_timeout : Optional [float ] = None
94- if "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT" in settings :
95- with suppress (TypeError , ValueError ):
96- self .default_navigation_timeout = float (
97- settings .get ("PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT" )
98- )
109+ if self .config .max_contexts :
110+ self .context_semaphore = asyncio .Semaphore (value = self .config .max_contexts )
99111
100112 # headers
101- if "PLAYWRIGHT_PROCESS_REQUEST_HEADERS" in settings :
102- if settings ["PLAYWRIGHT_PROCESS_REQUEST_HEADERS" ] is None :
113+ if "PLAYWRIGHT_PROCESS_REQUEST_HEADERS" in crawler . settings :
114+ if crawler . settings ["PLAYWRIGHT_PROCESS_REQUEST_HEADERS" ] is None :
103115 self .process_request_headers = None
104116 else :
105117 self .process_request_headers = load_object (
106- settings ["PLAYWRIGHT_PROCESS_REQUEST_HEADERS" ]
118+ crawler . settings ["PLAYWRIGHT_PROCESS_REQUEST_HEADERS" ]
107119 )
108120 else :
109121 self .process_request_headers = use_scrapy_headers
110122
111123 self .abort_request : Optional [Callable [[PlaywrightRequest ], Union [Awaitable , bool ]]] = None
112- if settings .get ("PLAYWRIGHT_ABORT_REQUEST" ):
113- self .abort_request = load_object (settings ["PLAYWRIGHT_ABORT_REQUEST" ])
124+ if crawler . settings .get ("PLAYWRIGHT_ABORT_REQUEST" ):
125+ self .abort_request = load_object (crawler . settings ["PLAYWRIGHT_ABORT_REQUEST" ])
114126
115127 @classmethod
116128 def from_crawler (cls : Type [PlaywrightHandler ], crawler : Crawler ) -> PlaywrightHandler :
@@ -125,35 +137,34 @@ async def _launch(self) -> None:
125137 logger .info ("Starting download handler" )
126138 self .playwright_context_manager = PlaywrightContextManager ()
127139 self .playwright = await self .playwright_context_manager .start ()
128- self .browser_type : BrowserType = getattr (self .playwright , self .browser_type_name )
129- if self .startup_context_kwargs :
130- logger .info ("Launching %i startup context(s)" , len (self .startup_context_kwargs ))
140+ self .browser_type : BrowserType = getattr (self .playwright , self .config . browser_type_name )
141+ if self .config . startup_context_kwargs :
142+ logger .info ("Launching %i startup context(s)" , len (self .config . startup_context_kwargs ))
131143 await asyncio .gather (
132144 * [
133145 self ._create_browser_context (name = name , context_kwargs = kwargs )
134- for name , kwargs in self .startup_context_kwargs .items ()
146+ for name , kwargs in self .config . startup_context_kwargs .items ()
135147 ]
136148 )
137149 self ._set_max_concurrent_context_count ()
138150 logger .info ("Startup context(s) launched" )
139151 self .stats .set_value ("playwright/page_count" , self ._get_total_page_count ())
140- del self .startup_context_kwargs
141152
142153 async def _maybe_launch_browser (self ) -> None :
143154 async with self .browser_launch_lock :
144155 if not hasattr (self , "browser" ):
145156 logger .info ("Launching browser %s" , self .browser_type .name )
146- self .browser = await self .browser_type .launch (** self .launch_options )
157+ self .browser = await self .browser_type .launch (** self .config . launch_options )
147158 logger .info ("Browser %s launched" , self .browser_type .name )
148159
149160 async def _maybe_connect_devtools (self ) -> None :
150161 async with self .browser_launch_lock :
151162 if not hasattr (self , "browser" ):
152- logger .info ("Connecting using CDP: %s" , self .browser_cdp_url )
163+ logger .info ("Connecting using CDP: %s" , self .config . cdp_url )
153164 self .browser = await self .browser_type .connect_over_cdp (
154- self .browser_cdp_url , ** self .browser_cdp_kwargs
165+ self .config . cdp_url , ** self .config . cdp_kwargs
155166 )
156- logger .info ("Connected using CDP: %s" , self .browser_cdp_url )
167+ logger .info ("Connected using CDP: %s" , self .config . cdp_url )
157168
158169 async def _create_browser_context (
159170 self ,
@@ -171,7 +182,7 @@ async def _create_browser_context(
171182 context = await self .browser_type .launch_persistent_context (** context_kwargs )
172183 persistent = True
173184 remote = False
174- elif self .browser_cdp_url :
185+ elif self .config . cdp_url :
175186 await self ._maybe_connect_devtools ()
176187 context = await self .browser .new_context (** context_kwargs )
177188 persistent = False
@@ -200,11 +211,11 @@ async def _create_browser_context(
200211 "remote" : remote ,
201212 },
202213 )
203- if self .default_navigation_timeout is not None :
204- context .set_default_navigation_timeout (self .default_navigation_timeout )
214+ if self .config . navigation_timeout is not None :
215+ context .set_default_navigation_timeout (self .config . navigation_timeout )
205216 self .context_wrappers [name ] = BrowserContextWrapper (
206217 context = context ,
207- semaphore = asyncio .Semaphore (value = self .max_pages_per_context ),
218+ semaphore = asyncio .Semaphore (value = self .config . max_pages_per_context ),
208219 persistent = persistent ,
209220 )
210221 self ._set_max_concurrent_context_count ()
@@ -243,8 +254,8 @@ async def _create_page(self, request: Request, spider: Spider) -> Page:
243254 },
244255 )
245256 self ._set_max_concurrent_page_count ()
246- if self .default_navigation_timeout is not None :
247- page .set_default_navigation_timeout (self .default_navigation_timeout )
257+ if self .config . navigation_timeout is not None :
258+ page .set_default_navigation_timeout (self .config . navigation_timeout )
248259
249260 page .on ("close" , self ._make_close_page_callback (context_name ))
250261 page .on ("crash" , self ._make_close_page_callback (context_name ))
@@ -444,9 +455,9 @@ async def _handle_download(dwnld: Download) -> None:
444455 response = await page .goto (url = request .url , ** page_goto_kwargs )
445456 except PlaywrightError as err :
446457 if not (
447- self .browser_type_name in ("firefox" , "webkit" )
458+ self .config . browser_type_name in ("firefox" , "webkit" )
448459 and "Download is starting" in err .message
449- or self .browser_type_name == "chromium"
460+ or self .config . browser_type_name == "chromium"
450461 and "net::ERR_ABORTED" in err .message
451462 ):
452463 raise
@@ -480,7 +491,7 @@ async def _apply_page_methods(self, page: Page, request: Request, spider: Spider
480491 )
481492 else :
482493 pm .result = await _maybe_await (method (* pm .args , ** pm .kwargs ))
483- await page .wait_for_load_state (timeout = self .default_navigation_timeout )
494+ await page .wait_for_load_state (timeout = self .config . navigation_timeout )
484495 else :
485496 logger .warning (
486497 "Ignoring %r: expected PageMethod, got %r" ,
@@ -577,7 +588,7 @@ async def _request_handler(route: Route, playwright_request: PlaywrightRequest)
577588 else :
578589 overrides ["headers" ] = final_headers = await _maybe_await (
579590 self .process_request_headers (
580- self .browser_type_name , playwright_request , headers
591+ self .config . browser_type_name , playwright_request , headers
581592 )
582593 )
583594
0 commit comments