22import logging
33import warnings
44from contextlib import suppress
5+ from dataclasses import dataclass
56from ipaddress import ip_address
67from time import time
78from typing import Awaitable , Callable , Dict , Generator , Optional , Tuple , Type , TypeVar , Union
89
910from playwright .async_api import (
11+ Browser ,
1012 BrowserContext ,
13+ BrowserType ,
1114 Page ,
1215 PlaywrightContextManager ,
1316 Request as PlaywrightRequest ,
logger = logging.getLogger("scrapy-playwright")


# Browser engine used when PLAYWRIGHT_BROWSER_TYPE is unset or empty.
DEFAULT_BROWSER_TYPE = "chromium"
# Context name given to requests that do not set "playwright_context".
DEFAULT_CONTEXT_NAME = "default"
# Context kwarg that, when set to a truthy value, makes the context persistent.
PERSISTENT_CONTEXT_PATH_KEY = "user_data_dir"
50+
51+
@dataclass
class BrowserContextWrapper:
    """Bundle a Playwright browser context with the handler's bookkeeping for it."""

    # The underlying Playwright context in which pages are opened.
    context: BrowserContext
    # Acquired before a page is created in this context, released when a page closes.
    semaphore: asyncio.Semaphore
    # True when the context was started through launch_persistent_context.
    persistent: bool
57+
58+
4459class ScrapyPlaywrightDownloadHandler (HTTPDownloadHandler ):
4560 def __init__ (self , crawler : Crawler ) -> None :
46- super ().__init__ (settings = crawler .settings , crawler = crawler )
61+ settings = crawler .settings
62+ super ().__init__ (settings = settings , crawler = crawler )
4763 verify_installed_reactor ("twisted.internet.asyncioreactor.AsyncioSelectorReactor" )
4864 crawler .signals .connect (self ._engine_started , signals .engine_started )
4965 self .stats = crawler .stats
5066
51- self .browser_type : str = crawler .settings .get ("PLAYWRIGHT_BROWSER_TYPE" ) or "chromium"
52- self .max_pages_per_context : int = crawler .settings .getint (
67+ self .browser_launch_lock = asyncio .Lock ()
68+ self .context_launch_lock = asyncio .Lock ()
69+ self .browser_type_name = settings .get ("PLAYWRIGHT_BROWSER_TYPE" ) or DEFAULT_BROWSER_TYPE
70+ self .max_pages_per_context : int = settings .getint (
5371 "PLAYWRIGHT_MAX_PAGES_PER_CONTEXT"
54- ) or crawler . settings .getint ("CONCURRENT_REQUESTS" )
55- self .launch_options : dict = crawler . settings .getdict ("PLAYWRIGHT_LAUNCH_OPTIONS" ) or {}
72+ ) or settings .getint ("CONCURRENT_REQUESTS" )
73+ self .launch_options : dict = settings .getdict ("PLAYWRIGHT_LAUNCH_OPTIONS" ) or {}
5674
5775 self .default_navigation_timeout : Optional [float ] = None
58- if "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT" in crawler . settings :
76+ if "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT" in settings :
5977 with suppress (TypeError , ValueError ):
6078 self .default_navigation_timeout = float (
61- crawler . settings .get ("PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT" )
79+ settings .get ("PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT" )
6280 )
6381
64- if "PLAYWRIGHT_PROCESS_REQUEST_HEADERS" in crawler .settings :
65- if crawler .settings ["PLAYWRIGHT_PROCESS_REQUEST_HEADERS" ] is None :
66- self .process_request_headers = None # use headers from the Playwright request
82+ # header-related settings
83+ if "PLAYWRIGHT_PROCESS_REQUEST_HEADERS" in settings :
84+ if settings ["PLAYWRIGHT_PROCESS_REQUEST_HEADERS" ] is None :
85+ self .process_request_headers = None
6786 else :
6887 self .process_request_headers = load_object (
69- crawler . settings ["PLAYWRIGHT_PROCESS_REQUEST_HEADERS" ]
88+ settings ["PLAYWRIGHT_PROCESS_REQUEST_HEADERS" ]
7089 )
7190 if self .process_request_headers is use_playwright_headers :
7291 warnings .warn (
@@ -80,70 +99,94 @@ def __init__(self, crawler: Crawler) -> None:
8099 else :
81100 self .process_request_headers = use_scrapy_headers
82101
83- self . context_kwargs : dict = crawler . settings . getdict ( "PLAYWRIGHT_CONTEXTS" )
84- self .contexts : Dict [str , BrowserContext ] = {}
85- self .context_semaphores : Dict [ str , asyncio . Semaphore ] = {}
102+ # context-related settings
103+ self .contexts : Dict [str , BrowserContextWrapper ] = {}
104+ self .context_kwargs : dict = settings . getdict ( "PLAYWRIGHT_CONTEXTS" )
86105
87106 self .abort_request : Optional [Callable [[PlaywrightRequest ], Union [Awaitable , bool ]]] = None
88- if crawler . settings .get ("PLAYWRIGHT_ABORT_REQUEST" ):
89- self .abort_request = load_object (crawler . settings ["PLAYWRIGHT_ABORT_REQUEST" ])
107+ if settings .get ("PLAYWRIGHT_ABORT_REQUEST" ):
108+ self .abort_request = load_object (settings ["PLAYWRIGHT_ABORT_REQUEST" ])
90109
91110 @classmethod
92111 def from_crawler (cls : Type [PlaywrightHandler ], crawler : Crawler ) -> PlaywrightHandler :
93112 return cls (crawler )
94113
95114 def _engine_started (self ) -> Deferred :
96115 """Launch the browser. Use the engine_started signal as it supports returning deferreds."""
97- return deferred_from_coro (self ._launch_browser ())
116+ return deferred_from_coro (self ._launch ())
98117
99- async def _launch_browser (self ) -> None :
118+ async def _launch (self ) -> None :
119+ """Launch Playwright manager and configured startup context(s)."""
120+ logger .info ("Starting download handler" )
100121 self .playwright_context_manager = PlaywrightContextManager ()
101122 self .playwright = await self .playwright_context_manager .start ()
102- logger .info ("Launching browser" )
103- browser_launcher = getattr (self .playwright , self .browser_type ).launch
104- self .browser = await browser_launcher (** self .launch_options )
105- logger .info (f"Browser { self .browser_type } launched" )
106- contexts = await asyncio .gather (
107- * [
108- self ._create_browser_context (name , kwargs )
109- for name , kwargs in self .context_kwargs .items ()
110- ]
111- )
112- self .contexts = dict (zip (self .context_kwargs .keys (), contexts ))
113- self .context_semaphores = {
114- name : asyncio .Semaphore (value = self .max_pages_per_context ) for name in self .contexts
115- }
116-
117- async def _create_browser_context (self , name : str , context_kwargs : dict ) -> BrowserContext :
118- context = await self .browser .new_context (** context_kwargs )
119- context .on ("close" , self ._make_close_browser_context_callback (name ))
120- logger .debug ("Browser context started: '%s'" , name )
123+ self .browser_type : BrowserType = getattr (self .playwright , self .browser_type_name )
124+ if self .context_kwargs :
125+ logger .info (f"Launching { len (self .context_kwargs )} startup context(s)" )
126+ contexts = await asyncio .gather (
127+ * [
128+ self ._create_browser_context (name = name , context_kwargs = kwargs )
129+ for name , kwargs in self .context_kwargs .items ()
130+ ]
131+ )
132+ self .contexts = dict (zip (self .context_kwargs .keys (), contexts ))
133+ logger .info ("Startup context(s) launched" )
134+ self .stats .set_value ("playwright/page_count" , self ._get_total_page_count ())
135+
136+ async def _maybe_launch_browser (self ) -> None :
137+ async with self .browser_launch_lock :
138+ if not hasattr (self , "browser" ):
139+ logger .info (f"Launching browser { self .browser_type .name } " )
140+ self .browser : Browser = await self .browser_type .launch (** self .launch_options )
141+ logger .info (f"Browser { self .browser_type .name } launched" )
142+
143+ async def _create_browser_context (
144+ self , name : str , context_kwargs : Optional [dict ]
145+ ) -> BrowserContextWrapper :
146+ """Create a new context, also launching a browser if necessary."""
147+ context_kwargs = context_kwargs or {}
148+ if context_kwargs .get (PERSISTENT_CONTEXT_PATH_KEY ):
149+ context = await self .browser_type .launch_persistent_context (** context_kwargs )
150+ persistent = True
151+ self .stats .inc_value ("playwright/context_count/persistent" )
152+ else :
153+ await self ._maybe_launch_browser ()
154+ context = await self .browser .new_context (** context_kwargs )
155+ persistent = False
156+ self .stats .inc_value ("playwright/context_count/non-persistent" )
157+ context .on ("close" , self ._make_close_browser_context_callback (name , persistent ))
158+ logger .debug (f"Browser context started: '{ name } ' (persistent={ persistent } )" )
121159 self .stats .inc_value ("playwright/context_count" )
122160 if self .default_navigation_timeout is not None :
123161 context .set_default_navigation_timeout (self .default_navigation_timeout )
124- return context
162+ return BrowserContextWrapper (
163+ context = context ,
164+ semaphore = asyncio .Semaphore (value = self .max_pages_per_context ),
165+ persistent = persistent ,
166+ )
125167
126168 async def _create_page (self , request : Request ) -> Page :
127169 """Create a new page in a context, also creating a new context if necessary."""
128- context_name = request .meta .setdefault ("playwright_context" , "default" )
129- context = self . contexts . get ( context_name )
130- if context is None :
131- context_kwargs = request . meta . get ( "playwright_context_kwargs" ) or {}
132- context = await self ._create_browser_context (context_name , context_kwargs )
133- self . contexts [ context_name ] = context
134- self .context_semaphores [context_name ] = asyncio . Semaphore (
135- value = self . max_pages_per_context
136- )
170+ context_name = request .meta .setdefault ("playwright_context" , DEFAULT_CONTEXT_NAME )
171+ # this block needs to be locked because several attempts to launch a context
172+ # with the same name could happen at the same time from different requests
173+ async with self . context_launch_lock :
174+ context = self .contexts . get (context_name )
175+ if context is None :
176+ context = self .contexts [context_name ] = await self . _create_browser_context (
177+ name = context_name , context_kwargs = request . meta . get ( "playwright_context_kwargs" )
178+ )
137179
138- await self . context_semaphores [ context_name ] .acquire ()
139- page = await context .new_page ()
180+ await context . semaphore .acquire ()
181+ page = await context .context . new_page ()
140182 self .stats .inc_value ("playwright/page_count" )
141183 logger .debug (
142184 "[Context=%s] New page created, page count is %i (%i for all contexts)" ,
143185 context_name ,
144- len (context .pages ),
186+ len (context .context . pages ),
145187 self ._get_total_page_count (),
146188 )
189+ self ._set_max_concurrent_page_count ()
147190 if self .default_navigation_timeout is not None :
148191 page .set_default_navigation_timeout (self .default_navigation_timeout )
149192
@@ -157,21 +200,24 @@ async def _create_page(self, request: Request) -> Page:
157200 return page
158201
159202 def _get_total_page_count (self ):
160- count = sum ([len (context .pages ) for context in self .contexts .values ()])
203+ return sum ([len (ctx .context .pages ) for ctx in self .contexts .values ()])
204+
205+ def _set_max_concurrent_page_count (self ):
206+ count = self ._get_total_page_count ()
161207 current_max_count = self .stats .get_value ("playwright/page_count/max_concurrent" )
162208 if current_max_count is None or count > current_max_count :
163209 self .stats .set_value ("playwright/page_count/max_concurrent" , count )
164- return count
165210
166211 @inlineCallbacks
167212 def close (self ) -> Deferred :
213+ logger .info ("Closing download handler" )
168214 yield super ().close ()
169215 yield deferred_from_coro (self ._close ())
170216
171217 async def _close (self ) -> None :
218+ await asyncio .gather (* [ctx .context .close () for ctx in self .contexts .values ()])
172219 self .contexts .clear ()
173- self .context_semaphores .clear ()
174- if getattr (self , "browser" , None ):
220+ if hasattr (self , "browser" ):
175221 logger .info ("Closing browser" )
176222 await self .browser .close ()
177223 await self .playwright_context_manager .__aexit__ ()
@@ -302,18 +348,16 @@ def _increment_response_stats(self, response: PlaywrightResponse) -> None:
302348
303349 def _make_close_page_callback (self , context_name : str ) -> Callable :
304350 def close_page_callback () -> None :
305- if context_name in self .context_semaphores :
306- self .context_semaphores [context_name ].release ()
351+ if context_name in self .contexts :
352+ self .contexts [context_name ]. semaphore .release ()
307353
308354 return close_page_callback
309355
310- def _make_close_browser_context_callback (self , name : str ) -> Callable :
356+ def _make_close_browser_context_callback (self , name : str , persistent : bool ) -> Callable :
311357 def close_browser_context_callback () -> None :
312- logger .debug ("Browser context closed: '%s'" , name )
358+ logger .debug (f "Browser context closed: '{ name } ' (persistent= { persistent } )" )
313359 if name in self .contexts :
314360 self .contexts .pop (name )
315- if name in self .context_semaphores :
316- self .context_semaphores .pop (name )
317361
318362 return close_browser_context_callback
319363
@@ -334,7 +378,7 @@ async def _request_handler(route: Route, playwright_request: PlaywrightRequest)
334378 if self .process_request_headers is not None :
335379 overrides ["headers" ] = await _maybe_await (
336380 self .process_request_headers (
337- self .browser_type , playwright_request , scrapy_headers
381+ self .browser_type_name , playwright_request , scrapy_headers
338382 )
339383 )
340384 # the request that reaches the callback should contain the final headers
0 commit comments