@@ -1,6 +1,8 @@
-from scrapy.core.downloader.handlers.http11 import TunnelingAgent, TunnelingTCP4ClientEndpoint, ScrapyAgent, HTTP11DownloadHandler
+import re
+from scrapy.core.downloader.handlers.http11 import TunnelingAgent, TunnelingTCP4ClientEndpoint, ScrapyAgent
 from scrapy.core.downloader.webclient import _parse
 from scrapy.utils.python import to_bytes
+from scrapy.http import Headers, Response
 
 def tunnel_request_data_with_headers(host: str, port: int, **proxy_headers) -> bytes:
     r"""
@@ -54,6 +56,19 @@ def requestTunnel(self, protocol):
         protocol.dataReceived = self.processProxyResponse  # type: ignore[method-assign]
         self._protocol = protocol
         return protocol
+
+    def processProxyResponse(self, data: bytes):
+        # data may carry proxy response headers, e.g.
+        # b'HTTP/1.1 200 Connection established\r\nProxy-Header: VALUE\r\n\r\n'
+        response_headers = {}
+
+        for line in data.split(b'\r\n'):
+            if b':' in line:
+                key, val = line.split(b':', 1)
+                response_headers[key.strip()] = val.strip()
+        # save for the endpoint & agent
+        self._proxy_response_headers = Headers(response_headers)
+        return super(TunnelingHeadersTCP4ClientEndpoint, self).processProxyResponse(data)
 
 class TunnelingHeadersAgent(TunnelingAgent):
     """An agent that uses a L{TunnelingTCP4ClientEndpoint} to make HTTPS
@@ -70,7 +85,8 @@ def set_proxy_headers(self, proxy_headers):
         self._proxy_headers = proxy_headers
 
     def _getEndpoint(self, uri):
-        return TunnelingHeadersTCP4ClientEndpoint(
+        # save endpoint for agent to get proxy_response_headers
+        self._endpoint = TunnelingHeadersTCP4ClientEndpoint(
             reactor=self._reactor,
             host=uri.host,
             port=uri.port,
@@ -80,31 +96,32 @@ def _getEndpoint(self, uri):
             bindAddress=self._endpointFactory._bindAddress,
             **self._proxy_headers
         )
+        return self._endpoint
 
 class ScrapyProxyHeadersAgent(ScrapyAgent):
     _TunnelingAgent = TunnelingHeadersAgent
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._agent = None
 
     def _get_agent(self, request, timeout: float):
-        agent = super()._get_agent(request, timeout)
+        self._agent = super()._get_agent(request, timeout)
 
         proxy = request.meta.get("proxy")
         proxy_headers = request.meta.get('proxy_headers')
         if proxy and proxy_headers:
             scheme = _parse(request.url)[0]
             if scheme == b"https":
-                agent.set_proxy_headers(proxy_headers)
+                self._agent.set_proxy_headers(proxy_headers)
 
-        return agent
+        return self._agent
 
-class HTTP11ProxyDownloadHandler(HTTP11DownloadHandler):
-    def download_request(self, request, spider):
-        """Return a deferred for the HTTP download"""
-        agent = ScrapyProxyHeadersAgent(
-            contextFactory=self._contextFactory,
-            pool=self._pool,
-            maxsize=getattr(spider, "download_maxsize", self._default_maxsize),
-            warnsize=getattr(spider, "download_warnsize", self._default_warnsize),
-            fail_on_dataloss=self._fail_on_dataloss,
-            crawler=self._crawler,
-        )
-        return agent.download_request(request)
+    def _cb_bodydone(self, result, request, url: str):
+        r = super()._cb_bodydone(result, request, url)
+        if isinstance(r, Response):
+            if self._agent and hasattr(self._agent, '_endpoint'):
+                proxy_response_headers = getattr(self._agent._endpoint, '_proxy_response_headers', None)
+                if proxy_response_headers:
+                    r.headers.update(proxy_response_headers)
+        return r
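
A minimal standalone sketch of the header-parsing step in processProxyResponse above; the sample bytes and the parse_connect_response helper name are illustrative, not part of the patch:

    from scrapy.http import Headers

    def parse_connect_response(data: bytes) -> Headers:
        # Split the raw CONNECT response on CRLF and keep "Key: Value" lines;
        # the status line and the blank terminator carry no colon and are skipped.
        headers = {}
        for line in data.split(b'\r\n'):
            if b':' in line:
                key, val = line.split(b':', 1)
                headers[key.strip()] = val.strip()
        return Headers(headers)

    raw = b'HTTP/1.1 200 Connection established\r\nX-Proxy-Id: abc123\r\n\r\n'
    print(parse_connect_response(raw).get('X-Proxy-Id'))  # b'abc123'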
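And a sketch of how a crawl could exercise this end to end, assuming a download handler elsewhere in the branch constructs ScrapyProxyHeadersAgent (the HTTP11ProxyDownloadHandler shown removed above used to do that wiring); the proxy URL, credentials, and the X-Proxy-Id header are placeholders:

    import scrapy

    class ProxyHeadersDemoSpider(scrapy.Spider):
        name = 'proxy_headers_demo'

        def start_requests(self):
            yield scrapy.Request(
                'https://example.com',
                meta={
                    # the CONNECT request is sent through this proxy ...
                    'proxy': 'http://127.0.0.1:8888',
                    # ... carrying these extra headers
                    'proxy_headers': {'Proxy-Authorization': 'Basic dXNlcjpwYXNz'},
                },
            )

        def parse(self, response):
            # _cb_bodydone merged any headers from the proxy's CONNECT
            # response into the final response headers
            self.logger.info(response.headers.get('X-Proxy-Id'))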