Commit 613ee75 (parent bf4cb5d)

parse proxy response headers into response headers, version 0.1.3

2 files changed, 35 insertions(+), 18 deletions(-)

pyproject.toml (1 addition, 1 deletion)

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "scrapy-proxy-headers"
-version = "0.1.2"
+version = "0.1.3"
 authors = [
   { name="ProxyMesh", email="support@proxymesh.com" },
 ]

scrapy_proxy_headers/agent.py (34 additions, 17 deletions)
@@ -1,6 +1,8 @@
-from scrapy.core.downloader.handlers.http11 import TunnelingAgent, TunnelingTCP4ClientEndpoint, ScrapyAgent, HTTP11DownloadHandler
+import re
+from scrapy.core.downloader.handlers.http11 import TunnelingAgent, TunnelingTCP4ClientEndpoint, ScrapyAgent
 from scrapy.core.downloader.webclient import _parse
 from scrapy.utils.python import to_bytes
+from scrapy.http import Headers, Response
 
 def tunnel_request_data_with_headers(host: str, port: int, **proxy_headers) -> bytes:
     r"""
@@ -54,6 +56,19 @@ def requestTunnel(self, protocol):
         protocol.dataReceived = self.processProxyResponse  # type: ignore[method-assign]
         self._protocol = protocol
         return protocol
+
+    def processProxyResponse(self, data: bytes):
+        # data may contain proxy headers, e.g.
+        # b'HTTP/1.1 200 Connection established\r\nProxy-Header: VALUE\r\n\r\n'
+        response_headers = {}
+
+        for line in data.split(b'\r\n'):
+            if b':' in line:
+                key, val = line.split(b':', 1)
+                response_headers[key.strip()] = val.strip()
+        # save for endpoint & agent
+        self._proxy_response_headers = Headers(response_headers)
+        return super(TunnelingHeadersTCP4ClientEndpoint, self).processProxyResponse(data)
 
 class TunnelingHeadersAgent(TunnelingAgent):
     """An agent that uses a L{TunnelingTCP4ClientEndpoint} to make HTTPS
@@ -70,7 +85,8 @@ def set_proxy_headers(self, proxy_headers):
         self._proxy_headers = proxy_headers
 
     def _getEndpoint(self, uri):
-        return TunnelingHeadersTCP4ClientEndpoint(
+        # save endpoint for agent to get proxy_response_headers
+        self._endpoint = TunnelingHeadersTCP4ClientEndpoint(
             reactor=self._reactor,
             host=uri.host,
             port=uri.port,
@@ -80,31 +96,32 @@ def _getEndpoint(self, uri):
             bindAddress=self._endpointFactory._bindAddress,
             **self._proxy_headers
         )
+        return self._endpoint
 
 class ScrapyProxyHeadersAgent(ScrapyAgent):
     _TunnelingAgent = TunnelingHeadersAgent
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._agent = None
 
     def _get_agent(self, request, timeout: float):
-        agent = super()._get_agent(request, timeout)
+        self._agent = super()._get_agent(request, timeout)
 
         proxy = request.meta.get("proxy")
         proxy_headers = request.meta.get('proxy_headers')
         if proxy and proxy_headers:
             scheme = _parse(request.url)[0]
             if scheme == b"https":
-                agent.set_proxy_headers(proxy_headers)
+                self._agent.set_proxy_headers(proxy_headers)
 
-        return agent
+        return self._agent
 
-class HTTP11ProxyDownloadHandler(HTTP11DownloadHandler):
-    def download_request(self, request, spider):
-        """Return a deferred for the HTTP download"""
-        agent = ScrapyProxyHeadersAgent(
-            contextFactory=self._contextFactory,
-            pool=self._pool,
-            maxsize=getattr(spider, "download_maxsize", self._default_maxsize),
-            warnsize=getattr(spider, "download_warnsize", self._default_warnsize),
-            fail_on_dataloss=self._fail_on_dataloss,
-            crawler=self._crawler,
-        )
-        return agent.download_request(request)
+    def _cb_bodydone(self, result, request, url: str):
+        r = super()._cb_bodydone(result, request, url)
+        if isinstance(r, Response):
+            if self._agent and hasattr(self._agent, '_endpoint'):
+                proxy_response_headers = getattr(self._agent._endpoint, '_proxy_response_headers', None)
+                if proxy_response_headers:
+                    r.headers.update(proxy_response_headers)
+        return r
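
For reference, the parsing step added in processProxyResponse can be exercised on its own. A minimal standalone sketch of the same split-on-CRLF approach; the raw bytes and the X-ProxyMesh-IP header are hypothetical sample values, not taken from this diff:

# Sketch of the parsing approach used above, on a hypothetical CONNECT response.
raw = b'HTTP/1.1 200 Connection established\r\nX-ProxyMesh-IP: 1.2.3.4\r\n\r\n'

response_headers = {}
for line in raw.split(b'\r\n'):
    if b':' in line:                    # header lines contain a colon
        key, val = line.split(b':', 1)  # split only on the first colon
        response_headers[key.strip()] = val.strip()

print(response_headers)  # {b'X-ProxyMesh-IP': b'1.2.3.4'}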
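
On the consuming side, a spider opts in through request.meta. A hypothetical usage sketch, assuming this package's download handler is enabled for https requests (its module path is not part of this diff); the proxy URL and header names below are illustrative, not confirmed API:

import scrapy

class ExampleSpider(scrapy.Spider):
    name = "example"

    def start_requests(self):
        yield scrapy.Request(
            "https://example.com/",
            meta={
                # hypothetical proxy URL and header; substitute real values
                "proxy": "http://USER:PASS@us.proxymesh.com:31280",
                "proxy_headers": {"X-ProxyMesh-Country": "US"},
            },
        )

    def parse(self, response):
        # after this commit, headers from the proxy's CONNECT response
        # are merged into response.headers by _cb_bodydone above
        self.logger.info(response.headers.get(b"X-ProxyMesh-IP"))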
