From fc0e1ce550a7c874dccf290301452eae93586348 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?=
Date: Thu, 28 Nov 2019 15:34:59 +0100
Subject: [PATCH 1/2] Allow setting X-Crawlera-Session=_reuse

---
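Reviewer notes (not part of the commit message): the sketch below is one way a
spider could use the new middleware once both it and the existing
``CrawleraMiddleware`` downloader middleware are enabled. It is an
illustration only; the spider name and URLs are made-up placeholders, not part
of this patch:

    from scrapy import Request, Spider


    class SampleSpider(Spider):
        # Hypothetical spider, for illustration purposes only.
        name = 'sample'
        start_urls = ['https://example.com']

        custom_settings = {
            'SPIDER_MIDDLEWARES': {
                'scrapy_crawlera.CrawleraSessionReuseMiddleware': 1000,
            },
            # Assumes CrawleraMiddleware is configured separately
            # (DOWNLOADER_MIDDLEWARES, CRAWLERA_ENABLED, credentials).
        }

        def parse(self, response):
            # The middleware replaces '_reuse' with the X-Crawlera-Session
            # value of `response`; if the response has no usable session,
            # it drops the header (or applies the configured default).
            yield Request(
                'https://example.com/next',
                callback=self.parse,
                headers={'X-Crawlera-Session': '_reuse'},
                dont_filter=True,
            )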
 docs/index.rst                          |  41 +++++
 scrapy_crawlera/__init__.py             |   1 +
 scrapy_crawlera/spidermiddlewares.py    |  42 +++++
 tests/test_spidermiddlewares_session.py | 211 ++++++++++++++++++++++++
 4 files changed, 295 insertions(+)
 create mode 100644 scrapy_crawlera/spidermiddlewares.py
 create mode 100644 tests/test_spidermiddlewares_session.py

diff --git a/docs/index.rst b/docs/index.rst
index 3dc1d51..96065bc 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -89,6 +89,47 @@ requests with `DEFAULT_REQUEST_HEADERS
 `.
 
 
+
+Reusing sessions
+================
+
+To create a request in a callback and have that request reuse the same Crawlera
+session as the callback response, you have to write something like::
+
+    def callback(self, response):
+        session = response.headers.get('X-Crawlera-Session')
+        # …
+        headers = {}
+        if session:
+            headers = {'X-Crawlera-Session': session}
+        yield Request(url, callback=self.callback, headers=headers)
+
+scrapy-crawlera provides an optional spider middleware that, if enabled, allows
+using the `_reuse` value in the `X-Crawlera-Session` header to reuse the
+Crawlera session from the source response::
+
+    def callback(self, response):
+        headers = {'X-Crawlera-Session': '_reuse'}
+        yield Request(url, callback=self.callback, headers=headers)
+
+To enable the Crawlera session reuse spider middleware, add it to your
+``SPIDER_MIDDLEWARES`` setting::
+
+    SPIDER_MIDDLEWARES = {
+        'scrapy_crawlera.CrawleraSessionReuseMiddleware': 1000,
+    }
+
+By default, ``CrawleraSessionReuseMiddleware`` removes ``X-Crawlera-Session``
+from the request headers if the source response did not use a Crawlera session,
+or the source Crawlera session ID was bad. Use the
+``CRAWLERA_SESSION_REUSE_DEFAULT_SESSION`` setting to set a fallback Crawlera
+session value instead. For example, to create a new Crawlera session on
+requests that come from responses without a Crawlera session or with a bad
+Crawlera session ID::
+
+    CRAWLERA_SESSION_REUSE_DEFAULT_SESSION = 'create'
+
+
 All the rest
 ============
 
diff --git a/scrapy_crawlera/__init__.py b/scrapy_crawlera/__init__.py
index 06bf154..f208fb6 100644
--- a/scrapy_crawlera/__init__.py
+++ b/scrapy_crawlera/__init__.py
@@ -1,4 +1,5 @@
 from .middleware import CrawleraMiddleware
+from .spidermiddlewares import CrawleraSessionReuseMiddleware
 
 
 __version__ = '1.6.0'
diff --git a/scrapy_crawlera/spidermiddlewares.py b/scrapy_crawlera/spidermiddlewares.py
new file mode 100644
index 0000000..e22d069
--- /dev/null
+++ b/scrapy_crawlera/spidermiddlewares.py
@@ -0,0 +1,42 @@
+from scrapy import Request
+
+
+class CrawleraSessionReuseMiddleware(object):
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        return cls(crawler)
+
+    def __init__(self, crawler):
+        setting = 'CRAWLERA_SESSION_REUSE_DEFAULT_SESSION'
+        self._default_session = crawler.settings.get(setting)
+
+    def process_spider_output(self, response, result, spider):
+        def _set_session(request_or_item):
+            if not isinstance(request_or_item, Request):
+                return request_or_item
+
+            request = request_or_item
+            header = b'X-Crawlera-Session'
+
+            if request.headers.get(header) != b'_reuse':
+                return request
+
+            error = response.headers.get(b'X-Crawlera-Error')
+            if error == b'user_session_limit':
+                del request.headers[header]
+                return request
+
+            session = response.headers.get(header)
+            session_is_bad = error == b'bad_session_id'
+
+            if session is not None and not session_is_bad:
+                request.headers[header] = session
+            elif self._default_session:
+                request.headers[header] = self._default_session
+            else:
+                del request.headers[header]
+            return request
+
+        return (_set_session(request_or_item)
+                for request_or_item in result or ())
diff --git a/tests/test_spidermiddlewares_session.py b/tests/test_spidermiddlewares_session.py
new file mode 100644
index 0000000..ff3a416
--- /dev/null
+++ b/tests/test_spidermiddlewares_session.py
@@ -0,0 +1,211 @@
+import pytest
+from scrapy import Spider as _Spider
+from scrapy.http import Response, Request
+from scrapy.item import Item
+from scrapy.utils.reqser import request_to_dict
+from scrapy.utils.test import get_crawler
+
+from scrapy_crawlera.spidermiddlewares import CrawleraSessionReuseMiddleware
+
+
+SESSION = '1'
+
+
+def compare_requests(request1, request2):
+    assert request_to_dict(request1) == request_to_dict(request2)
+
+
+def process_output(response, result, settings=None):
+    crawler = get_crawler(Spider, settings)
+    mw = CrawleraSessionReuseMiddleware.from_crawler(crawler)
+    generator = mw.process_spider_output(response, [result], Spider())
+    return list(generator)[0]
+
+
+def get_request(reuse=False, session=None):
+    headers = {}
+    if reuse is True:
+        assert session is None
+        headers['X-Crawlera-Session'] = '_reuse'
+    elif session is not None:
+        headers['X-Crawlera-Session'] = session
+    return Request('https://example.com', headers=headers)
+
+
+def get_response(session=None, error=None):
+    headers = {}
+    if session is not None:
+        headers['X-Crawlera-Session'] = session
+    if error is not None:
+        headers['X-Crawlera-Error'] = error
+    return Response('https://example.com', headers=headers)
+
+
+class Spider(_Spider):
+    name = 'spider'
+
+
+@pytest.mark.parametrize(
+    'item',
+    [
+        (
+            {},
+        ),
+        (
+            Item(),
+        ),
+    ]
+)
+def test_item(item):
+    response = get_response(session=SESSION)
+    assert process_output(response, item) == item
+
+
+def test_no_session():
+    response = get_response()
+    input_request = get_request()
+    processed_request = process_output(response, input_request)
+    expected_request = get_request()
+    compare_requests(processed_request, expected_request)
+
+
+def test_bad_session_id():
+    response = get_response(session=SESSION, error='bad_session_id')
+    input_request = get_request(reuse=True)
+    processed_request = process_output(response, input_request)
+    expected_request = get_request()
+    compare_requests(processed_request, expected_request)
+
+
+def test_bad_session_id_default_session():
+    response = get_response(session=SESSION, error='bad_session_id')
+    input_request = get_request(reuse=True)
+    settings = {'CRAWLERA_SESSION_REUSE_DEFAULT_SESSION': 'create'}
+    processed_request = process_output(response, input_request, settings)
+    expected_request = get_request(session='create')
+    compare_requests(processed_request, expected_request)
+
+
+def test_user_session_limit():
+    # This session error is only expected to come from a response that has no
+    # ``X-Crawlera-Session`` value, caused by a request with ``create`` as
+    # ``X-Crawlera-Session`` value.
+    response = get_response(error='user_session_limit')
+    input_request = get_request(reuse=True)
+    processed_request = process_output(response, input_request)
+    expected_request = get_request()
+    compare_requests(processed_request, expected_request)
+
+
+@pytest.mark.parametrize(
+    'error',
+    [
+        # https://doc.scrapinghub.com/crawlera.html#errors
+        (
+            'bad_proxy_auth',
+        ),
+        (
+            'too_many_conns',
+        ),
+        (
+            'header_auth',
+        ),
+        (
+            '',
+        ),
+        (
+            'nxdomain',
+        ),
+        (
+            'ehostunreach',
+        ),
+        (
+            'econnrefused',
+        ),
+        (
+            'econnreset',
+        ),
+        (
+            'socket_closed_remotely',
+        ),
+        (
+            'client_conn_closed',
+        ),
+        (
+            'noslaves',
+        ),
+        (
+            'banned',
+        ),
+        (
+            'serverbusy',
+        ),
+        (
+            'timeout',
+        ),
+        (
+            'msgtimeout',
+        ),
+        (
+            'domain_forbidden',
+        ),
+        (
+            'bad_header',
+        ),
+        (
+            'data_error',
+        ),
+    ]
+)
+def test_non_session_error(error):
+    session = SESSION
+    response = get_response(session=session, error=error)
+    input_request = get_request(reuse=True)
+    processed_request = process_output(response, input_request)
+    expected_request = get_request(session=SESSION)
+    compare_requests(processed_request, expected_request)
+
+
+def test_session():
+    session = SESSION
+    response = get_response(session=session)
+    input_request = get_request(reuse=True)
+    processed_request = process_output(response, input_request)
+    expected_request = get_request(session=SESSION)
+    compare_requests(processed_request, expected_request)
+
+
+def test_create_on_sessionless_reuse():
+    response = get_response()
+    input_request = get_request(reuse=True)
+    settings = {'CRAWLERA_SESSION_REUSE_DEFAULT_SESSION': 'create'}
+    processed_request = process_output(response, input_request, settings)
+    expected_request = get_request(session='create')
+    compare_requests(processed_request, expected_request)
+
+
+def test_dont_create_on_sessionless_reuse():
+    response = get_response()
+    input_request = get_request(reuse=True)
+    processed_request = process_output(response, input_request)
+    expected_request = get_request()
+    compare_requests(processed_request, expected_request)
+
+
+@pytest.mark.parametrize(
+    'session',
+    [
+        (
+            SESSION,
+        ),
+        (
+            'create',
+        ),
+    ]
+)
+def test_header_without_reuse(session):
+    response = get_response()
+    input_request = get_request(session=session)
+    processed_request = process_output(response, input_request)
+    expected_request = get_request(session=session)
+    compare_requests(processed_request, expected_request)

From 39083f1bf1734de82b5616571427b4aa8cd8cb69 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?=
Date: Tue, 3 Dec 2019 16:50:55 +0100
Subject: [PATCH 2/2] Switch Crawlera session reuse from a header value to a
 meta boolean flag

---
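Reviewer notes (not part of the commit message): requests now opt in through
``Request.meta`` instead of a magic header value, so the middleware no longer
needs the header-deletion branches. Updated usage sketch, under the same
assumptions as in the previous patch (middleware enabled, Crawlera configured;
spider name and URLs are placeholders):

    from scrapy import Request, Spider


    class SampleSpider(Spider):
        # Hypothetical spider, for illustration purposes only.
        name = 'sample'
        start_urls = ['https://example.com']

        def parse(self, response):
            # If `response` has a valid X-Crawlera-Session header, the
            # middleware copies it into this request; otherwise it falls
            # back to CRAWLERA_SESSION_REUSE_DEFAULT_SESSION, if set, and
            # leaves the request untouched when it is not.
            yield Request(
                'https://example.com/next',
                callback=self.parse,
                meta={'crawlera_session_reuse': True},
                dont_filter=True,
            )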
 docs/index.rst                          |  6 +++---
 scrapy_crawlera/spidermiddlewares.py    | 11 +++--------
 tests/test_spidermiddlewares_session.py | 24 ++++++++++++------------
 3 files changed, 18 insertions(+), 23 deletions(-)

diff --git a/docs/index.rst b/docs/index.rst
index 96065bc..50593c6 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -105,12 +105,12 @@ session as the callback response, you have to write something like::
         yield Request(url, callback=self.callback, headers=headers)
 
 scrapy-crawlera provides an optional spider middleware that, if enabled, allows
-using the `_reuse` value in the `X-Crawlera-Session` header to reuse the
+setting ``crawlera_session_reuse`` to ``True`` in your request to reuse the
 Crawlera session from the source response::
 
     def callback(self, response):
-        headers = {'X-Crawlera-Session': '_reuse'}
-        yield Request(url, callback=self.callback, headers=headers)
+        meta = {'crawlera_session_reuse': True}
+        yield Request(url, callback=self.callback, meta=meta)
 
 To enable the Crawlera session reuse spider middleware, add it to your
 ``SPIDER_MIDDLEWARES`` setting::
diff --git a/scrapy_crawlera/spidermiddlewares.py b/scrapy_crawlera/spidermiddlewares.py
index e22d069..c7e833c 100644
--- a/scrapy_crawlera/spidermiddlewares.py
+++ b/scrapy_crawlera/spidermiddlewares.py
@@ -18,24 +18,19 @@ def _set_session(request_or_item):
 
             request = request_or_item
             header = b'X-Crawlera-Session'
+            meta_key = 'crawlera_session_reuse'
 
-            if request.headers.get(header) != b'_reuse':
-                return request
-
-            error = response.headers.get(b'X-Crawlera-Error')
-            if error == b'user_session_limit':
-                del request.headers[header]
+            if request.meta.get(meta_key) is not True:
                 return request
 
             session = response.headers.get(header)
+            error = response.headers.get(b'X-Crawlera-Error')
             session_is_bad = error == b'bad_session_id'
 
             if session is not None and not session_is_bad:
                 request.headers[header] = session
             elif self._default_session:
                 request.headers[header] = self._default_session
-            else:
-                del request.headers[header]
             return request
 
         return (_set_session(request_or_item)
diff --git a/tests/test_spidermiddlewares_session.py b/tests/test_spidermiddlewares_session.py
index ff3a416..27fcd5e 100644
--- a/tests/test_spidermiddlewares_session.py
+++ b/tests/test_spidermiddlewares_session.py
@@ -24,12 +24,12 @@ def process_output(response, result, settings=None):
 
 def get_request(reuse=False, session=None):
     headers = {}
-    if reuse is True:
-        assert session is None
-        headers['X-Crawlera-Session'] = '_reuse'
-    elif session is not None:
+    if session is not None:
         headers['X-Crawlera-Session'] = session
-    return Request('https://example.com', headers=headers)
+    meta = {}
+    if reuse is True:
+        meta['crawlera_session_reuse'] = True
+    return Request('https://example.com', headers=headers, meta=meta)
 
 
 def get_response(session=None, error=None):
@@ -73,7 +73,7 @@ def test_bad_session_id():
     response = get_response(session=SESSION, error='bad_session_id')
     input_request = get_request(reuse=True)
     processed_request = process_output(response, input_request)
-    expected_request = get_request()
+    expected_request = get_request(reuse=True)
     compare_requests(processed_request, expected_request)
 
 
@@ -82,7 +82,7 @@ def test_bad_session_id_default_session():
     input_request = get_request(reuse=True)
     settings = {'CRAWLERA_SESSION_REUSE_DEFAULT_SESSION': 'create'}
     processed_request = process_output(response, input_request, settings)
-    expected_request = get_request(session='create')
+    expected_request = get_request(reuse=True, session='create')
     compare_requests(processed_request, expected_request)
 
 
@@ -93,7 +93,7 @@ def test_user_session_limit():
     response = get_response(error='user_session_limit')
     input_request = get_request(reuse=True)
     processed_request = process_output(response, input_request)
-    expected_request = get_request()
+    expected_request = get_request(reuse=True)
     compare_requests(processed_request, expected_request)
 
 
@@ -162,7 +162,7 @@ def test_non_session_error(error):
     response = get_response(session=session, error=error)
     input_request = get_request(reuse=True)
     processed_request = process_output(response, input_request)
-    expected_request = get_request(session=SESSION)
+    expected_request = get_request(reuse=True, session=SESSION)
     compare_requests(processed_request, expected_request)
 
 
@@ -171,7 +171,7 @@ def test_session():
     response = get_response(session=session)
     input_request = get_request(reuse=True)
     processed_request = process_output(response, input_request)
-    expected_request = get_request(session=SESSION)
+    expected_request = get_request(reuse=True, session=SESSION)
     compare_requests(processed_request, expected_request)
 
 
@@ -180,7 +180,7 @@ def test_create_on_sessionless_reuse():
     input_request = get_request(reuse=True)
     settings = {'CRAWLERA_SESSION_REUSE_DEFAULT_SESSION': 'create'}
     processed_request = process_output(response, input_request, settings)
-    expected_request = get_request(session='create')
+    expected_request = get_request(reuse=True, session='create')
     compare_requests(processed_request, expected_request)
 
 
@@ -188,7 +188,7 @@ def test_dont_create_on_sessionless_reuse():
     response = get_response()
     input_request = get_request(reuse=True)
     processed_request = process_output(response, input_request)
-    expected_request = get_request()
+    expected_request = get_request(reuse=True)
     compare_requests(processed_request, expected_request)
 
 