Skip to content

Commit 42fa592

Browse files
pawelmhm and Gallaecio
authored and committed
[tests] refactors tests, adds tests for spider attr
* refactors tests from functions to objects inheriting from unittest.TestCase
* adds tests for enabling middleware with spider attribute
1 parent a736785 commit 42fa592

File tree

1 file changed

+142
-35
lines changed

1 file changed

+142
-35
lines changed

tests/test_middleware.py

Lines changed: 142 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,6 @@
55
import base64
66

77
import scrapy
8-
from scrapy.core.engine import ExecutionEngine
98
from scrapy.utils.test import get_crawler
109
from scrapy.http import Response, TextResponse
1110
from scrapy.downloadermiddlewares.httpcache import HttpCacheMiddleware
@@ -699,44 +698,56 @@ def test_float_wait_arg():
699698
req = mw.process_request(req1, None)
700699
assert json.loads(to_native_str(req.body)) == {'url': req1.url, 'wait': 0.5}
701700

701+
def __init__(self, delay=0.0):
702+
self.delay = delay
702703

703704
def test_slot_policy_single_slot():
704705
mw = _get_mw()
705706
meta = {'splash': {
706707
'slot_policy': scrapy_splash.SlotPolicy.SINGLE_SLOT
707708
}}
708709

709-
req1 = scrapy.Request("http://example.com/path?key=value", meta=meta)
710-
req1 = mw.process_request(req1, None)
710+
class MockedDownloader(object):
711711

712-
req2 = scrapy.Request("http://fooexample.com/path?key=value", meta=meta)
713-
req2 = mw.process_request(req2, None)
712+
def __init__(self):
713+
self.slots = {}
714714

715-
assert req1.meta.get('download_slot')
716-
assert req1.meta['download_slot'] == req2.meta['download_slot']
715+
def _get_slot_key(self, request, spider):
716+
if 'download_slot' in request.meta:
717+
return request.meta['download_slot']
717718

719+
key = urlparse_cached(request).hostname or ''
720+
return key
718721

719722
def test_slot_policy_per_domain():
720723
mw = _get_mw()
721724
meta = {'splash': {
722725
'slot_policy': scrapy_splash.SlotPolicy.PER_DOMAIN
723726
}}
724727

725-
req1 = scrapy.Request("http://example.com/path?key=value", meta=meta)
726-
req1 = mw.process_request(req1, None)
728+
class MockedEngine(object):
729+
downloader = MockedDownloader()
727730

728-
req2 = scrapy.Request("http://example.com/path2", meta=meta)
729-
req2 = mw.process_request(req2, None)
730731

731-
req3 = scrapy.Request("http://fooexample.com/path?key=value", meta=meta)
732-
req3 = mw.process_request(req3, None)
732+
class MiddlewareTest(unittest.TestCase):
733733

734-
assert req1.meta.get('download_slot')
735-
assert req3.meta.get('download_slot')
734+
def setUp(self):
735+
self.crawler = get_crawler(settings_dict={
736+
'DOWNLOAD_HANDLERS': {'s3': None}, # for faster test running
737+
})
738+
if not hasattr(self.crawler, 'logformatter'):
739+
self.crawler.logformatter = None
740+
self.crawler.engine = MockedEngine()
741+
self.mw = SplashMiddleware.from_crawler(self.crawler)
736742

737-
assert req1.meta['download_slot'] == req2.meta['download_slot']
738-
assert req1.meta['download_slot'] != req3.meta['download_slot']
743+
def test_nosplash(self):
744+
req = scrapy.Request("http://example.com")
745+
old_meta = copy.deepcopy(req.meta)
746+
assert self.mw.process_request(req, None) is None
747+
assert old_meta == req.meta
739748

749+
def test_splash_request(self):
750+
req = SplashRequest("http://example.com?foo=bar&url=1&wait=100")
740751

741752
def test_slot_policy_scrapy_default():
742753
mw = _get_mw()
@@ -746,22 +757,118 @@ def test_slot_policy_scrapy_default():
746757
req = mw.process_request(req, None)
747758
assert 'download_slot' not in req.meta
748759

760+
expected_body = {'url': req.url}
761+
expected_body.update(SplashRequest.default_splash_meta['args'])
762+
assert json.loads(req2.body) == expected_body
763+
764+
def test_splash_request_no_url(self):
765+
lua_source = "function main(splash) return {result='ok'} end"
766+
req1 = SplashRequest(meta={'splash': {
767+
'args': {'lua_source': lua_source},
768+
'endpoint': 'execute',
769+
}})
770+
req = self.mw.process_request(req1, None)
771+
assert req.url == 'http://127.0.0.1:8050/execute'
772+
assert json.loads(req.body) == {
773+
'url': 'about:blank',
774+
'lua_source': lua_source
775+
}
749776

750-
def test_adjust_timeout():
751-
mw = _get_mw()
752-
req1 = scrapy.Request("http://example.com", meta = {
753-
'splash': {'args': {'timeout': 60, 'html': 1}},
754-
755-
# download_timeout is always present,
756-
# it is set by DownloadTimeoutMiddleware
757-
'download_timeout': 30,
758-
})
759-
req1 = mw.process_request(req1, None)
760-
assert req1.meta['download_timeout'] > 60
761-
762-
req2 = scrapy.Request("http://example.com", meta = {
763-
'splash': {'args': {'html': 1}},
764-
'download_timeout': 30,
765-
})
766-
req2 = mw.process_request(req2, None)
767-
assert req2.meta['download_timeout'] == 30
777+
def test_override_splash_url(self):
778+
req1 = scrapy.Request("http://example.com", meta={
779+
'splash': {
780+
'endpoint': 'render.png',
781+
'splash_url': 'http://splash.example.com'
782+
}
783+
})
784+
req = self.mw.process_request(req1, None)
785+
assert req.url == 'http://splash.example.com/render.png'
786+
assert json.loads(req.body) == {'url': req1.url}
787+
788+
def test_float_wait_arg(self):
789+
req1 = scrapy.Request("http://example.com", meta={
790+
'splash': {
791+
'endpoint': 'render.html',
792+
'args': {'wait': 0.5}
793+
}
794+
})
795+
req = self.mw.process_request(req1, None)
796+
assert json.loads(req.body) == {'url': req1.url, 'wait': 0.5}
797+
798+
def test_slot_policy_single_slot(self):
799+
meta = {'splash': {
800+
'slot_policy': scrapyjs.SlotPolicy.SINGLE_SLOT
801+
}}
802+
803+
req1 = scrapy.Request("http://example.com/path?key=value", meta=meta)
804+
req1 = self.mw.process_request(req1, None)
805+
806+
req2 = scrapy.Request("http://fooexample.com/path?key=value", meta=meta)
807+
req2 = self.mw.process_request(req2, None)
808+
809+
assert req1.meta.get('download_slot')
810+
assert req1.meta['download_slot'] == req2.meta['download_slot']
811+
812+
def test_slot_policy_per_domain(self):
813+
meta = {'splash': {
814+
'slot_policy': scrapyjs.SlotPolicy.PER_DOMAIN
815+
}}
816+
817+
req1 = scrapy.Request("http://example.com/path?key=value", meta=meta)
818+
req1 = self.mw.process_request(req1, None)
819+
820+
req2 = scrapy.Request("http://example.com/path2", meta=meta)
821+
req2 = self.mw.process_request(req2, None)
822+
823+
req3 = scrapy.Request("http://fooexample.com/path?key=value", meta=meta)
824+
req3 = self.mw.process_request(req3, None)
825+
826+
assert req1.meta.get('download_slot')
827+
assert req3.meta.get('download_slot')
828+
829+
assert req1.meta['download_slot'] == req2.meta['download_slot']
830+
assert req1.meta['download_slot'] != req3.meta['download_slot']
831+
832+
def test_slot_policy_scrapy_default(self):
833+
req = scrapy.Request("http://example.com", meta = {'splash': {
834+
'slot_policy': scrapyjs.SlotPolicy.SCRAPY_DEFAULT
835+
}})
836+
req = self.mw.process_request(req, None)
837+
assert 'download_slot' not in req.meta
838+
839+
def test_adjust_timeout(self):
840+
req1 = scrapy.Request("http://example.com", meta = {
841+
'splash': {'args': {'timeout': 60, 'html': 1}},
842+
843+
# download_timeout is always present,
844+
# it is set by DownloadTimeoutMiddleware
845+
'download_timeout': 30,
846+
})
847+
req1 = self.mw.process_request(req1, None)
848+
assert req1.meta['download_timeout'] > 60
849+
850+
req2 = scrapy.Request("http://example.com", meta = {
851+
'splash': {'args': {'html': 1}},
852+
'download_timeout': 30,
853+
})
854+
req2 = self.mw.process_request(req2, None)
855+
assert req2.meta['download_timeout'] == 30
856+
857+
def test_spider_attribute(self):
858+
req_url = "http://scrapy.org"
859+
req1 = scrapy.Request(req_url)
860+
861+
spider = self.crawler._create_spider("foo")
862+
spider.splash = {"args": {"images": 0}}
863+
864+
req1 = self.mw.process_request(req1, spider)
865+
self.assertIn("_splash_processed", req1.meta)
866+
self.assertIn("render.json", req1.url)
867+
self.assertIn("url", json.loads(req1.body))
868+
self.assertEqual(json.loads(req1.body).get("url"), req_url)
869+
self.assertIn("images", json.loads(req1.body))
870+
871+
# spider attribute blank middleware disabled
872+
spider.splash = {}
873+
req2 = self.mw.process_request(req1, spider)
874+
self.assertIsNone(req2)

0 commit comments

Comments
 (0)