@@ -5,7 +5,6 @@
 import base64

 import scrapy
-from scrapy.core.engine import ExecutionEngine
 from scrapy.utils.test import get_crawler
 from scrapy.http import Response, TextResponse
 from scrapy.downloadermiddlewares.httpcache import HttpCacheMiddleware
@@ -699,44 +698,56 @@ def test_float_wait_arg():
     req = mw.process_request(req1, None)
     assert json.loads(to_native_str(req.body)) == {'url': req1.url, 'wait': 0.5}

+    def __init__(self, delay=0.0):
+        self.delay = delay

 def test_slot_policy_single_slot():
     mw = _get_mw()
     meta = {'splash': {
         'slot_policy': scrapy_splash.SlotPolicy.SINGLE_SLOT
     }}

-    req1 = scrapy.Request("http://example.com/path?key=value", meta=meta)
-    req1 = mw.process_request(req1, None)
+class MockedDownloader(object):

-    req2 = scrapy.Request("http://fooexample.com/path?key=value", meta=meta)
-    req2 = mw.process_request(req2, None)
+    def __init__(self):
+        self.slots = {}

-    assert req1.meta.get('download_slot')
-    assert req1.meta['download_slot'] == req2.meta['download_slot']
+    def _get_slot_key(self, request, spider):
+        if 'download_slot' in request.meta:
+            return request.meta['download_slot']

+        key = urlparse_cached(request).hostname or ''
+        return key

 def test_slot_policy_per_domain():
     mw = _get_mw()
     meta = {'splash': {
         'slot_policy': scrapy_splash.SlotPolicy.PER_DOMAIN
     }}

-    req1 = scrapy.Request("http://example.com/path?key=value", meta=meta)
-    req1 = mw.process_request(req1, None)
+class MockedEngine(object):
+    downloader = MockedDownloader()

-    req2 = scrapy.Request("http://example.com/path2", meta=meta)
-    req2 = mw.process_request(req2, None)

-    req3 = scrapy.Request("http://fooexample.com/path?key=value", meta=meta)
-    req3 = mw.process_request(req3, None)
+class MiddlewareTest(unittest.TestCase):

-    assert req1.meta.get('download_slot')
-    assert req3.meta.get('download_slot')
+    def setUp(self):
+        self.crawler = get_crawler(settings_dict={
+            'DOWNLOAD_HANDLERS': {'s3': None},  # for faster test running
+        })
+        if not hasattr(self.crawler, 'logformatter'):
+            self.crawler.logformatter = None
+        self.crawler.engine = MockedEngine()
+        self.mw = SplashMiddleware.from_crawler(self.crawler)

-    assert req1.meta['download_slot'] == req2.meta['download_slot']
-    assert req1.meta['download_slot'] != req3.meta['download_slot']
+    def test_nosplash(self):
+        req = scrapy.Request("http://example.com")
+        old_meta = copy.deepcopy(req.meta)
+        assert self.mw.process_request(req, None) is None
+        assert old_meta == req.meta

+    def test_splash_request(self):
+        req = SplashRequest("http://example.com?foo=bar&url=1&wait=100")

 def test_slot_policy_scrapy_default():
     mw = _get_mw()
@@ -746,22 +757,118 @@ def test_slot_policy_scrapy_default():
     req = mw.process_request(req, None)
     assert 'download_slot' not in req.meta

+        expected_body = {'url': req.url}
+        expected_body.update(SplashRequest.default_splash_meta['args'])
+        assert json.loads(req2.body) == expected_body
+
+    def test_splash_request_no_url(self):
+        lua_source = "function main(splash) return {result='ok'} end"
+        req1 = SplashRequest(meta={'splash': {
+            'args': {'lua_source': lua_source},
+            'endpoint': 'execute',
+        }})
+        req = self.mw.process_request(req1, None)
+        assert req.url == 'http://127.0.0.1:8050/execute'
+        assert json.loads(req.body) == {
+            'url': 'about:blank',
+            'lua_source': lua_source
+        }

-def test_adjust_timeout():
-    mw = _get_mw()
-    req1 = scrapy.Request("http://example.com", meta={
-        'splash': {'args': {'timeout': 60, 'html': 1}},
-
-        # download_timeout is always present,
-        # it is set by DownloadTimeoutMiddleware
-        'download_timeout': 30,
-    })
-    req1 = mw.process_request(req1, None)
-    assert req1.meta['download_timeout'] > 60
-
-    req2 = scrapy.Request("http://example.com", meta={
-        'splash': {'args': {'html': 1}},
-        'download_timeout': 30,
-    })
-    req2 = mw.process_request(req2, None)
-    assert req2.meta['download_timeout'] == 30
+    def test_override_splash_url(self):
+        req1 = scrapy.Request("http://example.com", meta={
+            'splash': {
+                'endpoint': 'render.png',
+                'splash_url': 'http://splash.example.com'
+            }
+        })
+        req = self.mw.process_request(req1, None)
+        assert req.url == 'http://splash.example.com/render.png'
+        assert json.loads(req.body) == {'url': req1.url}
+
+    def test_float_wait_arg(self):
+        req1 = scrapy.Request("http://example.com", meta={
+            'splash': {
+                'endpoint': 'render.html',
+                'args': {'wait': 0.5}
+            }
+        })
+        req = self.mw.process_request(req1, None)
+        assert json.loads(req.body) == {'url': req1.url, 'wait': 0.5}
+
+    def test_slot_policy_single_slot(self):
+        meta = {'splash': {
+            'slot_policy': scrapyjs.SlotPolicy.SINGLE_SLOT
+        }}
+
+        req1 = scrapy.Request("http://example.com/path?key=value", meta=meta)
+        req1 = self.mw.process_request(req1, None)
+
+        req2 = scrapy.Request("http://fooexample.com/path?key=value", meta=meta)
+        req2 = self.mw.process_request(req2, None)
+
+        assert req1.meta.get('download_slot')
+        assert req1.meta['download_slot'] == req2.meta['download_slot']
+
+    def test_slot_policy_per_domain(self):
+        meta = {'splash': {
+            'slot_policy': scrapyjs.SlotPolicy.PER_DOMAIN
+        }}
+
+        req1 = scrapy.Request("http://example.com/path?key=value", meta=meta)
+        req1 = self.mw.process_request(req1, None)
+
+        req2 = scrapy.Request("http://example.com/path2", meta=meta)
+        req2 = self.mw.process_request(req2, None)
+
+        req3 = scrapy.Request("http://fooexample.com/path?key=value", meta=meta)
+        req3 = self.mw.process_request(req3, None)
+
+        assert req1.meta.get('download_slot')
+        assert req3.meta.get('download_slot')
+
+        assert req1.meta['download_slot'] == req2.meta['download_slot']
+        assert req1.meta['download_slot'] != req3.meta['download_slot']
+
+    def test_slot_policy_scrapy_default(self):
+        req = scrapy.Request("http://example.com", meta={'splash': {
+            'slot_policy': scrapyjs.SlotPolicy.SCRAPY_DEFAULT
+        }})
+        req = self.mw.process_request(req, None)
+        assert 'download_slot' not in req.meta
+
+    def test_adjust_timeout(self):
+        req1 = scrapy.Request("http://example.com", meta={
+            'splash': {'args': {'timeout': 60, 'html': 1}},
+
+            # download_timeout is always present,
+            # it is set by DownloadTimeoutMiddleware
+            'download_timeout': 30,
+        })
+        req1 = self.mw.process_request(req1, None)
+        assert req1.meta['download_timeout'] > 60
+
+        req2 = scrapy.Request("http://example.com", meta={
+            'splash': {'args': {'html': 1}},
+            'download_timeout': 30,
+        })
+        req2 = self.mw.process_request(req2, None)
+        assert req2.meta['download_timeout'] == 30
+
+    def test_spider_attribute(self):
+        req_url = "http://scrapy.org"
+        req1 = scrapy.Request(req_url)
+
+        spider = self.crawler._create_spider("foo")
+        spider.splash = {"args": {"images": 0}}
+
+        req1 = self.mw.process_request(req1, spider)
+        self.assertIn("_splash_processed", req1.meta)
+        self.assertIn("render.json", req1.url)
+        self.assertIn("url", json.loads(req1.body))
+        self.assertEqual(json.loads(req1.body).get("url"), req_url)
+        self.assertIn("images", json.loads(req1.body))
+
+        # spider attribute blank middleware disabled
+        spider.splash = {}
+        req2 = self.mw.process_request(req1, spider)
+        self.assertIsNone(req2)
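
Note: the MockedDownloader added above resolves a download slot the same way the slot-policy tests expect: an explicit 'download_slot' in request.meta wins, otherwise the request hostname is used. A minimal standalone sketch of that rule, not part of the commit (only scrapy is required; get_slot_key is an illustrative helper mirroring MockedDownloader._get_slot_key, without the unused spider argument):

import scrapy
from scrapy.utils.httpobj import urlparse_cached


def get_slot_key(request):
    # Same rule as MockedDownloader._get_slot_key in the test above:
    # prefer an explicit slot set in meta (e.g. by SINGLE_SLOT / PER_DOMAIN
    # policies), otherwise fall back to the request's hostname.
    if 'download_slot' in request.meta:
        return request.meta['download_slot']
    return urlparse_cached(request).hostname or ''


if __name__ == '__main__':
    plain = scrapy.Request("http://example.com/path?key=value")
    pinned = scrapy.Request("http://example.com/",
                            meta={'download_slot': 'splash-singleton'})
    print(get_slot_key(plain))   # -> example.com
    print(get_slot_key(pinned))  # -> splash-singleton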