1010.. moduleauthor:: Olaf Conradi <olaf@conradi.org>
1111"""
1212import string
13+ import re
1314import urllib .parse as urllib_parse
1415
1516from pyld .jsonld import (JsonLdError , parse_link_header , LINK_HEADER_REL )
1617
1718
18- def requests_document_loader (secure = False , ** kwargs ):
19+ def requests_document_loader (secure = False , max_link_follows = 2 , ** kwargs ):
1920 """
2021 Create a Requests document loader.
21-
2222 Can be used to setup extra Requests args such as verify, cert, timeout,
2323 or others.
24-
2524 :param secure: require all requests to use HTTPS (default: False).
25+ :param max_link_follows: maximum number of alternate link header follows allowed (default: 2).
2626 :param **kwargs: extra keyword args for Requests get() call.
27-
2827 :return: the RemoteDocument loader function.
2928 """
3029 import requests
3130
32- def loader (url , options = {}):
31+ def loader (url , options = {}, link_follow_count = 0 ):
3332 """
3433 Retrieves JSON-LD at the given URL.
35-
3634 :param url: the URL to retrieve.
37-
3835 :return: the RemoteDocument.
3936 """
4037 try :
4138 # validate URL
4239 pieces = urllib_parse .urlparse (url )
4340 if (not all ([pieces .scheme , pieces .netloc ]) or
44- pieces .scheme not in ['http' , 'https' ] or
45- set (pieces .netloc ) > set (
46- string .ascii_letters + string .digits + '-.:' )):
41+ pieces .scheme not in ['http' , 'https' ] or
42+ set (pieces .netloc ) > set (
43+ string .ascii_letters + string .digits + '-.:' )):
4744 raise JsonLdError (
4845 'URL could not be dereferenced; only "http" and "https" '
4946 'URLs are supported.' ,
@@ -69,30 +66,40 @@ def loader(url, options={}):
6966 'contentType' : content_type ,
7067 'contextUrl' : None ,
7168 'documentUrl' : response .url ,
72- 'document' : response . json ()
69+ 'document' : None
7370 }
71+ try :
72+ doc ['document' ] = response .json ()
73+ except json .JSONDecodeError as e :
74+ # document body is not parseable, continue to check link headers
75+ pass
76+ # if content_type in headers['Accept']:
77+ # doc['document'] = response.json()
7478 link_header = response .headers .get ('link' )
7579 if link_header :
7680 linked_context = parse_link_header (link_header ).get (
7781 LINK_HEADER_REL )
7882 # only 1 related link header permitted
7983 if linked_context and content_type != 'application/ld+json' :
80- if isinstance (linked_context , list ):
81- raise JsonLdError (
82- 'URL could not be dereferenced, '
83- 'it has more than one '
84- 'associated HTTP Link Header.' ,
85- 'jsonld.LoadDocumentError' ,
86- {'url' : url },
87- code = 'multiple context link headers' )
88- doc ['contextUrl' ] = linked_context ['target' ]
84+ if isinstance (linked_context , list ):
85+ raise JsonLdError (
86+ 'URL could not be dereferenced, '
87+ 'it has more than one '
88+ 'associated HTTP Link Header.' ,
89+ 'jsonld.LoadDocumentError' ,
90+ {'url' : url },
91+ code = 'multiple context link headers' )
92+ doc ['contextUrl' ] = linked_context ['target' ]
8993 linked_alternate = parse_link_header (link_header ).get ('alternate' )
9094 # if not JSON-LD, alternate may point there
9195 if (linked_alternate and
9296 linked_alternate .get ('type' ) == 'application/ld+json' and
9397 not re .match (r'^application\/(\w*\+)?json$' , content_type )):
9498 doc ['contentType' ] = 'application/ld+json'
9599 doc ['documentUrl' ] = jsonld .prepend_base (url , linked_alternate ['target' ])
100+ if link_follow_count >= max_link_follows :
101+ raise requests .TooManyRedirects (f"Exceeded maximum link header redirects ({ max_link_follows } )" )
102+ return loader (doc ['documentUrl' ], options = options , link_follow_count = link_follow_count + 1 )
96103 return doc
97104 except JsonLdError as e :
98105 raise e
0 commit comments