Skip to content

Commit 15580a7

Browse files
committed
Addresses issue digitalbazaar#128 for requests document loader
1 parent f5d2814 commit 15580a7

File tree

1 file changed

+27
-20
lines changed

1 file changed

+27
-20
lines changed

lib/pyld/documentloader/requests.py

Lines changed: 27 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -10,40 +10,37 @@
1010
.. moduleauthor:: Olaf Conradi <olaf@conradi.org>
1111
"""
1212
import string
13+
import re
1314
import urllib.parse as urllib_parse
1415

1516
from pyld.jsonld import (JsonLdError, parse_link_header, LINK_HEADER_REL)
1617

1718

18-
def requests_document_loader(secure=False, **kwargs):
19+
def requests_document_loader(secure=False, max_link_follows=2, **kwargs):
1920
"""
2021
Create a Requests document loader.
21-
2222
Can be used to setup extra Requests args such as verify, cert, timeout,
2323
or others.
24-
2524
:param secure: require all requests to use HTTPS (default: False).
25+
:param max_link_follows: maximum number of 'alternate' Link header hops to follow (default: 2).
2626
:param **kwargs: extra keyword args for Requests get() call.
27-
2827
:return: the RemoteDocument loader function.
2928
"""
3029
import requests
3130

32-
def loader(url, options={}):
31+
def loader(url, options={}, link_follow_count=0):
3332
"""
3433
Retrieves JSON-LD at the given URL.
35-
3634
:param url: the URL to retrieve.
37-
3835
:return: the RemoteDocument.
3936
"""
4037
try:
4138
# validate URL
4239
pieces = urllib_parse.urlparse(url)
4340
if (not all([pieces.scheme, pieces.netloc]) or
44-
pieces.scheme not in ['http', 'https'] or
45-
set(pieces.netloc) > set(
46-
string.ascii_letters + string.digits + '-.:')):
41+
pieces.scheme not in ['http', 'https'] or
42+
set(pieces.netloc) > set(
43+
string.ascii_letters + string.digits + '-.:')):
4744
raise JsonLdError(
4845
'URL could not be dereferenced; only "http" and "https" '
4946
'URLs are supported.',
@@ -69,30 +66,40 @@ def loader(url, options={}):
6966
'contentType': content_type,
7067
'contextUrl': None,
7168
'documentUrl': response.url,
72-
'document': response.json()
69+
'document': None
7370
}
71+
try:
72+
doc['document'] = response.json()
73+
except json.JSONDecodeError as e:
74+
# document body is not parseable as JSON; leave 'document' as None and continue to check link headers
75+
pass
76+
# if content_type in headers['Accept']:
77+
# doc['document'] = response.json()
7478
link_header = response.headers.get('link')
7579
if link_header:
7680
linked_context = parse_link_header(link_header).get(
7781
LINK_HEADER_REL)
7882
# only 1 related link header permitted
7983
if linked_context and content_type != 'application/ld+json':
80-
if isinstance(linked_context, list):
81-
raise JsonLdError(
82-
'URL could not be dereferenced, '
83-
'it has more than one '
84-
'associated HTTP Link Header.',
85-
'jsonld.LoadDocumentError',
86-
{'url': url},
87-
code='multiple context link headers')
88-
doc['contextUrl'] = linked_context['target']
84+
if isinstance(linked_context, list):
85+
raise JsonLdError(
86+
'URL could not be dereferenced, '
87+
'it has more than one '
88+
'associated HTTP Link Header.',
89+
'jsonld.LoadDocumentError',
90+
{'url': url},
91+
code='multiple context link headers')
92+
doc['contextUrl'] = linked_context['target']
8993
linked_alternate = parse_link_header(link_header).get('alternate')
9094
# if not JSON-LD, alternate may point there
9195
if (linked_alternate and
9296
linked_alternate.get('type') == 'application/ld+json' and
9397
not re.match(r'^application\/(\w*\+)?json$', content_type)):
9498
doc['contentType'] = 'application/ld+json'
9599
doc['documentUrl'] = jsonld.prepend_base(url, linked_alternate['target'])
100+
if link_follow_count >= max_link_follows:
101+
raise requests.TooManyRedirects(f"Exceeded maximum link header redirects ({max_link_follows})")
102+
return loader(doc['documentUrl'], options=options, link_follow_count=link_follow_count + 1)
96103
return doc
97104
except JsonLdError as e:
98105
raise e

0 commit comments

Comments
 (0)