6 files changed: +47 -3 lines changed
Original file line number Diff line number Diff line change
66import json
77import re
88
9+ import jstyleson
910import lxml .etree
1011
1112from extruct .utils import parse_html
@@ -34,8 +35,7 @@ def _extract_items(self, node):
3435 data = json .loads (script , strict = False )
3536 except ValueError :
3637 # sometimes JSON-decoding errors are due to leading HTML or JavaScript comments
37- data = json .loads (
38- HTML_OR_JS_COMMENTLINE .sub ('' , script ), strict = False )
38+ data = jstyleson .loads (HTML_OR_JS_COMMENTLINE .sub ('' , script ),strict = False )
3939 if isinstance (data , list ):
4040 return data
4141 elif isinstance (data , dict ):
Original file line number Diff line number Diff line change @@ -8,3 +8,4 @@ mf2py>=1.1.0
88six >= 1.11
99w3lib
1010html-text
11+ jstyleson
Original file line number Diff line number Diff line change @@ -34,7 +34,9 @@ def get_version():
3434 'mf2py' ,
3535 'w3lib' ,
3636 'html-text>=0.5.1' ,
37- 'six' ],
37+ 'six' ,
38+ 'jstyleson'
39+ ],
3840 extras_require = {
3941 'cli' : [
4042 'requests' ,
Original file line number Diff line number Diff line change
1+ <!DOCTYPE html>
2+ < html lang ="en ">
3+
4+ < head >
5+ < script type ="application/ld+json ">
6+
7+ {
8+ "@context" : "http://schema.org" ,
9+ "@type" : "NewsArticle" ,
10+ "thumbnailUrl" : "https://uc.udn.com.tw/photo/2019/11/11/99/7053890.jpg" ,
11+ "keywords" : "" ,
12+ "url" : "https://money.udn.com/money/story/5635/4158094" ,
13+ "mainEntityOfPage" : "https://money.udn.com/money/story/5635/4158094" ,
14+ "headline" : "讓AI挑出感興趣 SparkAmplify精準行銷當紅" ,
15+ "articleSection" : "商情" , // category
16+ //"interactionCount": ""
17+ }
18+
19+ </ script >
20+ </ head >
21+
22+ < body > </ body >
23+
24+ </ html >
Original file line number Diff line number Diff line change
1+ [
2+ {
3+ "@context" : "http://schema.org" ,
4+ "@type" : "NewsArticle" ,
5+ "thumbnailUrl" : "https://uc.udn.com.tw/photo/2019/11/11/99/7053890.jpg" ,
6+ "keywords" : "" ,
7+ "url" : "https://money.udn.com/money/story/5635/4158094" ,
8+ "mainEntityOfPage" : "https://money.udn.com/money/story/5635/4158094" ,
9+ "headline" : "讓AI挑出感興趣 SparkAmplify精準行銷當紅" ,
10+ "articleSection" : "商情"
11+ }
12+ ]
Original file line number Diff line number Diff line change @@ -40,6 +40,11 @@ def test_jsonld_with_control_characters_comment(self):
4040 self .assertJsonLdCorrect (
4141 folder = 'custom.invalid' ,
4242 page = 'JSONLD_with_control_characters_comment' )
43+
44+ def test_jsonld_with_json_including_js_comment (self ):
45+ self .assertJsonLdCorrect (
46+ folder = 'custom.invalid' ,
47+ page = 'JSONLD_with_JS_comment' )
4348
4449 def assertJsonLdCorrect (self , folder , page ):
4550 body , expected = self ._get_body_expected (folder , page )
You can’t perform that action at this time.
0 commit comments