|
7 | 7 | from six.moves.urllib.parse import urlparse, urlunparse |
8 | 8 | from six import unichr |
9 | 9 |
|
10 | | -from w3lib.html import remove_entities, remove_comments |
| 10 | +from w3lib.html import replace_entities, remove_comments |
11 | 11 | from w3lib.url import safe_url_string |
12 | 12 |
|
13 | 13 | from scrapely.htmlpage import HtmlPage, HtmlTag, HtmlTagType |
14 | 14 |
|
15 | | -#FIXME: the use of "." needs to be localized |
16 | 15 | _NUMERIC_ENTITIES = re.compile("&#([0-9]+)(?:;|\s)", re.U) |
17 | 16 | _PRICE_NUMBER_RE = re.compile('(?:^|[^a-zA-Z0-9])(\d+(?:\.\d+)?)(?:$|[^a-zA-Z0-9])') |
18 | 17 | _NUMBER_RE = re.compile('(-?\d+(?:\.\d+)?)') |
@@ -103,7 +102,7 @@ def text(region): |
103 | 102 | >>> t(u"<p>The text</p><?xml:namespace blabla/><p>is here</p>") |
104 | 103 | u'The text is here' |
105 | 104 | """ |
106 | | - text = remove_entities(region.text_content, encoding=region.htmlpage.encoding) |
| 105 | + text = replace_entities(region.text_content, encoding=region.htmlpage.encoding) |
107 | 106 | return _WS.sub(u' ', text).strip() |
108 | 107 |
|
109 | 108 |
|
@@ -272,7 +271,7 @@ def extract_number(txt): |
272 | 271 |
|
273 | 272 | It will handle unescaped entities: |
274 | 273 | >>> extract_number(u'&#163;129.99') |
275 | | - '129.99' |
| 274 | + u'129.99' |
276 | 275 | """ |
277 | 276 | txt = _NUMERIC_ENTITIES.sub(lambda m: unichr(int(m.groups()[0])), txt) |
278 | 277 | numbers = _NUMBER_RE.findall(txt) |
@@ -315,7 +314,7 @@ def extract_price(txt): |
315 | 314 | >>> extract_price('500 000,00') |
316 | 315 | '500000.00' |
317 | 316 | >>> extract_price(u'£129.99') |
318 | | - '129.99' |
| 317 | + u'129.99' |
319 | 318 | >>> extract_price('adsfg') |
320 | 319 | >>> extract_price('stained, linseed oil finish, clear glas doors') |
321 | 320 | >>> extract_price('') |
@@ -412,7 +411,7 @@ def image_url(txt): |
412 | 411 |
|
413 | 412 | """ |
414 | 413 | imgurl = extract_image_url(txt) |
415 | | - return [safe_url_string(remove_entities(url(imgurl)))] if imgurl else None |
| 414 | + return [safe_url_string(replace_entities(url(imgurl)))] if imgurl else None |
416 | 415 |
|
417 | 416 |
|
418 | 417 | def extract_image_url(txt): |
|
0 commit comments