Skip to content

Commit 7f424fa

Browse files
authored
Merge pull request #104 from hackrush01/master
Replaced deprecated functions
2 parents f0b4777 + 4de23f7 commit 7f424fa

File tree

1 file changed

+5
-6
lines changed

1 file changed

+5
-6
lines changed

scrapely/extractors.py

Lines changed: 5 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -7,12 +7,11 @@
77
from six.moves.urllib.parse import urlparse, urlunparse
88
from six import unichr
99

10-
from w3lib.html import remove_entities, remove_comments
10+
from w3lib.html import replace_entities, remove_comments
1111
from w3lib.url import safe_url_string
1212

1313
from scrapely.htmlpage import HtmlPage, HtmlTag, HtmlTagType
1414

15-
#FIXME: the use of "." needs to be localized
1615
_NUMERIC_ENTITIES = re.compile("&#([0-9]+)(?:;|\s)", re.U)
1716
_PRICE_NUMBER_RE = re.compile('(?:^|[^a-zA-Z0-9])(\d+(?:\.\d+)?)(?:$|[^a-zA-Z0-9])')
1817
_NUMBER_RE = re.compile('(-?\d+(?:\.\d+)?)')
@@ -103,7 +102,7 @@ def text(region):
103102
>>> t(u"<p>The text</p><?xml:namespace blabla/><p>is here</p>")
104103
u'The text is here'
105104
"""
106-
text = remove_entities(region.text_content, encoding=region.htmlpage.encoding)
105+
text = replace_entities(region.text_content, encoding=region.htmlpage.encoding)
107106
return _WS.sub(u' ', text).strip()
108107

109108

@@ -272,7 +271,7 @@ def extract_number(txt):
272271
273272
It will handle unescaped entities:
274273
>>> extract_number(u'&#163;129&#46;99')
275-
'129.99'
274+
u'129.99'
276275
"""
277276
txt = _NUMERIC_ENTITIES.sub(lambda m: unichr(int(m.groups()[0])), txt)
278277
numbers = _NUMBER_RE.findall(txt)
@@ -315,7 +314,7 @@ def extract_price(txt):
315314
>>> extract_price('500 000,00')
316315
'500000.00'
317316
>>> extract_price(u'&#163;129&#46;99')
318-
'129.99'
317+
u'129.99'
319318
>>> extract_price('adsfg')
320319
>>> extract_price('stained, linseed oil finish, clear glas doors')
321320
>>> extract_price('')
@@ -412,7 +411,7 @@ def image_url(txt):
412411
413412
"""
414413
imgurl = extract_image_url(txt)
415-
return [safe_url_string(remove_entities(url(imgurl)))] if imgurl else None
414+
return [safe_url_string(replace_entities(url(imgurl)))] if imgurl else None
416415

417416

418417
def extract_image_url(txt):

0 commit comments

Comments
 (0)