Skip to content

Commit 0f300c4

Browse files
committed
Improved price extraction
Prices are now handled using a regexp, string operations and loops instead of a regexp alone, which was inaccurate in some cases. Added test cases. Removed an unnecessary regexp. Fixes scrapinghub/portia#212
1 parent 4e2a6ec commit 0f300c4

File tree

1 file changed

+47
-11
lines changed

1 file changed

+47
-11
lines changed

scrapely/extractors.py

Lines changed: 47 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,7 @@
1616
_NUMERIC_ENTITIES = re.compile("&#([0-9]+)(?:;|\s)", re.U)
1717
_PRICE_NUMBER_RE = re.compile('(?:^|[^a-zA-Z0-9])(\d+(?:\.\d+)?)(?:$|[^a-zA-Z0-9])')
1818
_NUMBER_RE = re.compile('(-?\d+(?:\.\d+)?)')
19-
_DECIMAL_RE = re.compile(r'(\d[\d\,]*(?:(?:\.\d+)|(?:)))', re.U | re.M)
20-
_VALPARTS_RE = re.compile("([\.,]?\d+)")
19+
_DECIMAL_RE = re.compile(r'(-?\d[\d\,\.]*)', re.U | re.M)
2120

2221
_IMAGES = (
2322
'mng', 'pct', 'bmp', 'gif', 'jpg', 'jpeg', 'png', 'pst', 'psp', 'tif',
@@ -273,7 +272,7 @@ def extract_number(txt):
273272
274273
It will handle unescaped entities:
275274
>>> extract_number(u'£129.99')
276-
u'129.99'
275+
'129.99'
277276
"""
278277
txt = _NUMERIC_ENTITIES.sub(lambda m: unichr(int(m.groups()[0])), txt)
279278
numbers = _NUMBER_RE.findall(txt)
@@ -295,22 +294,59 @@ def extract_price(txt):
295294
'2234'
296295
>>> extract_price('947')
297296
'947'
297+
>>> extract_price('-200,069,000,006.565456')
298+
'-200069000006.565456'
299+
>>> extract_price('1,000,000')
300+
'1000000'
301+
>>> extract_price('1,000,000.00')
302+
'1000000.00'
303+
>>> extract_price('1,000')
304+
'1000'
305+
>>> extract_price('1000,00')
306+
'1000.00'
307+
>>> extract_price('1,000.00')
308+
'1000.00'
309+
>>> extract_price('500,000.00')
310+
'500000.00'
311+
>>> extract_price('500.000,00')
312+
'500000.00'
313+
>>> extract_price('-500,000.00')
314+
'-500000.00'
315+
>>> extract_price('500 000,00')
316+
'500000.00'
317+
>>> extract_price(u'£129.99')
318+
'129.99'
298319
>>> extract_price('adsfg')
299320
>>> extract_price('stained, linseed oil finish, clear glas doors')
300321
>>> extract_price('')
301-
>>> extract_price(u'£129.99')
302-
u'129.99'
303322
"""
304323
txt = _NUMERIC_ENTITIES.sub(lambda m: unichr(int(m.groups()[0])), txt)
324+
txt = txt.replace(' ', '')
305325
m = _DECIMAL_RE.search(txt)
326+
POINT, COMMA = 0, 1
327+
decimal_separator = POINT
328+
306329
if m:
307330
value = m.group(1)
308-
parts = _VALPARTS_RE.findall(value)
309-
decimalpart = parts.pop(-1)
310-
if decimalpart[0] == "," and len(decimalpart) <= 3:
311-
decimalpart = decimalpart.replace(",", ".")
312-
value = "".join(parts + [decimalpart]).replace(",", "")
313-
return value
331+
last_point_idx = value.rfind('.')
332+
last_comma_idx = value.rfind(',')
333+
334+
# If a number has both separators, the one that appears last is the decimal separator
335+
if last_point_idx > 0 and last_comma_idx > 0:
336+
if last_comma_idx > last_point_idx:
337+
decimal_separator = COMMA
338+
# If a number has only commas, a single comma followed by at most two digits is the decimal separator
339+
elif last_comma_idx > 0:
340+
first_comma_idx = value.find(',')
341+
if (first_comma_idx == last_comma_idx and
342+
len(value) - last_comma_idx <= 3):
343+
decimal_separator = COMMA
344+
345+
if decimal_separator == POINT:
346+
value = value.replace(',', '')
347+
else:
348+
value = value.replace('.', '')
349+
return value.replace(',', '.')
314350

315351

316352
def url(txt):

0 commit comments

Comments
 (0)