Skip to content

Commit f0b4777

Browse files
authored
Merge pull request #103 from hackrush01/master
Improved price extraction
2 parents 4e2a6ec + 0f300c4 commit f0b4777

File tree

1 file changed

+47
-11
lines changed

1 file changed

+47
-11
lines changed

scrapely/extractors.py

Lines changed: 47 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -16,8 +16,7 @@
1616
_NUMERIC_ENTITIES = re.compile("&#([0-9]+)(?:;|\s)", re.U)
1717
_PRICE_NUMBER_RE = re.compile('(?:^|[^a-zA-Z0-9])(\d+(?:\.\d+)?)(?:$|[^a-zA-Z0-9])')
1818
_NUMBER_RE = re.compile('(-?\d+(?:\.\d+)?)')
19-
_DECIMAL_RE = re.compile(r'(\d[\d\,]*(?:(?:\.\d+)|(?:)))', re.U | re.M)
20-
_VALPARTS_RE = re.compile("([\.,]?\d+)")
19+
_DECIMAL_RE = re.compile(r'(-?\d[\d\,\.]*)', re.U | re.M)
2120

2221
_IMAGES = (
2322
'mng', 'pct', 'bmp', 'gif', 'jpg', 'jpeg', 'png', 'pst', 'psp', 'tif',
@@ -273,7 +272,7 @@ def extract_number(txt):
273272
274273
It will handle unescaped entities:
275274
>>> extract_number(u'£129.99')
276-
u'129.99'
275+
'129.99'
277276
"""
278277
txt = _NUMERIC_ENTITIES.sub(lambda m: unichr(int(m.groups()[0])), txt)
279278
numbers = _NUMBER_RE.findall(txt)
@@ -295,22 +294,59 @@ def extract_price(txt):
295294
'2234'
296295
>>> extract_price('947')
297296
'947'
297+
>>> extract_price('-200,069,000,006.565456')
298+
'-200069000006.565456'
299+
>>> extract_price('1,000,000')
300+
'1000000'
301+
>>> extract_price('1,000,000.00')
302+
'1000000.00'
303+
>>> extract_price('1,000')
304+
'1000'
305+
>>> extract_price('1000,00')
306+
'1000.00'
307+
>>> extract_price('1,000.00')
308+
'1000.00'
309+
>>> extract_price('500,000.00')
310+
'500000.00'
311+
>>> extract_price('500.000,00')
312+
'500000.00'
313+
>>> extract_price('-500,000.00')
314+
'-500000.00'
315+
>>> extract_price('500 000,00')
316+
'500000.00'
317+
>>> extract_price(u'£129.99')
318+
'129.99'
298319
>>> extract_price('adsfg')
299320
>>> extract_price('stained, linseed oil finish, clear glas doors')
300321
>>> extract_price('')
301-
>>> extract_price(u'£129.99')
302-
u'129.99'
303322
"""
304323
txt = _NUMERIC_ENTITIES.sub(lambda m: unichr(int(m.groups()[0])), txt)
324+
txt = txt.replace(' ', '')
305325
m = _DECIMAL_RE.search(txt)
326+
POINT, COMMA = 0, 1
327+
decimal_separator = POINT
328+
306329
if m:
307330
value = m.group(1)
308-
parts = _VALPARTS_RE.findall(value)
309-
decimalpart = parts.pop(-1)
310-
if decimalpart[0] == "," and len(decimalpart) <= 3:
311-
decimalpart = decimalpart.replace(",", ".")
312-
value = "".join(parts + [decimalpart]).replace(",", "")
313-
return value
331+
last_point_idx = value.rfind('.')
332+
last_comma_idx = value.rfind(',')
333+
334+
# If a number has both separators take the last one
335+
if last_point_idx > 0 and last_comma_idx > 0:
336+
if last_comma_idx > last_point_idx:
337+
decimal_separator = COMMA
338+
# If a number has only commas check the last one
339+
elif last_comma_idx > 0:
340+
first_comma_idx = value.find(',')
341+
if (first_comma_idx == last_comma_idx and
342+
len(value) - last_comma_idx <= 3):
343+
decimal_separator = COMMA
344+
345+
if decimal_separator == POINT:
346+
value = value.replace(',', '')
347+
else:
348+
value = value.replace('.', '')
349+
return value.replace(',', '.')
314350

315351

316352
def url(txt):

0 commit comments

Comments
 (0)