1616_NUMERIC_ENTITIES = re .compile ("&#([0-9]+)(?:;|\s)" , re .U )
1717_PRICE_NUMBER_RE = re .compile ('(?:^|[^a-zA-Z0-9])(\d+(?:\.\d+)?)(?:$|[^a-zA-Z0-9])' )
1818_NUMBER_RE = re .compile ('(-?\d+(?:\.\d+)?)' )
19- _DECIMAL_RE = re .compile (r'(\d[\d\,]*(?:(?:\.\d+)|(?:)))' , re .U | re .M )
20- _VALPARTS_RE = re .compile ("([\.,]?\d+)" )
19+ _DECIMAL_RE = re .compile (r'(-?\d[\d\,\.]*)' , re .U | re .M )
2120
2221_IMAGES = (
2322 'mng' , 'pct' , 'bmp' , 'gif' , 'jpg' , 'jpeg' , 'png' , 'pst' , 'psp' , 'tif' ,
@@ -273,7 +272,7 @@ def extract_number(txt):
273272
274273 It will handle unescaped entities:
275274 >>> extract_number(u'£129.99')
276- u '129.99'
275+ '129.99'
277276 """
278277 txt = _NUMERIC_ENTITIES .sub (lambda m : unichr (int (m .groups ()[0 ])), txt )
279278 numbers = _NUMBER_RE .findall (txt )
@@ -295,22 +294,59 @@ def extract_price(txt):
295294 '2234'
296295 >>> extract_price('947')
297296 '947'
297+ >>> extract_price('-200,069,000,006.565456')
298+ '-200069000006.565456'
299+ >>> extract_price('1,000,000')
300+ '1000000'
301+ >>> extract_price('1,000,000.00')
302+ '1000000.00'
303+ >>> extract_price('1,000')
304+ '1000'
305+ >>> extract_price('1000,00')
306+ '1000.00'
307+ >>> extract_price('1,000.00')
308+ '1000.00'
309+ >>> extract_price('500,000.00')
310+ '500000.00'
311+ >>> extract_price('500.000,00')
312+ '500000.00'
313+ >>> extract_price('-500,000.00')
314+ '-500000.00'
315+ >>> extract_price('500 000,00')
316+ '500000.00'
317+ >>> extract_price(u'£129.99')
318+ '129.99'
298319 >>> extract_price('adsfg')
299320 >>> extract_price('stained, linseed oil finish, clear glas doors')
300321 >>> extract_price('')
301- >>> extract_price(u'£129.99')
302- u'129.99'
303322 """
304323 txt = _NUMERIC_ENTITIES .sub (lambda m : unichr (int (m .groups ()[0 ])), txt )
324+ txt = txt .replace (' ' , '' )
305325 m = _DECIMAL_RE .search (txt )
326+ POINT , COMMA = 0 , 1
327+ decimal_separator = POINT
328+
306329 if m :
307330 value = m .group (1 )
308- parts = _VALPARTS_RE .findall (value )
309- decimalpart = parts .pop (- 1 )
310- if decimalpart [0 ] == "," and len (decimalpart ) <= 3 :
311- decimalpart = decimalpart .replace ("," , "." )
312- value = "" .join (parts + [decimalpart ]).replace ("," , "" )
313- return value
331+ last_point_idx = value .rfind ('.' )
332+ last_comma_idx = value .rfind (',' )
333+
334+ # If a number has both separators take the last one
335+ if last_point_idx > 0 and last_comma_idx > 0 :
336+ if last_comma_idx > last_point_idx :
337+ decimal_separator = COMMA
338+ # If a number has only commas check the last one
339+ elif last_comma_idx > 0 :
340+ first_comma_idx = value .find (',' )
341+ if (first_comma_idx == last_comma_idx and
342+ len (value ) - last_comma_idx <= 3 ):
343+ decimal_separator = COMMA
344+
345+ if decimal_separator == POINT :
346+ value = value .replace (',' , '' )
347+ else :
348+ value = value .replace ('.' , '' )
349+ return value .replace (',' , '.' )
314350
315351
316352def url (txt ):
0 commit comments