Skip to content

Commit 7a8db78

Browse files
authored
Merge pull request #3302 from ParfenovS/main
2 parents 7e835a1 + e4aaaf5 commit 7a8db78

File tree

5 files changed

+275
-133
lines changed

5 files changed

+275
-133
lines changed

CHANGES.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,7 @@ linelists.cdms
156156

157157
- Add a keyword to control writing of new species cache files.
158158
This is needed to prevent tests from overwriting those files. [#3297]
159+
- Add more complete support for CDMS quantum number and other value parsing. [#3302]
159160

160161
mast
161162
^^^^

astroquery/linelists/cdms/core.py

Lines changed: 138 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
# import configurable items declared in __init__.py
1313
from astroquery.linelists.cdms import conf
1414
from astroquery.exceptions import InvalidQueryError, EmptyResponseError
15+
from astroquery import log
1516

1617
import re
1718
import string
@@ -31,7 +32,7 @@ class CDMSClass(BaseQuery):
3132
SERVER = conf.server
3233
CLASSIC_URL = conf.classic_server
3334
TIMEOUT = conf.timeout
34-
MALFORMATTED_MOLECULE_LIST = ['017506 NH3-wHFS', '028582 H2NC', '058501 H2C2S', '064527 HC3HCN']
35+
MALFORMATTED_MOLECULE_LIST = ['017506 NH3-wHFS', '028528 H2NC', '058501 H2C2S', '064527 HC3HCN']
3536

3637
def query_lines_async(self, min_frequency, max_frequency, *,
3738
min_strength=-500, molecule='All',
@@ -54,7 +55,8 @@ def query_lines_async(self, min_frequency, max_frequency, *,
5455
min_strength : int, optional
5556
Minimum strength in catalog units, the default is -500
5657
57-
molecule : list, string of regex if parse_name_locally=True, optional
58+
molecule : list or string if parse_name_locally=False,
59+
string of regex if parse_name_locally=True, optional
5860
Identifiers of the molecules to search for. If this parameter
5961
is not provided the search will match any species. Default is 'All'.
6062
As a first pass, the molecule will be searched for with a direct
@@ -134,18 +136,21 @@ def query_lines_async(self, min_frequency, max_frequency, *,
134136
# changes interpretation of query
135137
self._last_query_temperature = temperature_for_intensity
136138

137-
if molecule is not None:
138-
if parse_name_locally:
139-
self.lookup_ids = build_lookup()
140-
luts = self.lookup_ids.find(molecule, flags)
141-
if len(luts) == 0:
142-
raise InvalidQueryError('No matching species found. Please '
143-
'refine your search or read the Docs '
144-
'for pointers on how to search.')
145-
payload['Molecules'] = tuple(f"{val:06d} {key}"
146-
for key, val in luts.items())[0]
147-
else:
148-
payload['Molecules'] = molecule
139+
if molecule == 'All':
140+
payload['Moleculesgrp'] = 'all species'
141+
else:
142+
if molecule is not None:
143+
if parse_name_locally:
144+
self.lookup_ids = build_lookup()
145+
luts = self.lookup_ids.find(molecule, flags)
146+
if len(luts) == 0:
147+
raise InvalidQueryError('No matching species found. Please '
148+
'refine your search or read the Docs '
149+
'for pointers on how to search.')
150+
payload['Molecules'] = tuple(f"{val:06d} {key}"
151+
for key, val in luts.items())[0]
152+
else:
153+
payload['Molecules'] = molecule
149154

150155
if get_query_payload:
151156
return payload
@@ -180,7 +185,7 @@ def query_lines_async(self, min_frequency, max_frequency, *,
180185
# accounts for three formats, e.g.: '058501' or 'H2C2S' or '058501 H2C2S'
181186
badlist = (self.MALFORMATTED_MOLECULE_LIST + # noqa
182187
[y for x in self.MALFORMATTED_MOLECULE_LIST for y in x.split()])
183-
if payload['Molecules'] in badlist:
188+
if 'Moleculesgrp' not in payload.keys() and payload['Molecules'] in badlist:
184189
raise ValueError(f"Molecule {payload['Molecules']} is known not to comply with standard CDMS format. "
185190
f"Try get_molecule({payload['Molecules']}) instead.")
186191

@@ -233,15 +238,32 @@ def _parse_result(self, response, *, verbose=False):
233238
soup = BeautifulSoup(response.text, 'html.parser')
234239
text = soup.find('pre').text
235240

241+
need_to_filter_bad_molecules = False
242+
for bad_molecule in self.MALFORMATTED_MOLECULE_LIST:
243+
if text.find(bad_molecule.split()[1]) > -1:
244+
need_to_filter_bad_molecules = True
245+
break
246+
if need_to_filter_bad_molecules:
247+
text_new = ''
248+
text = text.split('\n')
249+
for line in text:
250+
need_to_include_line = True
251+
for bad_molecule in self.MALFORMATTED_MOLECULE_LIST:
252+
if line.find(bad_molecule.split()[1]) > -1:
253+
need_to_include_line = False
254+
break
255+
if need_to_include_line:
256+
text_new = text_new + '\n' + line
257+
text = text_new
258+
236259
starts = {'FREQ': 0,
237260
'ERR': 14,
238261
'LGINT': 25,
239262
'DR': 36,
240263
'ELO': 38,
241264
'GUP': 47,
242-
'MOLWT': 51,
243-
'TAG': 54,
244-
'QNFMT': 58,
265+
'TAG': 50,
266+
'QNFMT': 57,
245267
'Ju': 61,
246268
'Ku': 63,
247269
'vu': 65,
@@ -256,39 +278,47 @@ def _parse_result(self, response, *, verbose=False):
256278
'F3l': 83,
257279
'name': 89}
258280

259-
result = ascii.read(text, header_start=None, data_start=0,
260-
comment=r'THIS|^\s{12,14}\d{4,6}.*',
261-
names=list(starts.keys()),
262-
col_starts=list(starts.values()),
263-
format='fixed_width', fast_reader=False)
264-
265-
result['FREQ'].unit = u.MHz
266-
result['ERR'].unit = u.MHz
267-
268-
result['Lab'] = result['MOLWT'] < 0
269-
result['MOLWT'] = np.abs(result['MOLWT'])
270-
result['MOLWT'].unit = u.Da
271-
272-
fix_keys = ['GUP']
273-
for suf in 'ul':
274-
for qn in ('J', 'v', 'K', 'F1', 'F2', 'F3'):
275-
qnind = qn+suf
276-
fix_keys.append(qnind)
277-
for key in fix_keys:
278-
if not np.issubdtype(result[key].dtype, np.integer):
279-
intcol = np.array(list(map(parse_letternumber, result[key])),
280-
dtype=int)
281-
result[key] = intcol
282-
283-
# if there is a crash at this step, something went wrong with the query
284-
# and the _last_query_temperature was not set. This shouldn't ever
285-
# happen, but, well, I anticipate it will.
286-
if self._last_query_temperature == 0:
287-
result.rename_column('LGINT', 'LGAIJ')
288-
result['LGAIJ'].unit = u.s**-1
289-
else:
290-
result['LGINT'].unit = u.nm**2 * u.MHz
291-
result['ELO'].unit = u.cm**(-1)
281+
try:
282+
result = ascii.read(text, header_start=None, data_start=0,
283+
comment=r'THIS|^\s{12,14}\d{4,6}.*',
284+
names=list(starts.keys()),
285+
col_starts=list(starts.values()),
286+
format='fixed_width', fast_reader=False)
287+
288+
result['FREQ'].unit = u.MHz
289+
result['ERR'].unit = u.MHz
290+
291+
result['MOLWT'] = [int(x/1e3) for x in result['TAG']]
292+
result['Lab'] = result['MOLWT'] < 0
293+
result['MOLWT'] = np.abs(result['MOLWT'])
294+
result['MOLWT'].unit = u.Da
295+
296+
fix_keys = ['GUP']
297+
for suf in 'ul':
298+
for qn in ('J', 'v', 'K', 'F1', 'F2', 'F3'):
299+
qnind = qn+suf
300+
fix_keys.append(qnind)
301+
for key in fix_keys:
302+
if not np.issubdtype(result[key].dtype, np.integer):
303+
intcol = np.array(list(map(parse_letternumber, result[key])),
304+
dtype=int)
305+
result[key] = intcol
306+
307+
# if there is a crash at this step, something went wrong with the query
308+
# and the _last_query_temperature was not set. This shouldn't ever
309+
# happen, but, well, I anticipate it will.
310+
if self._last_query_temperature == 0:
311+
result.rename_column('LGINT', 'LGAIJ')
312+
result['LGAIJ'].unit = u.s**-1
313+
else:
314+
result['LGINT'].unit = u.nm**2 * u.MHz
315+
result['ELO'].unit = u.cm**(-1)
316+
except ValueError as ex:
317+
# Give users a more helpful exception when parsing fails
318+
new_message = ("Failed to parse CDMS response. This may be caused by a malformed search return. "
319+
"You can check this by running `CDMS.get_molecule('<id>')` instead; if it works, the "
320+
"problem is caused by the CDMS search interface and cannot be worked around.")
321+
raise ValueError(new_message) from ex
292322

293323
return result
294324

@@ -387,35 +417,50 @@ def tryfloat(x):
387417

388418
return result
389419

390-
def get_molecule(self, molecule_id, *, cache=True):
420+
def get_molecule(self, molecule_id, *, cache=True, return_response=False):
391421
"""
392422
Retrieve the whole molecule table for a given molecule id
423+
424+
Parameters
425+
----------
426+
molecule_id : str
427+
The 6-digit molecule identifier as a string
428+
cache : bool
429+
Defaults to True. If set, overrides the global caching behavior.
430+
See :ref:`caching documentation <astroquery_cache>`.
431+
return_response : bool, optional
432+
If True, return the raw `requests.Response` object instead of parsing
433+
the response. If this is set, the response will be returned whether
434+
or not it was successful. Default is False.
393435
"""
394436
if not isinstance(molecule_id, str) or len(molecule_id) != 6:
395437
raise ValueError("molecule_id should be a length-6 string of numbers")
396438
url = f'{self.CLASSIC_URL}/entries/c{molecule_id}.cat'
397439
response = self._request(method='GET', url=url,
398440
timeout=self.TIMEOUT, cache=cache)
399-
result = self._parse_cat(response)
441+
442+
if return_response:
443+
return response
444+
445+
response.raise_for_status()
446+
447+
if 'Zero lines were found' in response.text:
448+
raise EmptyResponseError(f"Response was empty; message was '{response.text}'.")
449+
450+
result = self._parse_cat(response.text)
400451

401452
species_table = self.get_species_table()
402453
result.meta = dict(species_table.loc[int(molecule_id)])
403454

404455
return result
405456

406-
def _parse_cat(self, response, *, verbose=False):
457+
def _parse_cat(self, text, *, verbose=False):
407458
"""
408459
Parse a catalog response into an `~astropy.table.Table`
409460
410461
See details in _parse_result; this is a very similar function,
411462
but the catalog responses have a slightly different format.
412463
"""
413-
414-
if 'Zero lines were found' in response.text:
415-
raise EmptyResponseError(f"Response was empty; message was '{response.text}'.")
416-
417-
text = response.text
418-
419464
# notes about the format
420465
# [F13.4, 2F8.4, I2, F10.4, I3, I7, I4, 12I2]: FREQ, ERR, LGINT, DR, ELO, GUP, TAG, QNFMT, QN noqa
421466
# 13 21 29 31 41 44 51 55 57 59 61 63 65 67 69 71 73 75 77 79 noqa
@@ -426,21 +471,21 @@ def _parse_cat(self, response, *, verbose=False):
426471
'ELO': 32,
427472
'GUP': 42,
428473
'TAG': 44,
429-
'QNFMT': 52,
430-
'Q1': 56,
431-
'Q2': 58,
432-
'Q3': 60,
433-
'Q4': 62,
434-
'Q5': 64,
435-
'Q6': 66,
436-
'Q7': 68,
437-
'Q8': 70,
438-
'Q9': 72,
439-
'Q10': 74,
440-
'Q11': 76,
441-
'Q12': 78,
442-
'Q13': 80,
443-
'Q14': 82,
474+
'QNFMT': 51,
475+
'Q1': 55,
476+
'Q2': 57,
477+
'Q3': 59,
478+
'Q4': 61,
479+
'Q5': 63,
480+
'Q6': 65,
481+
'Q7': 67,
482+
'Q8': 69,
483+
'Q9': 71,
484+
'Q10': 73,
485+
'Q11': 75,
486+
'Q12': 77,
487+
'Q13': 79,
488+
'Q14': 81,
444489
}
445490

446491
result = ascii.read(text, header_start=None, data_start=0,
@@ -450,7 +495,7 @@ def _parse_cat(self, response, *, verbose=False):
450495
format='fixed_width', fast_reader=False)
451496

452497
# int truncates - which is what we want
453-
result['MOLWT'] = [int(x/1e4) for x in result['TAG']]
498+
result['MOLWT'] = [int(x/1e3) for x in result['TAG']]
454499

455500
result['FREQ'].unit = u.MHz
456501
result['ERR'].unit = u.MHz
@@ -460,15 +505,18 @@ def _parse_cat(self, response, *, verbose=False):
460505
result['MOLWT'].unit = u.Da
461506

462507
fix_keys = ['GUP']
463-
for suf in '':
464-
for qn in (f'Q{ii}' for ii in range(1, 15)):
465-
qnind = qn+suf
466-
fix_keys.append(qnind)
508+
for qn in (f'Q{ii}' for ii in range(1, 15)):
509+
fix_keys.append(qn)
510+
log.debug(f"fix_keys: {fix_keys} should include Q1, Q2, ..., Q14 and GUP")
467511
for key in fix_keys:
468512
if not np.issubdtype(result[key].dtype, np.integer):
469513
intcol = np.array(list(map(parse_letternumber, result[key])),
470514
dtype=int)
515+
if any(intcol == -999999):
516+
intcol = np.ma.masked_where(intcol == -999999, intcol)
471517
result[key] = intcol
518+
if not np.issubdtype(result[key].dtype, np.integer):
519+
raise ValueError(f"Failed to parse {key} as integer")
472520

473521
result['LGINT'].unit = u.nm**2 * u.MHz
474522
result['ELO'].unit = u.cm**(-1)
@@ -481,18 +529,23 @@ def _parse_cat(self, response, *, verbose=False):
481529

482530
def parse_letternumber(st):
483531
"""
484-
Parse CDMS's two-letter QNs
532+
Parse CDMS's two-character QNs into integers.
533+
534+
Masked values are converted to -999999.
485535
486536
From the CDMS docs:
487537
"Exactly two characters are available for each quantum number. Therefore, half
488538
integer quanta are rounded up ! In addition, capital letters are used to
489-
indicate quantum numbers larger than 99. E. g. A0 is 100, Z9 is 359. Small
490-
types are used to signal corresponding negative quantum numbers."
539+
indicate quantum numbers larger than 99. E. g. A0 is 100, Z9 is 359. Lower case characters
540+
are used similarly to signal negative quantum numbers smaller than –9. e. g., a0 is –10, b0 is –20, etc."
491541
"""
542+
if np.ma.is_masked(st):
543+
return -999999
544+
492545
asc = string.ascii_lowercase
493546
ASC = string.ascii_uppercase
494-
newst = ''.join(['-' + str(asc.index(x)+10) if x in asc else
495-
str(ASC.index(x)+10) if x in ASC else
547+
newst = ''.join(['-' + str((asc.index(x)+1)) if x in asc else
548+
str((ASC.index(x)+10)) if x in ASC else
496549
x for x in st])
497550
return int(newst)
498551

astroquery/linelists/cdms/tests/test_cdms.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ def test_query(patch_post):
8383
assert tbl['LGINT'][0] == -7.1425
8484
assert tbl['GUP'][0] == 3
8585
assert tbl['GUP'][7] == 17
86+
assert tbl['MOLWT'][0] == 28
8687

8788

8889
def test_parseletternumber():
@@ -99,9 +100,12 @@ def test_parseletternumber():
99100
assert parse_letternumber("Z9") == 359
100101

101102
# inferred?
102-
assert parse_letternumber("z9") == -359
103+
assert parse_letternumber("a0") == -10
104+
assert parse_letternumber("b0") == -20
103105
assert parse_letternumber("ZZ") == 3535
104106

107+
assert parse_letternumber(np.ma.masked) == -999999
108+
105109

106110
def test_hc7s(patch_post):
107111
"""

0 commit comments

Comments
 (0)