|
| 1 | +from __future__ import unicode_literals |
| 2 | + |
| 3 | +import collections |
| 4 | +import os |
| 5 | + |
| 6 | +import numpy as np |
| 7 | +import pandas as pd |
| 8 | +import pandas.compat as compat |
| 9 | + |
| 10 | +from pandas_datareader.io.util import _read_content |
| 11 | + |
| 12 | + |
# SDMX 2.1 / XML namespace prefixes, written in the "{uri}" form that
# ElementTree uses for namespace-qualified tag and attribute names.
_STRUCTURE = '{http://www.sdmx.org/resources/sdmxml/schemas/v2_1/structure}'
_MESSAGE = '{http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message}'
_GENERIC = '{http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic}'
_COMMON = '{http://www.sdmx.org/resources/sdmxml/schemas/v2_1/common}'
_XML = '{http://www.w3.org/XML/1998/namespace}'

# Fully qualified element tags looked up while parsing data messages.
_DATASET = '{0}DataSet'.format(_MESSAGE)
_SERIES = '{0}Series'.format(_GENERIC)
_SERIES_KEY = '{0}SeriesKey'.format(_GENERIC)
_OBSERVATION = '{0}Obs'.format(_GENERIC)
_VALUE = '{0}Value'.format(_GENERIC)
_OBSDIMENSION = '{0}ObsDimension'.format(_GENERIC)
_OBSVALUE = '{0}ObsValue'.format(_GENERIC)

# Fully qualified element tags looked up while parsing DSD messages.
_CODE = '{0}Code'.format(_STRUCTURE)
_TIMEDIMENSION = '{0}TimeDimension'.format(_STRUCTURE)
| 28 | + |
| 29 | + |
def read_sdmx(path_or_buf, dtype='float64', dsd=None):
    """
    Convert a SDMX-XML string to pandas object

    Parameters
    ----------
    path_or_buf : a valid SDMX-XML string or file-like
        https://webgate.ec.europa.eu/fpfis/mwikis/sdmx/index.php/Main_Page
    dtype : str
        dtype to coerce values
    dsd : namedtuple (SDMXCode), optional
        parsed DSD corresponding to the SDMX-XML data; its ``codes`` are
        used to relabel series keys and its ``ts`` to pick time indexes

    Returns
    -------
    results : DataFrame
        One column per 'Series' element (columns form a MultiIndex built
        from the series keys), one row per observation dimension value.

    Raises
    ------
    ValueError
        If a required element is missing or duplicated, or if the data
        set contains no 'Series'.
    """

    content = _read_content(path_or_buf)

    import xml.etree.ElementTree as ET
    root = ET.fromstring(content)

    # The header names the dimension used as the observation index.
    header_structure = _get_child(root, _MESSAGE + 'Structure')
    idx_name = header_structure.get('dimensionAtObservation')
    dataset = _get_child(root, _DATASET)

    # Parse key and observations together, series by series, so that a
    # malformed series is reported in document order.
    parsed = [(_parse_series_key(series),
               _parse_observations(series.iter(_OBSERVATION)))
              for series in dataset.iter(_SERIES)]
    keys = [key for key, _ in parsed]
    obss = [obs for _, obs in parsed]

    columns = _construct_index(keys, dsd=dsd)
    series_list = _construct_series(obss, name=idx_name, dsd=dsd)

    # Each Series becomes a row first; transpose to get one column each.
    frame = pd.DataFrame(series_list, dtype=dtype).T
    frame.columns = columns

    return frame
| 74 | + |
| 75 | + |
def _construct_series(values, name, dsd=None):
    """
    Build one Series per parsed observation list

    Parameters
    ----------
    values : list of list of tuple
        For every series, its observations as (index label, value) pairs
    name : str
        Name given to every index; also decides whether the labels are
        treated as times
    dsd : object with a ``ts`` attribute, optional
        ``ts`` lists the names to be handled as times

    Returns
    -------
    results : list of Series

    Raises
    ------
    ValueError
        If ``values`` is empty.
    """

    # ts defines attributes to be handled as times
    time_names = [] if dsd is None else dsd.ts

    if len(values) == 0:
        raise ValueError("Data contains no 'Series'")

    # The index flavour depends only on ``name``, so pick it once.
    index_type = pd.DatetimeIndex if name in time_names else pd.Index

    series_list = []
    for pairs in values:
        labels = [pair[0] for pair in pairs]
        index = index_type(labels, name=name)
        data = [pair[1] for pair in pairs]
        series_list.append(pd.Series(data, index=index))
    return series_list
| 93 | + |
| 94 | + |
def _construct_index(keys, dsd=None):
    """
    Build the column MultiIndex from the parsed series keys

    Parameters
    ----------
    keys : list of list of tuple
        For every series, its key as (name, code) pairs
    dsd : object with a ``codes`` attribute, optional
        ``codes`` maps a name to a dict translating codes into labels

    Returns
    -------
    results : MultiIndex
        One level per name found in the first key, in that order.

    Raises
    ------
    ValueError
        If ``keys`` is empty.
    """

    # code defines a mapping to key's internal code to its representation
    code_maps = {} if dsd is None else dsd.codes

    if len(keys) == 0:
        raise ValueError("Data contains no 'Series'")

    # Level names and their order come from the first series key.
    level_names = [pair[0] for pair in keys[0]]

    level_values = {}
    for key in keys:
        for level, code in key:
            # apply DSD; codes without a translation are kept as-is
            try:
                label = code_maps[level][code]
            except KeyError:
                label = code
            level_values.setdefault(level, []).append(label)

    arrays = [level_values[level] for level in level_names]
    return pd.MultiIndex.from_arrays(arrays, names=level_names)
| 120 | + |
| 121 | + |
def _parse_observations(observations):
    """
    Extract (dimension value, observed value) pairs from 'Obs' elements

    Each observation must contain exactly one ObsDimension and exactly one
    ObsValue element; ``_get_child`` raises ValueError otherwise.
    """

    def _as_pair(observation):
        dimension = _get_child(observation, _OBSDIMENSION)
        measure = _get_child(observation, _OBSVALUE)
        return dimension.get('value'), measure.get('value')

    # return list of key/value tuple, eg: [(key, value), ...]
    return [_as_pair(observation) for observation in observations]
| 130 | + |
| 131 | + |
def _parse_series_key(series):
    """
    Extract the (id, value) pairs of a 'Series' element's SeriesKey

    The series must contain exactly one SeriesKey element; ``_get_child``
    raises ValueError otherwise.
    """
    key_element = _get_child(series, _SERIES_KEY)

    pairs = []
    for item in key_element.iter(_VALUE):
        pairs.append((item.get('id'), item.get('value')))

    # return list of key/value tuple, eg: [(key, value), ...]
    return pairs
| 138 | + |
| 139 | + |
def _get_child(element, key):
    """
    Return the single element tagged ``key`` found by ``element.iter``

    Parameters
    ----------
    element : Element
        Element whose tree is searched with ``element.iter(key)``
    key : str
        Tag to look for

    Returns
    -------
    results : Element

    Raises
    ------
    ValueError
        If no match or more than one match is found.
    """
    matches = list(element.iter(key))
    count = len(matches)

    if count == 0:
        raise ValueError("Element {0} contains no {1}".format(element.tag, key))
    if count != 1:
        raise ValueError("Element {0} contains multiple {1}".format(element.tag, key))
    return matches[0]
| 148 | + |
| 149 | + |
# ElementTree path selecting ``Name`` elements (common namespace) anywhere
# below the context element whose ``xml:lang`` attribute equals 'en'.
_NAME_EN = ".//{0}Name[@{1}lang='en']".format(_COMMON, _XML)


def _get_english_name(element):
    """
    Return the text of the first English ``Name`` found under ``element``

    NOTE(review): when nothing matches, ``find`` returns None and the
    attribute access fails with AttributeError -- confirm callers only pass
    elements that carry an English name.
    """
    english = element.find(_NAME_EN)
    return english.text
| 156 | + |
| 157 | + |
| 158 | + |
# Parsed DSD content: ``codes`` holds the code mappings and ``ts`` the names
# to be handled as times.
SDMXCode = collections.namedtuple('SDMXCode', 'codes ts')
| 160 | + |
| 161 | + |
def _read_sdmx_dsd(path_or_buf):
    """
    Convert a SDMX-XML DSD string to its code mappings and time dimensions

    Parameters
    ----------
    path_or_buf : a valid SDMX-XML DSD string or file-like
        https://webgate.ec.europa.eu/fpfis/mwikis/sdmx/index.php/Main_Page

    Returns
    -------
    results : namedtuple (SDMXCode)
        ``codes`` maps each codelist's English name to a dict translating
        code ids into English code names; ``ts`` lists the ids of the
        TimeDimension elements.

    Raises
    ------
    ValueError
        If the Structures, Codelists or DataStructures element is missing
        or appears more than once.
    """

    xdata = _read_content(path_or_buf)

    # xml.etree.cElementTree was removed in Python 3.9; ElementTree is the
    # supported module and is what read_sdmx already uses.
    import xml.etree.ElementTree as ET
    root = ET.fromstring(xdata)

    structure = _get_child(root, _MESSAGE + 'Structures')
    codes = _get_child(structure, _STRUCTURE + 'Codelists')
    datastructures = _get_child(structure, _STRUCTURE + 'DataStructures')

    code_results = {}
    # Iterating the Codelists element yields its direct children.
    for codelist in codes:
        codelist_name = _get_english_name(codelist)
        code_results[codelist_name] = {
            code.get('id'): _get_english_name(code)
            for code in codelist.iter(_CODE)
        }

    times = [t.get('id') for t in datastructures.iter(_TIMEDIMENSION)]

    return SDMXCode(codes=code_results, ts=times)
0 commit comments