Skip to content

Commit ccc5aa9

Browse files
Merge pull request #115 from sinhrks/eurostat
ENH: Support Eurostat
2 parents 402cd7f + 602b8d3 commit ccc5aa9

File tree

14 files changed

+776
-7
lines changed

14 files changed

+776
-7
lines changed

docs/source/remote_data.rst

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ Currently the following sources are supported:
2929
- :ref:`Kenneth French's data library<remote_data.ff>`
3030
- :ref:`World Bank<remote_data.wb>`
3131
- :ref:`OECD<remote_data.oecd>`
32+
- :ref:`Eurostat<remote_data.eurostat>`
3233

3334
It should be noted, that various sources support different kinds of data, so not all sources implement the same methods and the data elements returned might also differ.
3435

@@ -365,3 +366,22 @@ example is to download "Trade Union Density" data which set code is "UN_DEN".
365366
366367
df[['Japan', 'United States']]
367368
369+
.. _remote_data.eurostat:
370+
371+
Eurostat
372+
========
373+
374+
`Eurostat <http://ec.europa.eu/eurostat/>`__ data are available via ``DataReader``.
375+
376+
Get `Rail accidents by type of accident (ERA data) <http://appsso.eurostat.ec.europa.eu/nui/show.do?dataset=tran_sf_railac&lang=en>`_ data. The result will be a ``DataFrame`` which has ``DatetimeIndex`` as index and ``MultiIndex`` of attributes or countries as column. The target URL is:
377+
378+
* http://appsso.eurostat.ec.europa.eu/nui/show.do?dataset=tran_sf_railac&lang=en
379+
380+
You can specify dataset ID "tran_sf_railac" to get corresponding data via ``DataReader``.
381+
382+
.. ipython:: python
383+
384+
import pandas_datareader.data as web
385+
386+
df = web.DataReader("tran_sf_railac", 'eurostat')
387+
df

docs/source/whatsnew.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,5 +18,6 @@ What's New
1818

1919
These are new features and improvements of note in each release.
2020

21+
.. include:: whatsnew/v0.2.1.txt
2122
.. include:: whatsnew/v0.2.0.txt
2223

docs/source/whatsnew/v0.2.0.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,12 @@ New features
2222
- Added get_available_datasets to famafrench (:issue:`56`).
2323
- ``DataReader`` now supports OECD data sources, see :ref:`here<remote_data.oecd>` (:issue:`101`).
2424

25-
.. _whatsnew_0170.api:
25+
.. _whatsnew_020.api_breaking:
2626

2727
Backwards incompatible API changes
2828
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2929

30-
.. _whatsnew_020.api_breaking:
30+
3131
- Fama French indexes are now pandas.PeriodIndex for annual and monthly data, and
3232
pandas.DatetimeIndex otherwise (:issue:`56`).
3333

docs/source/whatsnew/v0.2.1.txt

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
.. _whatsnew_021:
2+
3+
v0.2.1 (XXX)
4+
----------------------------
5+
6+
This is a minor release from 0.2.0 and includes new features and a number of bug fixes.
7+
8+
9+
Highlights include:
10+
11+
12+
.. contents:: What's new in v0.2.1
13+
:local:
14+
:backlinks: none
15+
16+
.. _whatsnew_021.enhancements:
17+
18+
New features
19+
~~~~~~~~~~~~
20+
21+
- ``DataReader`` now supports Eurostat data sources, see :ref:`here<remote_data.eurostat>` (:issue:`101`).
22+
23+
.. _whatsnew_021.api_breaking:
24+
25+
Backwards incompatible API changes
26+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
27+
28+
.. _whatsnew_021.bug_fixes:
29+
30+
Bug Fixes
31+
~~~~~~~~~
32+

pandas_datareader/data.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from pandas_datareader.yahoo.components import _get_data as get_components_yahoo
1414
from pandas_datareader.yahoo.options import Options as YahooOptions
1515

16+
from pandas_datareader.eurostat import EurostatReader
1617
from pandas_datareader.fred import FredReader
1718
from pandas_datareader.famafrench import FamaFrenchReader
1819
from pandas_datareader.oecd import OECDReader
@@ -117,6 +118,10 @@ def DataReader(name, data_source=None, start=None, end=None,
117118
return OECDReader(symbols=name, start=start, end=end,
118119
retry_count=retry_count, pause=pause,
119120
session=session).read()
121+
elif data_source == "eurostat":
122+
return EurostatReader(symbols=name, start=start, end=end,
123+
retry_count=retry_count, pause=pause,
124+
session=session).read()
120125
else:
121126
raise NotImplementedError(
122127
"data_source=%r is not implemented" % data_source)

pandas_datareader/eurostat.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
from __future__ import unicode_literals
2+
3+
import pandas as pd
4+
import pandas.compat as compat
5+
6+
from pandas_datareader.io.sdmx import read_sdmx, _read_sdmx_dsd
7+
from pandas_datareader.base import _BaseReader
8+
9+
10+
class EurostatReader(_BaseReader):

    """
    Get data for the given name from Eurostat.

    ``symbols`` must be a single dataset ID string; it is interpolated into
    both the data URL and the data structure definition (DSD) URL.
    """

    _URL = 'http://www.ec.europa.eu/eurostat/SDMX/diss-web/rest'

    def _require_string_symbols(self):
        """Return ``self.symbols``, raising ValueError unless it is a string."""
        if isinstance(self.symbols, compat.string_types):
            return self.symbols
        raise ValueError('data name must be string')

    @property
    def url(self):
        """URL of the SDMX data message for the requested dataset."""
        dataset = self._require_string_symbols()
        return '{0}/data/{1}/?'.format(self._URL, dataset)

    @property
    def dsd_url(self):
        """URL of the data structure definition (DSD) for the dataset."""
        dataset = self._require_string_symbols()
        return '{0}/datastructure/ESTAT/DSD_{1}'.format(self._URL, dataset)

    def _read_one_data(self, url, params):
        """
        Download the DSD and the data at ``url`` and return the parsed data.

        ``params`` is accepted but not used by this reader.
        """
        # the DSD is fetched first so codes can be mapped to their names
        dsd_response = self._get_response(self.dsd_url)
        dsd = _read_sdmx_dsd(dsd_response.content)

        data_response = self._get_response(url)
        data = read_sdmx(data_response.content, dsd=dsd)

        # Best effort: if the index cannot be handled as dates (ValueError),
        # the data is returned in whatever state it reached. Each step is
        # assigned separately so partial progress is kept.
        try:
            data.index = pd.to_datetime(data.index)
            data = data.sort_index()
            data = data.truncate(self.start, self.end)
        except ValueError:
            pass
        return data
44+

pandas_datareader/io/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
1-
from pandas_datareader.io.jsdmx import read_jsdmx
1+
from pandas_datareader.io.jsdmx import read_jsdmx
2+
from pandas_datareader.io.sdmx import read_sdmx

pandas_datareader/io/sdmx.py

Lines changed: 203 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,203 @@
1+
from __future__ import unicode_literals
2+
3+
import collections
4+
import os
5+
6+
import numpy as np
7+
import pandas as pd
8+
import pandas.compat as compat
9+
10+
from pandas_datareader.io.util import _read_content
11+
12+
13+
# XML namespaces, written in ElementTree's ``{uri}`` prefix form.
_XML = '{http://www.w3.org/XML/1998/namespace}'
_COMMON = '{http://www.sdmx.org/resources/sdmxml/schemas/v2_1/common}'
_MESSAGE = '{http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message}'
_GENERIC = '{http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic}'
_STRUCTURE = '{http://www.sdmx.org/resources/sdmxml/schemas/v2_1/structure}'

# Qualified tag in the message namespace.
_DATASET = '{0}DataSet'.format(_MESSAGE)

# Qualified tags in the generic data namespace.
_SERIES = '{0}Series'.format(_GENERIC)
_SERIES_KEY = '{0}SeriesKey'.format(_GENERIC)
_OBSERVATION = '{0}Obs'.format(_GENERIC)
_VALUE = '{0}Value'.format(_GENERIC)
_OBSDIMENSION = '{0}ObsDimension'.format(_GENERIC)
_OBSVALUE = '{0}ObsValue'.format(_GENERIC)

# Qualified tags in the structure namespace.
_CODE = '{0}Code'.format(_STRUCTURE)
_TIMEDIMENSION = '{0}TimeDimension'.format(_STRUCTURE)
28+
29+
30+
def read_sdmx(path_or_buf, dtype='float64', dsd=None):
    """
    Convert a SDMX-XML string to pandas object

    Parameters
    ----------
    path_or_buf : a valid SDMX-XML string or file-like
        https://webgate.ec.europa.eu/fpfis/mwikis/sdmx/index.php/Main_Page
    dtype : str
        dtype to coerce values
    dsd : object with ``codes`` and ``ts`` attributes, optional
        parsed DSD corresponding to the SDMX-XML data (for example the
        result of ``_read_sdmx_dsd``)

    Returns
    -------
    results : DataFrame
        one column per 'Series' element, with columns given by the
        MultiIndex built from the series keys

    Raises
    ------
    ValueError
        if a required element is missing or duplicated, or the data
        contains no 'Series'
    """

    content = _read_content(path_or_buf)

    import xml.etree.ElementTree as ET
    root = ET.fromstring(content)

    # the observation dimension id is used to name each series' index
    structure = _get_child(root, _MESSAGE + 'Structure')
    idx_name = structure.get('dimensionAtObservation')
    dataset = _get_child(root, _DATASET)

    # parse key and observations of each series together, in document order
    parsed = [(_parse_series_key(series),
               _parse_observations(series.iter(_OBSERVATION)))
              for series in dataset.iter(_SERIES)]
    keys = [key for key, _ in parsed]
    obss = [obs for _, obs in parsed]

    mcols = _construct_index(keys, dsd=dsd)
    mseries = _construct_series(obss, name=idx_name, dsd=dsd)

    # one row per series at construction; transpose so series become columns
    df = pd.DataFrame(mseries, dtype=dtype).T
    df.columns = mcols

    return df
74+
75+
76+
def _construct_series(values, name, dsd=None):
    """
    Build one Series per parsed SDMX series.

    Parameters
    ----------
    values : list of list of tuple
        for each series, its observations as ``[(key, value), ...]``
    name : str
        name given to the index of every resulting Series
    dsd : object with a ``ts`` attribute, optional
        ``ts`` lists the names to be handled as times

    Returns
    -------
    results : list of Series

    Raises
    ------
    ValueError
        if ``values`` is empty
    """

    # ts defines attributes to be handled as times
    times = [] if dsd is None else dsd.ts

    if len(values) < 1:
        raise ValueError("Data contains no 'Series'")

    # the index type depends only on ``name``, so choose it once
    make_index = pd.DatetimeIndex if name in times else pd.Index

    return [pd.Series([obs[1] for obs in value],
                      index=make_index([obs[0] for obs in value], name=name))
            for value in values]
93+
94+
95+
def _construct_index(keys, dsd=None):
    """
    Build the MultiIndex describing every parsed SDMX series.

    Parameters
    ----------
    keys : list of list of tuple
        for each series, its key as ``[(name, value), ...]``
    dsd : object with a ``codes`` attribute, optional
        ``codes[name][value]``, when present, replaces the raw ``value``

    Returns
    -------
    midx : MultiIndex
        levels are named and ordered after the names in the first key

    Raises
    ------
    ValueError
        if ``keys`` is empty
    """

    # code defines a mapping to key's internal code to its representation
    codes = {} if dsd is None else dsd.codes

    if len(keys) < 1:
        raise ValueError("Data contains no 'Series'")

    names = [pair[0] for pair in keys[0]]

    columns = {}
    for key in keys:
        for name, value in key:
            # apply DSD; values without a mapping are kept as they are
            try:
                label = codes[name][value]
            except KeyError:
                label = value
            columns.setdefault(name, []).append(label)

    return pd.MultiIndex.from_arrays([columns[name] for name in names],
                                     names=names)
120+
121+
122+
def _parse_observations(observations):
    """
    Extract the (dimension, value) pair of each observation element.

    Returns a list of key/value tuple, eg: [(key, value), ...], taken from
    the 'value' attributes of the observation's dimension and value elements.
    """

    def _as_pair(observation):
        dimension = _get_child(observation, _OBSDIMENSION)
        measured = _get_child(observation, _OBSVALUE)
        return (dimension.get('value'), measured.get('value'))

    return [_as_pair(observation) for observation in observations]
130+
131+
132+
def _parse_series_key(series):
    """
    Extract the key of a series element.

    Returns a list of key/value tuple, eg: [(key, value), ...], taken from
    the 'id' and 'value' attributes of the elements under the series key.
    """
    serieskey = _get_child(series, _SERIES_KEY)
    return [(node.get('id'), node.get('value'))
            for node in serieskey.iter(_VALUE)]
138+
139+
140+
def _get_child(element, key):
    """
    Return the single element tagged ``key`` found by ``element.iter``.

    ``iter`` walks the whole subtree rooted at ``element``, so the match
    does not have to be a direct child.

    Raises
    ------
    ValueError
        if no element or more than one element matches
    """
    matches = list(element.iter(key))

    if len(matches) == 0:
        raise ValueError("Element {0} contains no {1}".format(element.tag, key))
    if len(matches) > 1:
        raise ValueError("Element {0} contains multiple {1}".format(element.tag, key))

    return matches[0]
148+
149+
150+
# ElementTree path for a descendant ``Name`` element (common namespace)
# whose ``xml:lang`` attribute is 'en'
_NAME_EN = ".//{0}Name[@{1}lang='en']".format(_COMMON, _XML)


def _get_english_name(element):
    """Return the text of the first English ``Name`` found under ``element``."""
    return element.find(_NAME_EN).text
156+
157+
158+
159+
# Parsed DSD: ``codes`` maps each codelist's English name to a dict of
# {code id: English code name}; ``ts`` lists the ids of the time dimensions.
SDMXCode = collections.namedtuple('SDMXCode', ['codes', 'ts'])


def _read_sdmx_dsd(path_or_buf):
    """
    Convert a SDMX-XML DSD string to mapping dictionary

    Parameters
    ----------
    path_or_buf : a valid SDMX-XML DSD string or file-like
        https://webgate.ec.europa.eu/fpfis/mwikis/sdmx/index.php/Main_Page

    Returns
    -------
    results : namedtuple (SDMXCode)
        ``codes`` is a dict of dicts keyed by the English name of each
        codelist, mapping code id to the code's English name; ``ts`` is
        the list of time dimension ids

    Raises
    ------
    ValueError
        if a required element is missing or duplicated
    """

    xdata = _read_content(path_or_buf)

    # xml.etree.cElementTree was removed in Python 3.9; fall back to
    # ElementTree when it is not available.
    try:
        import xml.etree.cElementTree as ET
    except ImportError:
        import xml.etree.ElementTree as ET
    root = ET.fromstring(xdata)

    structure = _get_child(root, _MESSAGE + 'Structures')
    codes = _get_child(structure, _STRUCTURE + 'Codelists')
    datastructures = _get_child(structure, _STRUCTURE + 'DataStructures')

    code_results = {}
    # iterate the direct children of the codelists element
    for codelist in codes:
        codelist_name = _get_english_name(codelist)
        mapper = {}
        for code in codelist.iter(_CODE):
            mapper[code.get('id')] = _get_english_name(code)
        code_results[codelist_name] = mapper

    times = [t.get('id') for t in datastructures.iter(_TIMEDIMENSION)]

    return SDMXCode(codes=code_results, ts=times)

0 commit comments

Comments
 (0)