@@ -16,10 +16,30 @@ def gapminder(
1616
1717 https://www.gapminder.org/data/
1818
19- Returns:
20- A `pandas.DataFrame` with 1704 rows and the following columns:
19+ Parameters
20+ ----------
21+ datetimes: bool
22+ Whether or not the 'year' column will be converted to datetime type
23+
24+ centroids: bool
25+ If True, ['centroid_lat', 'centroid_lon'] columns are added
26+
27+ year: int | None
28+ If provided, the dataset will be filtered for that year
29+
30+ pretty_names: bool
31+ If True, prettifies the column names
32+
33+ return_type: {'pandas', 'polars', 'pyarrow'}
34+ Type of the resulting dataframe
35+
36+ Returns
37+ -------
38+ Dataframe of `return_type` type
39+ Dataframe with 1704 rows and the following columns:
2140 `['country', 'continent', 'year', 'lifeExp', 'pop', 'gdpPercap',
2241 'iso_alpha', 'iso_num']`.
42+
2343 If `datetimes` is True, the 'year' column will be a datetime column
2444 If `centroids` is True, two new columns are added: ['centroid_lat', 'centroid_lon']
2545 If `year` is an integer, the dataset will be filtered for that year
@@ -61,9 +81,20 @@ def tips(pretty_names=False, return_type="pandas"):
6181
6282 https://vincentarelbundock.github.io/Rdatasets/doc/reshape2/tips.html
6383
64- Returns:
65- A `pandas.DataFrame` with 244 rows and the following columns:
66- `['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']`."""
84+ Parameters
85+ ----------
86+ pretty_names: bool
87+ If True, prettifies the column names
88+
89+ return_type: {'pandas', 'polars', 'pyarrow'}
90+ Type of the resulting dataframe
91+
92+ Returns
93+ -------
94+ Dataframe of `return_type` type
95+ Dataframe with 244 rows and the following columns:
96+ `['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']`.
97+ """
6798
6899 df = nw .from_native (_get_dataset ("tips" , return_type = return_type ), eager_only = True )
69100 if pretty_names :
@@ -87,19 +118,35 @@ def iris(return_type="pandas"):
87118
88119 https://en.wikipedia.org/wiki/Iris_flower_data_set
89120
90- Returns:
91- A `pandas.DataFrame` with 150 rows and the following columns:
92- `['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species', 'species_id']`."""
121+ Parameters
122+ ----------
123+ return_type: {'pandas', 'polars', 'pyarrow'}
124+ Type of the resulting dataframe
125+
126+ Returns
127+ -------
128+ Dataframe of `return_type` type
129+ Dataframe with 150 rows and the following columns:
130+ `['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species', 'species_id']`.
131+ """
93132 return _get_dataset ("iris" , return_type = return_type )
94133
95134
96135def wind (return_type = "pandas" ):
97136 """
98137 Each row represents a level of wind intensity in a cardinal direction, and its frequency.
99138
100- Returns:
101- A `pandas.DataFrame` with 128 rows and the following columns:
102- `['direction', 'strength', 'frequency']`."""
139+ Parameters
140+ ----------
141+ return_type: {'pandas', 'polars', 'pyarrow'}
142+ Type of the resulting dataframe
143+
144+ Returns
145+ -------
146+ Dataframe of `return_type` type
147+ Dataframe with 128 rows and the following columns:
148+ `['direction', 'strength', 'frequency']`.
149+ """
103150 return _get_dataset ("wind" , return_type = return_type )
104151
105152
@@ -108,20 +155,30 @@ def election(return_type="pandas"):
108155 Each row represents voting results for an electoral district in the 2013 Montreal
109156 mayoral election.
110157
111- Returns:
112- A `pandas.DataFrame` with 58 rows and the following columns:
113- `['district', 'Coderre', 'Bergeron', 'Joly', 'total', 'winner', 'result', 'district_id']`."""
158+ Parameters
159+ ----------
160+ return_type: {'pandas', 'polars', 'pyarrow'}
161+ Type of the resulting dataframe
162+
163+ Returns
164+ -------
165+ Dataframe of `return_type` type
166+ Dataframe with 58 rows and the following columns:
167+ `['district', 'Coderre', 'Bergeron', 'Joly', 'total', 'winner', 'result', 'district_id']`.
168+ """
114169 return _get_dataset ("election" , return_type = return_type )
115170
116171
117172def election_geojson ():
118173 """
119174 Each feature represents an electoral district in the 2013 Montreal mayoral election.
120175
121- Returns:
176+ Returns
177+ -------
122178 A GeoJSON-formatted `dict` with 58 polygon or multi-polygon features whose `id`
123179 is an electoral district numerical ID and whose `district` property is the ID and
124- district name."""
180+ district name.
181+ """
125182 import gzip
126183 import json
127184 import os
@@ -142,22 +199,45 @@ def carshare(return_type="pandas"):
142199 Each row represents the availability of car-sharing services near the centroid of a zone
143200 in Montreal over a month-long period.
144201
145- Returns:
146- A `pandas.DataFrame` with 249 rows and the following columns:
147- `['centroid_lat', 'centroid_lon', 'car_hours', 'peak_hour']`."""
202+ Parameters
203+ ----------
204+ return_type: {'pandas', 'polars', 'pyarrow'}
205+ Type of the resulting dataframe
206+
207+ Returns
208+ -------
209+ Dataframe of `return_type` type
210+ Dataframe with 249 rows and the following columns:
211+ `['centroid_lat', 'centroid_lon', 'car_hours', 'peak_hour']`.
212+ """
148213 return _get_dataset ("carshare" , return_type = return_type )
149214
150215
151216def stocks (indexed = False , datetimes = False , return_type = "pandas" ):
152217 """
153218 Each row in this wide dataset represents closing prices from 6 tech stocks in 2018/2019.
154219
155- Returns:
156- A `pandas.DataFrame` with 100 rows and the following columns:
220+ Parameters
221+ ----------
222+ indexed: bool
223+ Whether or not the 'date' column is used as the index and the column index
224+ is named 'company'. Applicable only if `return_type='pandas'`
225+
226+ datetimes: bool
227+ Whether or not the 'date' column will be of datetime type
228+
229+ return_type: {'pandas', 'polars', 'pyarrow'}
230+ Type of the resulting dataframe
231+
232+ Returns
233+ -------
234+ Dataframe of `return_type` type
235+ Dataframe with 100 rows and the following columns:
157236 `['date', 'GOOG', 'AAPL', 'AMZN', 'FB', 'NFLX', 'MSFT']`.
158237 If `indexed` is True, the 'date' column is used as the index and the column index
238+ is named 'company'
159239 If `datetimes` is True, the 'date' column will be a datetime column
160- is named 'company' """
240+ """
161241 if indexed and return_type != "pandas" :
162242 msg = "Cannot set index for backend different from pandas"
163243 raise NotImplementedError (msg )
@@ -181,11 +261,22 @@ def experiment(indexed=False, return_type="pandas"):
181261 Each row in this wide dataset represents the results of 100 simulated participants
182262 on three hypothetical experiments, along with their gender and control/treatment group.
183263
264+ Parameters
265+ ----------
266+ indexed: bool
267+ If True, then the index is named "participant".
268+ Applicable only if `return_type='pandas'`
269+
270+ return_type: {'pandas', 'polars', 'pyarrow'}
271+ Type of the resulting dataframe
184272
185- Returns:
186- A `pandas.DataFrame` with 100 rows and the following columns:
273+ Returns
274+ -------
275+ Dataframe of `return_type` type
276+ Dataframe with 100 rows and the following columns:
187277 `['experiment_1', 'experiment_2', 'experiment_3', 'gender', 'group']`.
188- If `indexed` is True, the data frame index is named "participant" """
278+ If `indexed` is True, the data frame index is named "participant"
279+ """
189280
190281 if indexed and return_type != "pandas" :
191282 msg = "Cannot set index for backend different from pandas"
@@ -206,11 +297,23 @@ def medals_wide(indexed=False, return_type="pandas"):
206297 This dataset represents the medal table for Olympic Short Track Speed Skating for the
207298 top three nations as of 2020.
208299
209- Returns:
210- A `pandas.DataFrame` with 3 rows and the following columns:
300+ Parameters
301+ ----------
302+ indexed: bool
303+ Whether or not the 'nation' column is used as the index and the column index
304+ is named 'medal'. Applicable only if `return_type='pandas'`
305+
306+ return_type: {'pandas', 'polars', 'pyarrow'}
307+ Type of the resulting dataframe
308+
309+ Returns
310+ -------
311+ Dataframe of `return_type` type
312+ Dataframe with 3 rows and the following columns:
211313 `['nation', 'gold', 'silver', 'bronze']`.
212314 If `indexed` is True, the 'nation' column is used as the index and the column index
213- is named 'medal'"""
315+ is named 'medal'
316+ """
214317
215318 if indexed and return_type != "pandas" :
216319 msg = "Cannot set index for backend different from pandas"
@@ -231,10 +334,21 @@ def medals_long(indexed=False, return_type="pandas"):
231334 This dataset represents the medal table for Olympic Short Track Speed Skating for the
232335 top three nations as of 2020.
233336
234- Returns:
235- A `pandas.DataFrame` with 9 rows and the following columns:
236- `['nation', 'medal', 'count']`.
237- If `indexed` is True, the 'nation' column is used as the index."""
337+ Parameters
338+ ----------
339+ indexed: bool
340+ Whether or not the 'nation' column is used as the index.
341+ Applicable only if `return_type='pandas'`
342+
343+ return_type: {'pandas', 'polars', 'pyarrow'}
344+ Type of the resulting dataframe
345+
346+ Returns
347+ -------
348+ Dataframe of `return_type` type
349+ Dataframe with 9 rows and the following columns: `['nation', 'medal', 'count']`.
350+ If `indexed` is True, the 'nation' column is used as the index.
351+ """
238352
239353 if indexed and return_type != "pandas" :
240354 msg = "Cannot set index for backend different from pandas"
@@ -253,6 +367,25 @@ def medals_long(indexed=False, return_type="pandas"):
253367
254368
255369def _get_dataset (d , return_type ):
370+ """
371+ Loads the dataset using the specified backend.
372+
373+ Note that the available backends are 'pandas', 'polars', 'pyarrow', and they all
374+ have a `read_csv` function. Therefore we can dynamically load the library via
375+ `importlib.import_module` and then call `backend.read_csv(filepath)`.
376+
377+ Parameters
378+ ----------
379+ d: str
380+ Name of the dataset to load.
381+
382+ return_type: {'pandas', 'polars', 'pyarrow'}
383+ Type of the resulting dataframe
384+
385+ Returns
386+ -------
387+ Dataframe of `return_type` type
388+ """
256389 import os
257390 from importlib import import_module
258391
0 commit comments