@@ -3,13 +3,14 @@ jupytext:
33 text_representation :
44 extension : .md
55 format_name : myst
6+ format_version : 0.13
7+ jupytext_version : 1.14.5
68kernelspec :
7- display_name : Python 3
9+ display_name : Python 3 (ipykernel)
810 language : python
911 name : python3
1012---
1113
12-
1314# Heavy-Tailed Distributions
1415
1516``` {contents} Contents
@@ -18,24 +19,30 @@ kernelspec:
1819
1920In addition to what's in Anaconda, this lecture will need the following libraries:
2021
21- ``` {code-cell} ipython
22- ---
23- tags: [hide-output]
24- ---
22+ ``` {code-cell} ipython3
23+ :tags: [hide-output]
24+
2525!pip install quantecon
2626!pip install --upgrade yfinance
27+ !pip install pandas_datareader
2728```
29+
2830We run the following code to prepare for the lecture:
2931
30- ``` {code-cell} ipython
32+ ``` {code-cell} ipython3
3133%matplotlib inline
3234import matplotlib.pyplot as plt
3335plt.rcParams["figure.figsize"] = (11, 5) #set default figure size
3436import numpy as np
3537import quantecon as qe
36- from scipy.stats import norm
3738import yfinance as yf
3839import pandas as pd
40+ import pandas_datareader.data as web
41+ import statsmodels.api as sm
42+
43+ from interpolation import interp
44+ from pandas_datareader import wb
45+ from scipy.stats import norm, cauchy
3946from pandas.plotting import register_matplotlib_converters
4047register_matplotlib_converters()
4148```
@@ -70,7 +77,7 @@ quickly.
7077We can see this when we plot the density and show a histogram of observations,
7178as with the following code (which assumes $\mu=0$ and $\sigma=1$).
7279
73- ``` {code-cell} ipython
80+ ``` {code-cell} ipython3
7481fig, ax = plt.subplots()
7582X = norm.rvs(size=1_000_000)
7683ax.hist(X, bins=40, alpha=0.4, label='histogram', density=True)
@@ -87,13 +94,13 @@ Notice how
8794
8895We can see the last point more clearly by executing
8996
90- ``` {code-cell} ipython
97+ ``` {code-cell} ipython3
9198X.min(), X.max()
9299```
93100
94101Here's another view of draws from the same distribution:
95102
96- ``` {code-cell} python3
103+ ``` {code-cell} ipython3
97104n = 2000
98105fig, ax = plt.subplots()
99106data = norm.rvs(size=n)
@@ -153,7 +160,7 @@ This equates to daily returns if we set dividends aside.
153160
154161The code below produces the desired plot using Yahoo financial data via the ` yfinance ` library.
155162
156- ``` {code-cell} python3
163+ ``` {code-cell} ipython3
157164s = yf.download('AMZN', '2015-1-1', '2022-7-1')['Adj Close']
158165r = s.pct_change()
159166
@@ -173,7 +180,7 @@ Several of observations are quite extreme.
173180
174181We get a similar picture if we look at other assets, such as Bitcoin
175182
176- ``` {code-cell} python3
183+ ``` {code-cell} ipython3
177184s = yf.download('BTC-USD', '2015-1-1', '2022-7-1')['Adj Close']
178185r = s.pct_change()
179186
@@ -190,8 +197,7 @@ plt.show()
190197The histogram also looks different to the histogram of the normal
191198distribution:
192199
193-
194- ``` {code-cell} python3
200+ ``` {code-cell} ipython3
195201fig, ax = plt.subplots()
196202ax.hist(r, bins=60, alpha=0.4, label='bitcoin returns', density=True)
197203ax.set_xlabel('returns', fontsize=12)
@@ -293,6 +299,167 @@ TODO
293299TODO Review exercises below --- are they all too hard for undergrads or should
294300we keep some of them.
295301
302+ ``` {code-cell} ipython3
303+ def extract_wb(varlist=['NY.GDP.MKTP.CD'], c='all', s=1900, e=2021):
304+ df = wb.download(indicator=varlist, country=c, start=s, end=e).stack().unstack(0).reset_index()
305+ df = df.drop(['level_1'], axis=1).set_index(['year']).transpose()
306+ return df
307+ ```
308+
309+ ``` {code-cell} ipython3
310+ def empirical_ccdf(data,
311+ ax,
312+ aw=None, # weights
313+ label=None,
314+ xlabel=None,
315+ add_reg_line=False,
316+ title=None):
317+ """
318+ Take data vector and return prob values for plotting.
319+ Upgraded empirical_ccdf
320+ """
321+ y_vals = np.empty_like(data, dtype='float64')
322+ p_vals = np.empty_like(data, dtype='float64')
323+ n = len(data)
324+ if aw is None:
325+ for i, d in enumerate(data):
326+ # record fraction of sample above d
327+ y_vals[i] = np.sum(data >= d) / n
328+ p_vals[i] = np.sum(data == d) / n
329+ else:
330+ fw = np.empty_like(aw, dtype='float64')
331+ for i, a in enumerate(aw):
332+ fw[i] = a / np.sum(aw)
333+ pdf = lambda x: interp(data, fw, x)
334+ data = np.sort(data)
335+ j = 0
336+ for i, d in enumerate(data):
337+ j += pdf(d)
338+ y_vals[i] = 1- j
339+
340+ x, y = np.log(data), np.log(y_vals)
341+
342+ results = sm.OLS(y, sm.add_constant(x)).fit()
343+ b, a = results.params
344+
345+ kwargs = [('alpha', 0.3)]
346+ if label:
347+ kwargs.append(('label', label))
348+ kwargs = dict(kwargs)
349+
350+ ax.scatter(x, y, **kwargs)
351+ if add_reg_line:
352+ ax.plot(x, x * a + b, 'k-', alpha=0.6, label=f"slope = ${a: 1.2f}$")
353+ if not xlabel:
354+ xlabel='log value'
355+ ax.set_xlabel(xlabel, fontsize=12)
356+ ax.set_ylabel("log prob.", fontsize=12)
357+
358+ if label:
359+ ax.legend(loc='lower left', fontsize=12)
360+
361+ if title:
362+ ax.set_title(title)
363+
364+ return np.log(data), y_vals, p_vals
365+ ```
366+
367+ ### GDP
368+
369+ ``` {code-cell} ipython3
370+ df_gdp1 = extract_wb(varlist=['NY.GDP.MKTP.CD']) # gdp for all countries from 1960 to 2022
371+ df_gdp2 = extract_wb(varlist=['NY.GDP.PCAP.CD']) # gdp per capita for all countries from 1960 to 2022
372+ ```
373+
374+ ``` {code-cell} ipython3
375+ fig, axes = plt.subplots(1, 2, figsize=(8.8, 3.6))
376+
377+ empirical_ccdf(np.asarray(df_gdp1['2021'].dropna()), axes[0], add_reg_line=False, label='GDP')
378+ empirical_ccdf(np.asarray(df_gdp2['2021'].dropna()), axes[1], add_reg_line=False, label='GDP per capita')
379+
380+ plt.show()
381+ ```
382+
383+ ### Firm size
384+
385+ ``` {code-cell} ipython3
386+ df_fs = pd.read_csv('https://media.githubusercontent.com/media/QuantEcon/high_dim_data/update_csdata/cross_section/forbes-global2000.csv')
387+ df_fs = df_fs[['Country', 'Sales', 'Profits', 'Assets', 'Market Value']]
388+ ```
389+
390+ ``` {code-cell} ipython3
391+ fig, ax = plt.subplots(figsize=(6.4, 3.5))
392+
393+ label="firm size (market value)"
394+
395+ d = df_fs.sort_values('Market Value', ascending=False)
396+
397+ empirical_ccdf(np.asarray(d['Market Value'])[0:500], ax, label=label, add_reg_line=True)
398+
399+ plt.show()
400+ ```
401+
402+ ### City size
403+
404+ ``` {code-cell} ipython3
405+ df_cs_us = pd.read_csv('https://raw.githubusercontent.com/QuantEcon/high_dim_data/update_csdata/cross_section/cities_us.txt', delimiter="\t", header=None)
406+ df_cs_us = df_cs_us[[0, 3]]
407+ df_cs_us.columns = 'rank', 'pop'
408+ x = np.asarray(df_cs_us['pop'])
409+ citysize = []
410+ for i in x:
411+ i = i.replace(",", "")
412+ citysize.append(int(i))
413+ df_cs_us['pop'] = citysize
414+ ```
415+
416+ ``` {code-cell} ipython3
417+ df_cs_br = pd.read_csv('https://media.githubusercontent.com/media/QuantEcon/high_dim_data/update_csdata/cross_section/cities_brazil.csv', delimiter=",", header=None)
418+ df_cs_br.columns = df_cs_br.iloc[0]
419+ df_cs_br = df_cs_br[1:401]
420+ df_cs_br = df_cs_br.astype({"pop2023": float})
421+ ```
422+
423+ ``` {code-cell} ipython3
424+ fig, axes = plt.subplots(1, 2, figsize=(8.8, 3.6))
425+
426+
427+ empirical_ccdf(np.asarray(df_cs_us['pop']), axes[0], label="US", add_reg_line=True)
428+ empirical_ccdf(np.asarray(df_cs_br['pop2023']), axes[1], label="Brazil", add_reg_line=True)
429+
430+ plt.show()
431+ ```
432+
433+ ### Wealth
434+
435+ ``` {code-cell} ipython3
436+ df_w = pd.read_csv('https://media.githubusercontent.com/media/QuantEcon/high_dim_data/update_csdata/cross_section/forbes-billionaires.csv')
437+ df_w = df_w[['country', 'realTimeWorth', 'realTimeRank']].dropna()
438+ df_w = df_w.astype({'realTimeRank': int})
439+ df_w = df_w.sort_values('realTimeRank', ascending=True).copy()
440+ ```
441+
442+ ``` {code-cell} ipython3
443+ countries = ['United States', 'Japan', 'India', 'Italy']
444+ N = len(countries)
445+
446+ fig, axs = plt.subplots(2, 2, figsize=(8, 6))
447+ axs = axs.flatten()
448+
449+ for i, c in enumerate(countries):
450+ df_w_c = df_w[df_w['country'] == c].reset_index()
451+ z = np.asarray(df_w_c['realTimeWorth'])
452+ # print('number of the global richest 2000 from '+ c, len(z))
453+ if len(z) <= 500: # cut-off number: top 500
454+ z = z[0:500]
455+
456+ empirical_ccdf(z[0:500], axs[i], label=c, xlabel='log wealth', add_reg_line=True)
457+
458+ fig.tight_layout()
459+
460+ plt.show()
461+ ```
462+
296463
297464## Exercises
298465
@@ -394,7 +561,7 @@ firm dynamics in later lectures.)
394561:class: dropdown
395562```
396563
397- ``` {code-cell} python3
564+ ``` {code-cell} ipython3
398565n = 120
399566np.random.seed(11)
400567
0 commit comments