Skip to content

Commit 131da21

Browse files
committed
embed_figure_code_w_fix
1 parent a657229 commit 131da21

File tree

1 file changed

+183
-16
lines changed

1 file changed

+183
-16
lines changed

cross_section.md

Lines changed: 183 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,14 @@ jupytext:
33
text_representation:
44
extension: .md
55
format_name: myst
6+
format_version: 0.13
7+
jupytext_version: 1.14.5
68
kernelspec:
7-
display_name: Python 3
9+
display_name: Python 3 (ipykernel)
810
language: python
911
name: python3
1012
---
1113

12-
1314
# Heavy-Tailed Distributions
1415

1516
```{contents} Contents
@@ -18,24 +19,30 @@ kernelspec:
1819

1920
In addition to what's in Anaconda, this lecture will need the following libraries:
2021

21-
```{code-cell} ipython
22-
---
23-
tags: [hide-output]
24-
---
22+
```{code-cell} ipython3
23+
:tags: [hide-output]
24+
2525
!pip install quantecon
2626
!pip install --upgrade yfinance
27+
!pip install pandas_datareader
2728
```
29+
2830
We run the following code to prepare for the lecture:
2931

30-
```{code-cell} ipython
32+
```{code-cell} ipython3
3133
%matplotlib inline
3234
import matplotlib.pyplot as plt
3335
plt.rcParams["figure.figsize"] = (11, 5) #set default figure size
3436
import numpy as np
3537
import quantecon as qe
36-
from scipy.stats import norm
3738
import yfinance as yf
3839
import pandas as pd
40+
import pandas_datareader.data as web
41+
import statsmodels.api as sm
42+
43+
from interpolation import interp
44+
from pandas_datareader import wb
45+
from scipy.stats import norm, cauchy
3946
from pandas.plotting import register_matplotlib_converters
4047
register_matplotlib_converters()
4148
```
@@ -70,7 +77,7 @@ quickly.
7077
We can see this when we plot the density and show a histogram of observations,
7178
as with the following code (which assumes $\mu=0$ and $\sigma=1$).
7279

73-
```{code-cell} ipython
80+
```{code-cell} ipython3
7481
fig, ax = plt.subplots()
7582
X = norm.rvs(size=1_000_000)
7683
ax.hist(X, bins=40, alpha=0.4, label='histogram', density=True)
@@ -87,13 +94,13 @@ Notice how
8794

8895
We can see the last point more clearly by executing
8996

90-
```{code-cell} ipython
97+
```{code-cell} ipython3
9198
X.min(), X.max()
9299
```
93100

94101
Here's another view of draws from the same distribution:
95102

96-
```{code-cell} python3
103+
```{code-cell} ipython3
97104
n = 2000
98105
fig, ax = plt.subplots()
99106
data = norm.rvs(size=n)
@@ -153,7 +160,7 @@ This equates to daily returns if we set dividends aside.
153160

154161
The code below produces the desired plot using Yahoo financial data via the `yfinance` library.
155162

156-
```{code-cell} python3
163+
```{code-cell} ipython3
157164
s = yf.download('AMZN', '2015-1-1', '2022-7-1')['Adj Close']
158165
r = s.pct_change()
159166
@@ -173,7 +180,7 @@ Several of observations are quite extreme.
173180

174181
We get a similar picture if we look at other assets, such as Bitcoin
175182

176-
```{code-cell} python3
183+
```{code-cell} ipython3
177184
s = yf.download('BTC-USD', '2015-1-1', '2022-7-1')['Adj Close']
178185
r = s.pct_change()
179186
@@ -190,8 +197,7 @@ plt.show()
190197
The histogram also looks different to the histogram of the normal
191198
distribution:
192199

193-
194-
```{code-cell} python3
200+
```{code-cell} ipython3
195201
fig, ax = plt.subplots()
196202
ax.hist(r, bins=60, alpha=0.4, label='bitcoin returns', density=True)
197203
ax.set_xlabel('returns', fontsize=12)
@@ -293,6 +299,167 @@ TODO
293299
TODO Review exercises below --- are they all too hard for undergrads or should
294300
we keep some of them.
295301

302+
```{code-cell} ipython3
303+
def extract_wb(varlist=['NY.GDP.MKTP.CD'], c='all', s=1900, e=2021):
304+
df = wb.download(indicator=varlist, country=c, start=s, end=e).stack().unstack(0).reset_index()
305+
df = df.drop(['level_1'], axis=1).set_index(['year']).transpose()
306+
return df
307+
```
308+
309+
```{code-cell} ipython3
310+
def empirical_ccdf(data,
311+
ax,
312+
aw=None, # weights
313+
label=None,
314+
xlabel=None,
315+
add_reg_line=False,
316+
title=None):
317+
"""
318+
Take data vector and return prob values for plotting.
319+
Upgraded empirical_ccdf
320+
"""
321+
y_vals = np.empty_like(data, dtype='float64')
322+
p_vals = np.empty_like(data, dtype='float64')
323+
n = len(data)
324+
if aw is None:
325+
for i, d in enumerate(data):
326+
# record fraction of sample above d
327+
y_vals[i] = np.sum(data >= d) / n
328+
p_vals[i] = np.sum(data == d) / n
329+
else:
330+
fw = np.empty_like(aw, dtype='float64')
331+
for i, a in enumerate(aw):
332+
fw[i] = a / np.sum(aw)
333+
pdf = lambda x: interp(data, fw, x)
334+
data = np.sort(data)
335+
j = 0
336+
for i, d in enumerate(data):
337+
j += pdf(d)
338+
y_vals[i] = 1- j
339+
340+
x, y = np.log(data), np.log(y_vals)
341+
342+
results = sm.OLS(y, sm.add_constant(x)).fit()
343+
b, a = results.params
344+
345+
kwargs = [('alpha', 0.3)]
346+
if label:
347+
kwargs.append(('label', label))
348+
kwargs = dict(kwargs)
349+
350+
ax.scatter(x, y, **kwargs)
351+
if add_reg_line:
352+
ax.plot(x, x * a + b, 'k-', alpha=0.6, label=f"slope = ${a: 1.2f}$")
353+
if not xlabel:
354+
xlabel='log value'
355+
ax.set_xlabel(xlabel, fontsize=12)
356+
ax.set_ylabel("log prob.", fontsize=12)
357+
358+
if label:
359+
ax.legend(loc='lower left', fontsize=12)
360+
361+
if title:
362+
ax.set_title(title)
363+
364+
return np.log(data), y_vals, p_vals
365+
```
366+
367+
### GDP
368+
369+
```{code-cell} ipython3
370+
df_gdp1 = extract_wb(varlist=['NY.GDP.MKTP.CD']) # gdp for all countries from 1960 to 2022
371+
df_gdp2 = extract_wb(varlist=['NY.GDP.PCAP.CD']) # gdp per capita for all countries from 1960 to 2022
372+
```
373+
374+
```{code-cell} ipython3
375+
fig, axes = plt.subplots(1, 2, figsize=(8.8, 3.6))
376+
377+
empirical_ccdf(np.asarray(df_gdp1['2021'].dropna()), axes[0], add_reg_line=False, label='GDP')
378+
empirical_ccdf(np.asarray(df_gdp2['2021'].dropna()), axes[1], add_reg_line=False, label='GDP per capita')
379+
380+
plt.show()
381+
```
382+
383+
### Firm size
384+
385+
```{code-cell} ipython3
386+
df_fs = pd.read_csv('https://media.githubusercontent.com/media/QuantEcon/high_dim_data/update_csdata/cross_section/forbes-global2000.csv')
387+
df_fs = df_fs[['Country', 'Sales', 'Profits', 'Assets', 'Market Value']]
388+
```
389+
390+
```{code-cell} ipython3
391+
fig, ax = plt.subplots(figsize=(6.4, 3.5))
392+
393+
label="firm size (market value)"
394+
395+
d = df_fs.sort_values('Market Value', ascending=False)
396+
397+
empirical_ccdf(np.asarray(d['Market Value'])[0:500], ax, label=label, add_reg_line=True)
398+
399+
plt.show()
400+
```
401+
402+
### City size
403+
404+
```{code-cell} ipython3
405+
df_cs_us = pd.read_csv('https://raw.githubusercontent.com/QuantEcon/high_dim_data/update_csdata/cross_section/cities_us.txt', delimiter="\t", header=None)
406+
df_cs_us = df_cs_us[[0, 3]]
407+
df_cs_us.columns = 'rank', 'pop'
408+
x = np.asarray(df_cs_us['pop'])
409+
citysize = []
410+
for i in x:
411+
i = i.replace(",", "")
412+
citysize.append(int(i))
413+
df_cs_us['pop'] = citysize
414+
```
415+
416+
```{code-cell} ipython3
417+
df_cs_br = pd.read_csv('https://media.githubusercontent.com/media/QuantEcon/high_dim_data/update_csdata/cross_section/cities_brazil.csv', delimiter=",", header=None)
418+
df_cs_br.columns = df_cs_br.iloc[0]
419+
df_cs_br = df_cs_br[1:401]
420+
df_cs_br = df_cs_br.astype({"pop2023": float})
421+
```
422+
423+
```{code-cell} ipython3
424+
fig, axes = plt.subplots(1, 2, figsize=(8.8, 3.6))
425+
426+
427+
empirical_ccdf(np.asarray(df_cs_us['pop']), axes[0], label="US", add_reg_line=True)
428+
empirical_ccdf(np.asarray(df_cs_br['pop2023']), axes[1], label="Brazil", add_reg_line=True)
429+
430+
plt.show()
431+
```
432+
433+
### Wealth
434+
435+
```{code-cell} ipython3
436+
df_w = pd.read_csv('https://media.githubusercontent.com/media/QuantEcon/high_dim_data/update_csdata/cross_section/forbes-billionaires.csv')
437+
df_w = df_w[['country', 'realTimeWorth', 'realTimeRank']].dropna()
438+
df_w = df_w.astype({'realTimeRank': int})
439+
df_w = df_w.sort_values('realTimeRank', ascending=True).copy()
440+
```
441+
442+
```{code-cell} ipython3
443+
countries = ['United States', 'Japan', 'India', 'Italy']
444+
N = len(countries)
445+
446+
fig, axs = plt.subplots(2, 2, figsize=(8, 6))
447+
axs = axs.flatten()
448+
449+
for i, c in enumerate(countries):
450+
df_w_c = df_w[df_w['country'] == c].reset_index()
451+
z = np.asarray(df_w_c['realTimeWorth'])
452+
# print('number of the global richest 2000 from '+ c, len(z))
453+
if len(z) <= 500: # cut-off number: top 500
454+
z = z[0:500]
455+
456+
empirical_ccdf(z[0:500], axs[i], label=c, xlabel='log wealth', add_reg_line=True)
457+
458+
fig.tight_layout()
459+
460+
plt.show()
461+
```
462+
296463

297464
## Exercises
298465

@@ -394,7 +561,7 @@ firm dynamics in later lectures.)
394561
:class: dropdown
395562
```
396563

397-
```{code-cell} python3
564+
```{code-cell} ipython3
398565
n = 120
399566
np.random.seed(11)
400567

0 commit comments

Comments
 (0)