CoCalc -- PCA_countries.ipynb

Econometrics/Spring 2018 / Reduction / Principal Component Analysis / Countries / PCA_countries.ipynb

2108 views

Kernel: Python 3 (old Anaconda 3)

In [1]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:

# Read the tab-separated indicator table; the first column becomes the
# row index.
countries = pd.read_csv(
    'countries.tsv',
    sep='\t',
    index_col=0,
)
countries.head()

Out[2]:

In [3]:

# Summarize column dtypes and non-null counts to see how the file was parsed.
countries.info()

Out[3]:

<class 'pandas.core.frame.DataFrame'>
Index: 15 entries, Azerbaijan to Japan
Data columns (total 4 columns):
Average life expectancy           15 non-null object
Standard of living                15 non-null object
Percentage of urban population    15 non-null object
Daily caloric intake              15 non-null int64
dtypes: int64(1), object(3)
memory usage: 600.0+ bytes

In [4]:

# Swap commas for dots so the text columns can later be parsed as numbers.
# Only non-numeric columns are touched: columns that pandas already parsed
# as numbers keep their dtype instead of being turned into strings.
text_columns = countries.select_dtypes(exclude=['number']).columns
for feature in text_columns:
    # astype(str) mirrors str(x) on each value; ',' is a plain character, so
    # the replacement is the same whether or not it is treated as a regex.
    countries[feature] = countries[feature].astype(str).str.replace(',', '.')

In [5]:

# Preview the first rows after the comma-to-dot replacement.
countries.head()

Out[5]:

In [6]:

# Parse every column into a numeric dtype, one column at a time.
countries = countries.apply(pd.to_numeric)

In [7]:

# Re-check the dtypes after the numeric conversion.
countries.info()

Out[7]:

<class 'pandas.core.frame.DataFrame'>
Index: 15 entries, Azerbaijan to Japan
Data columns (total 4 columns):
Average life expectancy           15 non-null float64
Standard of living                15 non-null float64
Percentage of urban population    15 non-null float64
Daily caloric intake              15 non-null int64
dtypes: float64(3), int64(1)
memory usage: 600.0+ bytes

In [8]:

from sklearn.preprocessing import StandardScaler

# Standardize each feature (zero mean, unit variance). fit_transform learns
# the per-column statistics and applies them in a single call.
scaler = StandardScaler()
countries_std = scaler.fit_transform(countries)

In [10]:

from sklearn.decomposition import PCA

# Fit PCA with four components on the standardized data; fit() returns the
# estimator itself, which is displayed as the cell output.
pca = PCA(n_components=4).fit(countries_std)
pca

Out[10]:

PCA(copy=True, iterated_power='auto', n_components=4, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [11]:

# Fraction of the total variance explained by each principal component.
print(pca.explained_variance_ratio_)

Out[11]:

[ 0.62895814  0.23681665  0.10712595  0.02709926]

In [12]:

# Fit a two-component PCA on the standardized data, then show each row's
# coordinates on those two components.
pca_two = PCA(n_components=2).fit(countries_std)
pca_two.transform(countries_std)

Out[12]:

array([[ 0.09406675, -0.36847649],
       [ 0.31480433, -1.3538389 ],
       [-0.87043755, -0.40881141],
       [-0.79088132, -0.48127669],
       [ 0.75854797,  1.07668025],
       [ 0.22782872,  1.58091451],
       [-1.13671223,  1.36335547],
       [-0.02385484,  0.43854101],
       [-2.98560359, -0.13486692],
       [-1.79548505, -1.17603596],
       [-1.17550515, -0.21500811],
       [ 1.20904754,  1.59854117],
       [-0.04381248, -0.40575332],
       [ 3.18493514, -0.1172517 ],
       [ 3.03306177, -1.39671292]])

In [15]:

fig, ax = plt.subplots(figsize=(12, 12))

# Project the standardized data once and split the result into the two
# plot coordinates instead of running the projection separately per axis.
scores = pca_two.transform(countries_std)
x, y = scores[:, 0], scores[:, 1]
ax.scatter(x, y)

# Label every point with its row label from the DataFrame index.
for name, px, py in zip(countries.index, x, y):
    ax.annotate(name, (px, py))

# Axis labels and a title so the figure can be read on its own.
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_title('Countries projected onto the first two principal components')

# Equal scaling keeps distances comparable along both components.
ax.axis('equal');

Out[15]:

In [16]:

# Scale each principal axis by the square root of its explained variance.
# Rows correspond to the input features, columns to the components.
component_scale = np.sqrt(pca_two.explained_variance_)
loadings = pca_two.components_.T * component_scale
loadings

Out[16]:

array([[ 0.84051867, -0.49769362],
       [ 0.98831363, -0.09643122],
       [ 0.86616025,  0.08066102],
       [ 0.51192368,  0.86684737]])

In [18]:

# Show the loadings with readable labels: one row per input feature and one
# column per principal component, with the value written in each cell.
sns.heatmap(
    loadings,
    annot=True,
    xticklabels=['PC%d' % (k + 1) for k in range(loadings.shape[1])],
    yticklabels=list(countries.columns),
);

Out[18]:

In [20]:

# Fit a single-component PCA on the standardized data, then show each row's
# score on that component.
pca_one = PCA(n_components=1).fit(countries_std)
pca_one.transform(countries_std)

Out[20]:

array([[ 0.09406675],
       [ 0.31480433],
       [-0.87043755],
       [-0.79088132],
       [ 0.75854797],
       [ 0.22782872],
       [-1.13671223],
       [-0.02385484],
       [-2.98560359],
       [-1.79548505],
       [-1.17550515],
       [ 1.20904754],
       [-0.04381248],
       [ 3.18493514],
       [ 3.03306177]])

In [21]:

# Fraction of the total variance explained by the single retained component.
pca_one.explained_variance_ratio_

Out[21]:

array([ 0.62895814])

In [22]:

# Scale the single principal axis by the square root of its explained
# variance; the result has one row per input feature.
component_scale = np.sqrt(pca_one.explained_variance_)
loadings = pca_one.components_.T * component_scale
loadings

Out[22]:

array([[ 0.84051867],
       [ 0.98831363],
       [ 0.86616025],
       [ 0.51192368]])

In [60]:

# Plot every row's score on the first principal component. The projection
# uses countries_std, the standardized matrix pca_one was fitted on; the
# name countries_norm is not defined anywhere in this notebook.
# The trailing semicolon hides the Line2D repr.
plt.plot(pca_one.transform(countries_std), 'bo');

Out[60]:

[<matplotlib.lines.Line2D at 0xd817358>]

In [0]:

Product

Resources

Company