Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
Download
2108 views
Kernel: Python 3 (old Anaconda 3)
import numpy as np import pandas as pd import matplotlib.pyplot as plt %matplotlib inline import seaborn as sns
# Load the tab-separated table; the first column is used as the row index.
countries = pd.read_csv(
    'countries.tsv',
    sep='\t',
    index_col=0,
)
# Preview the first rows of the freshly loaded frame.
countries.head()
# Print the index range, column names, non-null counts and dtypes of the
# raw frame, to see which columns were parsed as text rather than numbers.
countries.info()
<class 'pandas.core.frame.DataFrame'> Index: 15 entries, Azerbaijan to Japan Data columns (total 4 columns): Average life expectancy 15 non-null object Standard of living 15 non-null object Percentage of urban population 15 non-null object Daily caloric intake 15 non-null int64 dtypes: int64(1), object(3) memory usage: 600.0+ bytes
def _comma_to_point(value):
    """Return ``value`` as a string with every ',' replaced by '.'."""
    return str(value).replace(',', '.')


# Rewrite every column as strings that use a decimal point, so that the
# values can later be parsed as numbers.
for feature in countries.columns:
    countries[feature] = countries[feature].map(_comma_to_point)
# Preview the first rows to check the result of the comma-to-point replacement.
countries.head()
# Parse every column into a numeric dtype, one column at a time.
countries = countries.apply(pd.to_numeric)
# Re-inspect the dtypes to confirm that all columns are now numeric.
countries.info()
<class 'pandas.core.frame.DataFrame'> Index: 15 entries, Azerbaijan to Japan Data columns (total 4 columns): Average life expectancy 15 non-null float64 Standard of living 15 non-null float64 Percentage of urban population 15 non-null float64 Daily caloric intake 15 non-null int64 dtypes: float64(3), int64(1) memory usage: 600.0+ bytes
from sklearn.preprocessing import StandardScaler

# Standardize the features: learn the scaling parameters from the data and
# apply them to that same data in a single call.
scaler = StandardScaler()
countries_std = scaler.fit_transform(countries)
from sklearn.decomposition import PCA

# Fit a PCA that keeps four components on the standardized data.
# ``fit`` returns the estimator itself, so the construction can be chained.
pca = PCA(n_components=4).fit(countries_std)
# Echo the fitted estimator as the cell output.
pca
PCA(copy=True, iterated_power='auto', n_components=4, random_state=None, svd_solver='auto', tol=0.0, whiten=False)
# Show the fraction of total variance explained by each of the fitted
# components.
print(pca.explained_variance_ratio_)
[ 0.62895814 0.23681665 0.10712595 0.02709926]
# Fit a two-component PCA on the standardized data ...
pca_two = PCA(n_components=2).fit(countries_std)
# ... and display the coordinates of every row in the reduced 2-D space.
pca_two.transform(countries_std)
array([[ 0.09406675, -0.36847649], [ 0.31480433, -1.3538389 ], [-0.87043755, -0.40881141], [-0.79088132, -0.48127669], [ 0.75854797, 1.07668025], [ 0.22782872, 1.58091451], [-1.13671223, 1.36335547], [-0.02385484, 0.43854101], [-2.98560359, -0.13486692], [-1.79548505, -1.17603596], [-1.17550515, -0.21500811], [ 1.20904754, 1.59854117], [-0.04381248, -0.40575332], [ 3.18493514, -0.1172517 ], [ 3.03306177, -1.39671292]])
# Scatter plot of the countries in the plane of the two fitted components,
# with each point labelled by its index entry.
fig, ax = plt.subplots(figsize=(12, 12))

# Project the data once and reuse the result for both axes, instead of
# running the same transform twice.
projected = pca_two.transform(countries_std)
x = projected[:, 0]
y = projected[:, 1]

ax.scatter(x, y)
for txt, x_pos, y_pos in zip(countries.index, x, y):
    ax.annotate(txt, (x_pos, y_pos))

# Use the same scale on both axes; the trailing semicolon keeps the notebook
# from echoing the return value.
plt.axis('equal');
Image in a Jupyter notebook
# Loadings of the two-component model: each component vector (a column of
# the transposed ``components_``) scaled by the square root of that
# component's explained variance.
component_scale = np.sqrt(pca_two.explained_variance_)
component_vectors = pca_two.components_.T
loadings = component_vectors * component_scale
loadings
array([[ 0.84051867, -0.49769362], [ 0.98831363, -0.09643122], [ 0.86616025, 0.08066102], [ 0.51192368, 0.86684737]])
# Draw the loadings matrix as a heatmap (rows and columns follow the layout
# of ``loadings``). The trailing semicolon is presumably there to suppress
# the notebook's echo of the returned object.
sns.heatmap(loadings);
Image in a Jupyter notebook
# Fit a single-component PCA on the standardized data ...
pca_one = PCA(n_components=1).fit(countries_std)
# ... and display every row's score on that component.
pca_one.transform(countries_std)
array([[ 0.09406675], [ 0.31480433], [-0.87043755], [-0.79088132], [ 0.75854797], [ 0.22782872], [-1.13671223], [-0.02385484], [-2.98560359], [-1.79548505], [-1.17550515], [ 1.20904754], [-0.04381248], [ 3.18493514], [ 3.03306177]])
# Fraction of the total variance explained by the single fitted component.
pca_one.explained_variance_ratio_
array([ 0.62895814])
# Loadings of the one-component model: the component vector (as a column)
# scaled by the square root of its explained variance.
component_scale = np.sqrt(pca_one.explained_variance_)
component_vectors = pca_one.components_.T
loadings = component_vectors * component_scale
loadings
array([[ 0.84051867], [ 0.98831363], [ 0.86616025], [ 0.51192368]])
# Plot every row's score on the single component as blue dots, in row order.
# ``pca_one`` was fitted on ``countries_std``; the previously referenced
# ``countries_norm`` is never defined in this notebook, so the standardized
# matrix is used here.
plt.plot(pca_one.transform(countries_std), 'bo')
[<matplotlib.lines.Line2D at 0xd817358>]
Image in a Jupyter notebook