fastai
GitHub Repository: fastai/course22
Path: blob/master/clean/07-how-random-forests-really-work.ipynb
Kernel: python3

Introduction

from fastai.imports import *

np.set_printoptions(linewidth=130)

Data preprocessing

import os

iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
if iskaggle: path = Path('../input/titanic')
else:
    import zipfile,kaggle
    path = Path('titanic')
    kaggle.api.competition_download_cli(str(path))
    zipfile.ZipFile(f'{path}.zip').extractall(path)

df = pd.read_csv(path/'train.csv')
tst_df = pd.read_csv(path/'test.csv')
modes = df.mode().iloc[0]

def proc_data(df):
    df['Fare'] = df.Fare.fillna(0)
    df.fillna(modes, inplace=True)
    df['LogFare'] = np.log1p(df['Fare'])
    df['Embarked'] = pd.Categorical(df.Embarked)
    df['Sex'] = pd.Categorical(df.Sex)

proc_data(df)
proc_data(tst_df)

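A small aside on why log1p rather than log (this check is not part of the original notebook): the zero fares that proc_data fills in would map to -inf under a plain log.

np.log1p([0, 10, 100])   # log(1+x): zero fares map to 0.0 instead of -inf
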
cats=["Sex","Embarked"] conts=['Age', 'SibSp', 'Parch', 'LogFare',"Pclass"] dep="Survived"
df.Sex.head()
df.Sex.cat.codes.head()

Binary splits

import seaborn as sns

fig,axs = plt.subplots(1,2, figsize=(11,5))
sns.barplot(data=df, y=dep, x="Sex", ax=axs[0]).set(title="Survival rate")
sns.countplot(data=df, x="Sex", ax=axs[1]).set(title="Histogram");

from numpy import random
from sklearn.model_selection import train_test_split

random.seed(42)
trn_df,val_df = train_test_split(df, test_size=0.25)
trn_df[cats] = trn_df[cats].apply(lambda x: x.cat.codes)
val_df[cats] = val_df[cats].apply(lambda x: x.cat.codes)

def xs_y(df):
    xs = df[cats+conts].copy()
    return xs, (df[dep] if dep in df else None)

trn_xs,trn_y = xs_y(trn_df)
val_xs,val_y = xs_y(val_df)

preds = val_xs.Sex==0
from sklearn.metrics import mean_absolute_error

mean_absolute_error(val_y, preds)

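Because Survived is 0/1 and preds is boolean, the mean absolute error here is just the misclassification rate of the "all females survive" rule. A quick check (not in the original notebook):

(val_y != preds).mean()   # same number: MAE of 0/1 targets vs boolean preds is the error rate
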
df_fare = trn_df[trn_df.LogFare>0]
fig,axs = plt.subplots(1,2, figsize=(11,5))
sns.boxenplot(data=df_fare, x=dep, y="LogFare", ax=axs[0])
sns.kdeplot(data=df_fare, x="LogFare", ax=axs[1]);

preds = val_xs.LogFare>2.7
mean_absolute_error(val_y, preds)
def _side_score(side, y):
    tot = side.sum()
    if tot<=1: return 0
    return y[side].std()*tot

def score(col, y, split):
    lhs = col<=split
    return (_side_score(lhs,y) + _side_score(~lhs,y))/len(y)

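This score is the standard deviation of the dependent variable within each side of the split, weighted by group size, so lower is better. A tiny synthetic check (not part of the original notebook; demo_col is made up):

demo_col = pd.Series([1, 2, 3, 4])
# A split that isolates each class perfectly scores 0; a useless one does not.
score(demo_col, pd.Series([0, 0, 1, 1]), 2.5), score(demo_col, pd.Series([0, 1, 0, 1]), 2.5)
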
score(trn_xs["Sex"], trn_y, 0.5)
score(trn_xs["LogFare"], trn_y, 2.7)
def iscore(nm, split):
    col = trn_xs[nm]
    return score(col, trn_y, split)

from ipywidgets import interact
interact(nm=conts, split=15.5)(iscore);

interact(nm=cats, split=2)(iscore);
nm = "Age" col = trn_xs[nm] unq = col.unique() unq.sort() unq
scores = np.array([score(col, trn_y, o) for o in unq if not np.isnan(o)])
unq[scores.argmin()]

def min_col(df, nm):
    col,y = df[nm],df[dep]
    unq = col.dropna().unique()
    scores = np.array([score(col, y, o) for o in unq if not np.isnan(o)])
    idx = scores.argmin()
    return unq[idx],scores[idx]

min_col(trn_df, "Age")

cols = cats+conts
{o:min_col(trn_df, o) for o in cols}

Creating a decision tree

cols.remove("Sex") ismale = trn_df.Sex==1 males,females = trn_df[ismale],trn_df[~ismale]
{o:min_col(males, o) for o in cols}
{o:min_col(females, o) for o in cols}
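
The two dictionaries above give the best next split on each side of the Sex split; repeating that procedure recursively is exactly how a decision tree is grown. A minimal recursive sketch of the idea, not part of the original notebook (the grow_tree helper and its fixed depth are illustrative choices):

def grow_tree(df, cols, depth=2):
    # Leaf: predict the survival rate of the remaining group.
    if depth == 0 or len(df) < 2: return df[dep].mean()
    # Pick the column whose best binary split has the lowest score.
    splits = {o: min_col(df, o) for o in cols}
    nm = min(splits, key=lambda o: splits[o][1])
    split = splits[nm][0]
    lhs = df[nm] <= split
    return (nm, split,
            grow_tree(df[lhs], cols, depth-1),
            grow_tree(df[~lhs], cols, depth-1))

grow_tree(trn_df, cats+conts)
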
from sklearn.tree import DecisionTreeClassifier, export_graphviz

m = DecisionTreeClassifier(max_leaf_nodes=4).fit(trn_xs, trn_y);

import graphviz

def draw_tree(t, df, size=10, ratio=0.6, precision=2, **kwargs):
    s=export_graphviz(t, out_file=None, feature_names=df.columns, filled=True, rounded=True,
                      special_characters=True, rotate=False, precision=precision, **kwargs)
    return graphviz.Source(re.sub('Tree {', f'Tree {{ size={size}; ratio={ratio}', s))

draw_tree(m, trn_xs, size=10)
def gini(cond):
    act = df.loc[cond, dep]
    return 1 - act.mean()**2 - (1-act).mean()**2

gini(df.Sex=='female'), gini(df.Sex=='male')
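
For a group with survival probability p this impurity is 1 - p^2 - (1-p)^2, which simplifies to 2p(1-p): zero for a pure group and maximal at p = 0.5. A quick algebra check (not in the original notebook):

p = df.loc[df.Sex=='female', dep].mean()
1 - p**2 - (1-p)**2, 2*p*(1-p)   # equal by algebra
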
mean_absolute_error(val_y, m.predict(val_xs))
m = DecisionTreeClassifier(min_samples_leaf=50)
m.fit(trn_xs, trn_y)
draw_tree(m, trn_xs, size=25)

mean_absolute_error(val_y, m.predict(val_xs))
tst_df[cats] = tst_df[cats].apply(lambda x: x.cat.codes)
tst_xs,_ = xs_y(tst_df)

def subm(preds, suff):
    tst_df['Survived'] = preds
    sub_df = tst_df[['PassengerId','Survived']]
    sub_df.to_csv(f'sub-{suff}.csv', index=False)

subm(m.predict(tst_xs), 'tree')
df.Embarked.head()
df.Embarked.cat.codes.head()

The random forest

def get_tree(prop=0.75):
    n = len(trn_y)
    # random.choice samples with replacement by default, so each tree
    # sees its own random subset of about 75% of the training data
    idxs = random.choice(n, int(n*prop))
    return DecisionTreeClassifier(min_samples_leaf=5).fit(trn_xs.iloc[idxs], trn_y.iloc[idxs])

trees = [get_tree() for t in range(100)]
all_probs = [t.predict(val_xs) for t in trees]
avg_probs = np.stack(all_probs).mean(0)

mean_absolute_error(val_y, avg_probs)

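The point of bagging is that averaging many roughly uncorrelated trees pushes the error down. One way to watch that happen, not part of the original notebook, is to plot the validation error of the running average as trees are added:

errs = [mean_absolute_error(val_y, np.stack(all_probs[:i+1]).mean(0))
        for i in range(len(all_probs))]
plt.plot(errs);   # typically drops quickly, then flattens as trees are added
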
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(100, min_samples_leaf=5)
rf.fit(trn_xs, trn_y);
mean_absolute_error(val_y, rf.predict(val_xs))

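Note that rf.predict returns hard 0/1 labels, whereas the handmade ensemble above averaged them into probabilities; for a like-for-like comparison you could score the forest's averaged probabilities instead (not in the original notebook):

mean_absolute_error(val_y, rf.predict_proba(val_xs)[:,1])   # column 1 is P(Survived=1)
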
subm(rf.predict(tst_xs), 'rf')
pd.DataFrame(dict(cols=trn_xs.columns, imp=rf.feature_importances_)).plot('cols', 'imp', 'barh');

Conclusion