Path: blob/master/clean/06-why-you-should-use-a-framework.ipynb
807 views
Kernel: python3
Introduction and setup
In [ ]:
from pathlib import Path import os iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '') if iskaggle: path = Path('../input/titanic') !pip install -Uqq fastai else: import zipfile,kaggle path = Path('titanic') kaggle.api.competition_download_cli(str(path)) zipfile.ZipFile(f'{path}.zip').extractall(path)
In [ ]:
# Star import: later cells use `pd`, `np`, `torch` and the tabular API
# without importing them, so they presumably all come from here.
from fastai.tabular.all import *

# Render floats in DataFrame output with two decimal places.
pd.set_option('display.float_format', '{:.2f}'.format)

# Seed the random number generators so runs are repeatable.
set_seed(42)
Prep the data
In [ ]:
# Load the training set from the dataset directory chosen in the setup cell.
df = pd.read_csv(path.joinpath('train.csv'))
In [ ]:
def add_features(df):
    """Add engineered feature columns to a Titanic passenger frame, in place.

    Reads the columns ``Fare``, ``Cabin``, ``SibSp``, ``Parch``, ``Ticket``
    and ``Name``; writes ``LogFare``, ``Deck``, ``Family``, ``Alone``,
    ``TicketFreq`` and ``Title``. Returns ``None``; ``df`` itself is mutated.
    """
    # log1p is log(1 + x), so a fare of 0 maps to 0 instead of -inf.
    df['LogFare'] = np.log1p(df['Fare'])

    # First character of the cabin, bucketed into three groups. Missing
    # cabins and letters not listed here become NaN.
    deck_groups = dict(A="ABC", B="ABC", C="ABC", D="DE", E="DE", F="FG", G="FG")
    df['Deck'] = df.Cabin.str[0].map(deck_groups)

    # SibSp + Parch counts the passenger's relatives aboard and does not
    # include the passenger, so travelling alone means a count of 0.
    df['Family'] = df.SibSp + df.Parch
    df['Alone'] = df.Family == 0

    # Number of rows in this frame sharing the same ticket value.
    df['TicketFreq'] = df.groupby('Ticket')['Ticket'].transform('count')

    # The title is the text between ", " and the first "." in Name.
    raw_titles = df.Name.str.split(', ', expand=True)[1].str.split('.', expand=True)[0]
    # Keep only the four listed titles; every other title becomes NaN.
    # The per-row mapping is assigned directly: a value_counts() summary is
    # indexed by title, not by row, and would not align with df's index.
    kept_titles = dict(Mr="Mr", Miss="Miss", Mrs="Mrs", Master="Master")
    df['Title'] = raw_titles.map(kept_titles)


add_features(df)
In [ ]:
# Split the rows of df with a fixed seed (42) so the split is repeatable.
splits = RandomSplitter(seed=42)(df)
In [ ]:
# Wrap df for tabular training and build the DataLoaders. The categorical and
# continuous lists include the columns created by add_features
# (Deck, Title, LogFare, Alone, TicketFreq, Family); Survived is the target,
# treated as a category.
dls = TabularPandas(
    df,
    procs=[Categorify, FillMissing, Normalize],
    cat_names=["Sex","Pclass","Embarked","Deck", "Title"],
    cont_names=['Age', 'SibSp', 'Parch', 'LogFare', 'Alone', 'TicketFreq', 'Family'],
    y_names="Survived",
    y_block=CategoryBlock(),
    splits=splits,
).dataloaders(path=".")
Train the model
In [ ]:
# Tabular learner on dls with layers=[10,10], reporting accuracy as the metric.
learn = tabular_learner(dls, metrics=accuracy, layers=[10,10])
In [ ]:
# Run the learning-rate finder, asking for both the `slide` and `valley`
# suggestions so a rate can be picked by hand for the fit below.
learn.lr_find(suggest_funcs=(slide, valley))
In [ ]:
# Train for 16 epochs at a fixed learning rate of 0.03.
learn.fit(16, lr=0.03)
Submit to Kaggle
In [ ]:
# Load the test set and give it the same engineered columns as the training set.
tst_df = pd.read_csv(path.joinpath('test.csv'))
# Replace missing fares with 0 before add_features derives LogFare from Fare.
tst_df['Fare'] = tst_df['Fare'].fillna(0)
add_features(tst_df)
In [ ]:
# Build a test DataLoader from tst_df using the learner's existing DataLoaders.
tst_dl = learn.dls.test_dl(tst_df)
In [ ]:
# get_preds returns a pair here; only the first element (the predictions) is kept.
preds,_ = learn.get_preds(dl=tst_dl)
In [ ]:
# Threshold column 1 of preds (presumably the probability of the
# Survived == 1 class) at 0.5 to get a 0/1 label per passenger.
tst_df['Survived'] = preds[:, 1].gt(0.5).int()

# The submission file holds only the id and the predicted label.
sub_df = tst_df[['PassengerId','Survived']]
sub_df.to_csv('sub.csv', index=False)
In [ ]:
# Shell out to preview the first lines of the submission file just written.
!head sub.csv
Ensembling
In [ ]:
def ensemble():
    """Train one fresh model and return its predictions for the test set.

    Builds a new tabular learner on the notebook-level ``dls``, fits it for
    16 epochs at lr=0.03 with the progress bar and logging suppressed, and
    returns the first element of ``get_preds`` on the notebook-level
    ``tst_dl``.
    """
    model = tabular_learner(dls, layers=[10,10], metrics=accuracy)
    # Silence per-epoch output; this function is meant to be called repeatedly.
    with model.no_bar():
        with model.no_logging():
            model.fit(16, lr=0.03)
    outputs = model.get_preds(dl=tst_dl)
    return outputs[0]
In [ ]:
# Train five independent models. Despite the name, each element is the
# prediction tensor returned by ensemble(), not a learner object.
learns = [ensemble() for _ in range(5)]
In [ ]:
# Stack the per-model predictions along a new leading axis and average over it.
ens_preds = torch.stack(learns).mean(0)
In [ ]:
# Threshold column 1 of the averaged predictions (presumably the probability
# of the Survived == 1 class) at 0.5 to get a 0/1 label per passenger.
tst_df['Survived'] = ens_preds[:, 1].gt(0.5).int()

# Written to a separate file from the single-model submission.
sub_df = tst_df[['PassengerId','Survived']]
sub_df.to_csv('ens_sub.csv', index=False)