CoCalc -- myfirstforest.py

📚 The CoCalc Library - books, templates and other resources
cocalc-examples / data-science-ipython-notebooks / data / titanic / myfirstforest.py
132958 views
License: OTHER
""" Writing my first random forest code.
Author : AstroDave
Date : 23rd September 2012
Revised: 15 April 2014
please see packages.python.org/milk/randomforests.html for more

"""
8
import pandas as pd
9
import numpy as np
10
import csv as csv
11
from sklearn.ensemble import RandomForestClassifier
12

13
# Data cleanup
# TRAIN DATA
train_df = pd.read_csv('train.csv', header=0)        # Load the train file into a dataframe

# Every string column used below has to become an integer code, and every
# missing value has to be filled in, so the frame can later be used as a
# plain numeric array.

# female = 0, male = 1
# (astype(int) raises if 'Sex' holds anything other than these two values)
train_df['Gender'] = train_df['Sex'].map({'female': 0, 'male': 1}).astype(int)

# Embarked from 'C', 'Q', 'S'
# Note this is not ideal: in translating categories to numbers, Port "2" is not 2 times greater than Port "1", etc.

# All missing Embarked -> just make them embark from the most common place.
# The write goes through .loc on the frame itself; the chained form
# `train_df.Embarked[mask] = ...` may end up modifying a temporary copy.
# mode() can return several values on a tie, so take the first one to get a
# single scalar to broadcast.
if train_df['Embarked'].isnull().any():
    train_df.loc[train_df['Embarked'].isnull(), 'Embarked'] = \
        train_df['Embarked'].dropna().mode().iloc[0]

Ports = list(enumerate(np.unique(train_df['Embarked'])))    # (index, port) pairs for all values of Embarked
Ports_dict = {name: i for i, name in Ports}                 # dictionary in the form  port : index
# Convert all Embarked strings to int; the explicit lookup raises KeyError
# on a port that is not in Ports_dict instead of silently producing NaN.
train_df['Embarked'] = train_df['Embarked'].map(lambda x: Ports_dict[x]).astype(int)

# All the ages with no data -> make the median of all Ages
median_age = train_df['Age'].dropna().median()
if train_df['Age'].isnull().any():
    train_df.loc[train_df['Age'].isnull(), 'Age'] = median_age

# Remove the Name, Ticket, Cabin and PassengerId columns, plus Sex
# (since it was copied and encoded into Gender)
train_df = train_df.drop(['Name', 'Sex', 'Ticket', 'Cabin', 'PassengerId'], axis=1)
41

42

43
# TEST DATA
test_df = pd.read_csv('test.csv', header=0)        # Load the test file into a dataframe

# Do the same with the test data now, so that the columns are the same as
# the training data. All strings are converted to integer codes:
# female = 0, male = 1
test_df['Gender'] = test_df['Sex'].map({'female': 0, 'male': 1}).astype(int)

# Embarked from 'C', 'Q', 'S'
# All missing Embarked -> just make them embark from the most common place.
# The write goes through .loc on the frame itself; the chained form
# `test_df.Embarked[mask] = ...` may end up modifying a temporary copy.
# mode() can return several values on a tie, so take the first one.
if test_df['Embarked'].isnull().any():
    test_df.loc[test_df['Embarked'].isnull(), 'Embarked'] = \
        test_df['Embarked'].dropna().mode().iloc[0]
# Again convert all Embarked strings to int, reusing the Ports_dict table
# built from the training data so both frames share the same codes.
# A port that never appeared in the training data raises KeyError here.
test_df['Embarked'] = test_df['Embarked'].map(lambda x: Ports_dict[x]).astype(int)

# All the ages with no data -> make the median of all Ages
# (the median is recomputed from the test frame, not reused from training)
median_age = test_df['Age'].dropna().median()
if test_df['Age'].isnull().any():
    test_df.loc[test_df['Age'].isnull(), 'Age'] = median_age

# All the missing Fares -> assume median of their respective class.
# Classes are taken to be 1, 2 and 3; the class masks are disjoint, so
# filling one class cannot change the median computed for another.
if test_df['Fare'].isnull().any():
    for pclass in (1, 2, 3):
        in_class = test_df['Pclass'] == pclass
        class_median = test_df.loc[in_class, 'Fare'].dropna().median()
        test_df.loc[test_df['Fare'].isnull() & in_class, 'Fare'] = class_median

# Collect the test data's PassengerIds before dropping it
ids = test_df['PassengerId'].values
# Remove the Name, Ticket, Cabin and PassengerId columns, plus Sex
# (since it was copied and encoded into Gender)
test_df = test_df.drop(['Name', 'Sex', 'Ticket', 'Cabin', 'PassengerId'], axis=1)
76

77

78
# The data is now ready to go. So lets fit to the train, then predict to the test!
import sys

# Convert back to a numpy array
train_data = train_df.values
test_data = test_df.values

# print(...) with a single string prints the same text on Python 2 and 3.
print('Training...')
forest = RandomForestClassifier(n_estimators=100)
# Column 0 of the training array is used as the target and the remaining
# columns as features.
# NOTE(review): this assumes the label is the first column left in
# train.csv after the drops -- confirm against the data file.
forest = forest.fit(train_data[:, 1:], train_data[:, 0])

print('Predicting...')
output = forest.predict(test_data).astype(int)

# The csv module wants a binary handle on Python 2, but a text handle opened
# with newline='' on Python 3.
if sys.version_info[0] < 3:
    predictions_file = open("myfirstforest.csv", "wb")
else:
    predictions_file = open("myfirstforest.csv", "w", newline="")

# The with-block closes the file even if writing a row fails.
with predictions_file:
    open_file_object = csv.writer(predictions_file)     # despite the name, this is the csv writer
    open_file_object.writerow(["PassengerId", "Survived"])
    open_file_object.writerows(zip(ids, output))
print('Done.')
98

99
Product

Resources

Company