Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Download

📚 The CoCalc Library - books, templates and other resources

132958 views
License: OTHER
""" Writing my first random forest code.
Author : AstroDave
Date : 23rd September 2012
Revised: 15 April 2014
Please see packages.python.org/milk/randomforests.html for more.

"""
8
import pandas as pd
9
import numpy as np
10
import csv as csv
11
from sklearn.ensemble import RandomForestClassifier
12
13
# Data cleanup and random-forest pipeline.
#
# The cleanup that used to be written out twice (once for the train frame,
# once for the test frame) lives in prepare_features(); main() wires the
# pieces together and is only run when the file is executed as a script.

TRAIN_PATH = 'train.csv'
TEST_PATH = 'test.csv'
OUTPUT_PATH = 'myfirstforest.csv'

# Columns removed before modelling. 'Sex' goes because it is copied into the
# numeric 'Gender' column first.
DROPPED_COLUMNS = ['Name', 'Sex', 'Ticket', 'Cabin', 'PassengerId']


def _fill_missing_embarked(df):
    """Fill missing ``Embarked`` values in ``df`` with the most common port."""
    missing = df['Embarked'].isnull()
    modes = df['Embarked'].mode()
    if missing.any() and not modes.empty:
        # mode() may return several values on a tie; take the first one so a
        # single scalar is broadcast to every missing row. Assigning through
        # .loc writes to the frame itself (chained indexing may not).
        df.loc[missing, 'Embarked'] = modes.iloc[0]


def _build_port_codes(df):
    """Return a ``{port name: integer code}`` dict for ``df['Embarked']``.

    Codes follow the sorted order of the distinct port names.
    """
    ports = sorted(df['Embarked'].dropna().unique())
    return {name: code for code, name in enumerate(ports)}


def _encode_embarked(df, port_codes):
    """Replace the ``Embarked`` strings in ``df`` with their integer codes.

    Raises:
        KeyError: if ``df`` contains a port that is not in ``port_codes``.
    """
    codes = df['Embarked'].map(port_codes)
    unmapped = codes.isnull()
    if unmapped.any():
        unknown = df.loc[unmapped, 'Embarked'].unique().tolist()
        raise KeyError('Embarked values without a code: %r' % (unknown,))
    df['Embarked'] = codes.astype(int)


def _fill_missing_fare(df):
    """Fill missing ``Fare`` values with the median fare of the same Pclass."""
    missing = df['Fare'].isnull()
    if missing.any():
        class_medians = df.groupby('Pclass')['Fare'].median()
        df.loc[missing, 'Fare'] = df.loc[missing, 'Pclass'].map(class_medians)


def prepare_features(df, port_codes=None):
    """Convert a raw passenger frame into an all-numeric, complete frame.

    Steps: map Sex to Gender (female = 0, male = 1), fill missing Embarked
    with the most common port and encode it as an integer, fill missing Age
    with the frame's own median age, fill missing Fare with the median fare
    of the passenger's class, then drop the columns in DROPPED_COLUMNS.

    Note the port encoding is not ideal: in translating categories to
    numbers, port "2" is not 2 times greater than port "1", etc.

    Args:
        df: raw frame as loaded from the CSV file. It is not modified.
        port_codes: optional ``{port name: code}`` dict to reuse, so that a
            second frame is encoded exactly like the first. When omitted the
            mapping is derived from ``df``.

    Returns:
        A ``(features, port_codes)`` tuple: the cleaned copy of ``df`` and
        the port mapping that was applied.

    Raises:
        KeyError: if ``port_codes`` is given and ``df`` has an unknown port.
    """
    df = df.copy()

    # female = 0, male = 1
    df['Gender'] = df['Sex'].map({'female': 0, 'male': 1}).astype(int)

    _fill_missing_embarked(df)
    if port_codes is None:
        port_codes = _build_port_codes(df)
    _encode_embarked(df, port_codes)

    # All the ages with no data -> make the median of all ages in this frame.
    df['Age'] = df['Age'].fillna(df['Age'].median())

    _fill_missing_fare(df)

    return df.drop(DROPPED_COLUMNS, axis=1), port_codes


def main():
    """Train on train.csv, predict test.csv and write myfirstforest.csv."""
    train_df, port_codes = prepare_features(pd.read_csv(TRAIN_PATH, header=0))

    raw_test_df = pd.read_csv(TEST_PATH, header=0)
    # Collect the test data's PassengerIds before the column is dropped.
    ids = raw_test_df['PassengerId'].values
    test_df, _ = prepare_features(raw_test_df, port_codes)

    # Select label and features by name so the train and test matrices are
    # guaranteed to have the same columns in the same order.
    feature_columns = [col for col in train_df.columns if col != 'Survived']
    train_features = train_df[feature_columns].values
    train_labels = train_df['Survived'].values
    test_features = test_df[feature_columns].values

    print('Training...')
    forest = RandomForestClassifier(n_estimators=100)
    forest = forest.fit(train_features, train_labels)

    print('Predicting...')
    output = forest.predict(test_features).astype(int)

    # Text mode with newline='' is what the csv module expects on Python 3;
    # the with-block closes the file even if writing fails.
    with open(OUTPUT_PATH, 'w', newline='') as predictions_file:
        writer = csv.writer(predictions_file)
        writer.writerow(["PassengerId", "Survived"])
        writer.writerows(zip(ids.tolist(), output.tolist()))
    print('Done.')


if __name__ == "__main__":
    main()