CoCalc -- gendermodel.py

📚 The CoCalc Library - books, templates and other resources
cocalc-examples / data-science-ipython-notebooks / data / titanic / gendermodel.py
²⁰¹⁸⁵⁷ views
License: OTHER
1
"""Gender-based baseline model for the Titanic data set.

This simple script is designed to teach a basic user how to read CSV files in
Python, find what proportion of males and females survived, and make a
predictive model based on that observation.

Running the script reads ``train.csv`` and ``test.csv`` from the current
directory, prints the survival proportion for women and for men, and writes
the predictions to ``gendermodel.csv``.

Author : AstroDave
Date : 18 September 2012
Revised: 28 March 2014
"""

import csv as csv
import numpy as np

# Column positions used by this script.  Note that 'Sex' is read from
# index 4 in train.csv but from index 3 in test.csv.
TRAIN_SURVIVED_COLUMN = 1
TRAIN_SEX_COLUMN = 4
TEST_PASSENGER_ID_COLUMN = 0
TEST_SEX_COLUMN = 3


def read_csv_rows(path):
    """Read the CSV file at *path* and return ``(header, rows)``.

    Args:
        path: Location of the CSV file to read.

    Returns:
        A tuple of the first line (list of column names) and a list holding
        every remaining row, each row being a list of strings.

    Raises:
        ValueError: If the file contains no lines at all.
    """
    # newline='' is what the csv module asks for so that quoted fields
    # containing line breaks are parsed correctly; 'with' closes the file
    # even if parsing fails.
    with open(path, newline='', encoding='utf-8') as csv_file:
        reader = csv.reader(csv_file)
        header = next(reader, None)     # Skip the first line as it is a header
        if header is None:
            raise ValueError('%s is empty' % path)
        return header, list(reader)


def survival_rate(survived):
    """Return the fraction of passengers that survived.

    Args:
        survived: Numeric array of 0/1 survival flags.

    Returns:
        The sum of the flags divided by their count.
    """
    return np.sum(survived) / np.size(survived)


def predict_survival(sex):
    """Return the model's prediction for one passenger.

    Args:
        sex: Content of the passenger's 'Sex' field.

    Returns:
        "1" (survived) if *sex* is 'female', otherwise "0".
    """
    return "1" if sex == 'female' else "0"


def write_gender_predictions(test_path, output_path):
    """Write a PassengerId/Survived prediction file for the rows in *test_path*.

    Args:
        test_path: CSV file with the passengers to predict; its first line
            is treated as a header and skipped.
        output_path: CSV file to create (overwritten if it already exists).
    """
    # Read the test file first, so no output is created if it is unreadable.
    _, test_rows = read_csv_rows(test_path)

    with open(output_path, 'w', newline='', encoding='utf-8') as predictions_file:
        predictions_file_object = csv.writer(predictions_file)
        predictions_file_object.writerow(["PassengerId", "Survived"])  # write the column headers
        for row in test_rows:           # For each row in the test file,
            predictions_file_object.writerow([
                row[TEST_PASSENGER_ID_COLUMN],                  # write the PassengerId
                predict_survival(row[TEST_SEX_COLUMN]),         # and the prediction
            ])


if __name__ == "__main__":
    header, data = read_csv_rows('train.csv')   # Load in the csv file
    data = np.array(data)                       # Then convert from a list to an array.

    # Now I have an array with one row per passenger.
    # I can access any element I want, so the entire first column would
    # be data[0::,0].astype(float) -- This means all of the rows (from start
    # to end), in column 0.
    # I have to add the .astype() command, because the csv module reads
    # every field as a string - so it needs to be converted.

    # Set some variables
    number_passengers = np.size(data[0::, TRAIN_SURVIVED_COLUMN].astype(float))
    number_survived = np.sum(data[0::, TRAIN_SURVIVED_COLUMN].astype(float))
    proportion_survivors = number_survived / number_passengers

    # I can now find the stats of all the women on board,
    # by making an array that lists True/False whether each row is female
    women_only_stats = data[0::, TRAIN_SEX_COLUMN] == "female"  # This finds where all the women are
    men_only_stats = data[0::, TRAIN_SEX_COLUMN] != "female"    # This finds where all the men are (note != means 'not equal')

    # I can now filter the whole data, to find statistics for just women, by
    # placing women_only_stats as a "mask" on my full data -- use it in place
    # of the '0::' part of the array index.
    # You can test it by placing it there and requesting column index [4];
    # the output should all read 'female'.
    # e.g. try typing this:   data[women_only_stats,4]
    women_onboard = data[women_only_stats, TRAIN_SURVIVED_COLUMN].astype(float)
    men_onboard = data[men_only_stats, TRAIN_SURVIVED_COLUMN].astype(float)

    # and derive some statistics about them
    proportion_women_survived = survival_rate(women_onboard)
    proportion_men_survived = survival_rate(men_onboard)

    print('Proportion of women who survived is %s' % proportion_women_survived)
    print('Proportion of men who survived is %s' % proportion_men_survived)

    # Now that I have my indicator that women were much more likely to survive,
    # I am done with the training set.
    # Now I will read in the test file and write out my simplistic prediction:
    # if female, then model that she survived (1)
    # if male, then model that he did not survive (0)
    #
    # Loop through each row in the test file, look in column index [3]
    # (which is 'Sex'), and write out the PassengerId and my prediction
    # to a new, descriptively named file.
    write_gender_predictions('test.csv', "gendermodel.csv")
75

76

77
Product

Resources

Company