CoCalc -- genderclassmodel.py

📚 The CoCalc Library - books, templates and other resources
cocalc-examples / data-science-ipython-notebooks / data / titanic / genderclassmodel.py
132957 views
License: OTHER
1
""" Now that the user can read in a file, this creates a model which uses the price, class and gender
Author : AstroDave
Date : 18th September 2012
Revised : 28 March 2014

"""
7

8

9
import csv as csv
10
import numpy as np
11

12
import sys


def _open_csv(path, mode):
    """Open *path* the way the csv module expects on the running Python.

    Python 2's csv module needs binary-mode files, whereas Python 3's needs
    text-mode files opened with newline='' so it can handle line endings
    itself.  *mode* is 'r' or 'w'.
    """
    if sys.version_info[0] < 3:
        return open(path, mode + 'b')
    return open(path, mode, newline='')


# Read the whole training set into memory.  The with-block closes the file
# as soon as the rows have been consumed.
with _open_csv('train.csv', 'r') as train_file:
    csv_file_object = csv.reader(train_file)
    header = next(csv_file_object)      # Skip the first line as it is a header
    data = list(csv_file_object)        # Every remaining row, as a list of strings
data = np.array(data)                   # Then convert from a list to an array of strings
19

20
# In order to analyse the price column the fares have to be binned.
# Some of the fares are very large, so equal-width bins would either be
# mostly empty or very numerous.  Instead we give up a little information
# and treat anything at or above the ceiling as belonging to the last bin.
fare_ceiling = 40

# Column 9 holds the fare as text; overwrite every fare that is greater than
# or equal to the ceiling with a value just below it (39.0).
capped_fares = data[0::, 9].astype(float)
data[capped_fares >= fare_ceiling, 9] = fare_ceiling - 1.0

fare_bracket_size = 10
# Floor division keeps the bracket count an integer on Python 3 as well,
# which np.zeros() and range() require.
number_of_price_brackets = fare_ceiling // fare_bracket_size

# There were 1st, 2nd and 3rd classes on board, but rather than hard-coding 3
# count the UNIQUE values found in the Pclass column (index 2).
number_of_classes = len(np.unique(data[0::, 2]))
34

35

36
# This reference matrix holds, for every combination of gender, class and
# fare bracket, the predicted outcome (1 = survived, 0 = did not survive).
# Axis 0: 0 = female, 1 = male; axis 1: class - 1; axis 2: fare bracket.
# int() guards against a bracket count that was produced by true division.
bracket_count = int(number_of_price_brackets)
survival_table = np.zeros([2, number_of_classes, bracket_count], float)

# Convert the text columns once, outside the loops, instead of once per cell.
survived_flags = data[0::, 1].astype(float)
passenger_class = data[0::, 2].astype(float)
passenger_fare = data[0::, 9].astype(float)
is_female = data[0::, 4] == "female"
gender_masks = (is_female, ~is_female)      # Same order as axis 0 of the table

# Fill each cell with the proportion of survivors among matching passengers
for i in range(number_of_classes):
    for j in range(bracket_count):
        in_cell = (passenger_class == i + 1) \
            & (passenger_fare >= j * fare_bracket_size) \
            & (passenger_fare < (j + 1) * fare_bracket_size)

        for gender_index, gender_mask in enumerate(gender_masks):
            outcomes = survived_flags[in_cell & gender_mask]
            # The mean of an empty selection is nan (and numpy warns about
            # it), so cells with no passengers simply keep their initial 0.
            if outcomes.size:
                survival_table[gender_index, i, j] = outcomes.mean()

# Now round the proportions: below 0.5 predict they don't survive (0),
# at 0.5 or above predict they do (1).
survival_table = np.where(survival_table >= 0.5, 1.0, 0.0)
69

70
import sys


def _open_csv(path, mode):
    """Open *path* the way the csv module expects on the running Python.

    Python 2's csv module needs binary-mode files, whereas Python 3's needs
    text-mode files opened with newline='' so it can handle line endings
    itself.  *mode* is 'r' or 'w'.
    """
    if sys.version_info[0] < 3:
        return open(path, mode + 'b')
    return open(path, mode, newline='')


# Now that the survival table is ready, read in the test file and write out
# one prediction per passenger: the table entry for that passenger's gender,
# class and fare bracket.  Both files are closed when the with-block exits.
with _open_csv('test.csv', 'r') as test_file, \
        _open_csv("genderclassmodel.csv", "w") as predictions_file:
    test_file_object = csv.reader(test_file)
    header = next(test_file_object)         # Skip the header line

    predictions_file_object = csv.writer(predictions_file)
    predictions_file_object.writerow(["PassengerId", "Survived"])

    last_bracket = int(number_of_price_brackets) - 1

    for row in test_file_object:
        # Array indices must be integers, so convert the class text once here
        passenger_class = int(float(row[1]))

        # First bin up the fare (column 8 of the test file)
        try:
            fare = float(row[8])
        except ValueError:
            # No fare recorded (empty text): guess a bracket from the class
            # instead, so 1st class -> bracket 2 and 3rd class -> bracket 0.
            bin_fare = 3 - passenger_class
        else:
            if fare >= fare_ceiling:
                # Same rule as for the training data: anything at or above
                # the ceiling belongs to the last bracket.
                bin_fare = last_bracket
            else:
                # For fares below the ceiling this is the j for which
                # j*size <= fare < (j+1)*size
                bin_fare = int(fare // fare_bracket_size)

        # Cross reference gender, class and binned fare with the survival table
        gender_index = 0 if row[3] == 'female' else 1
        prediction = survival_table[gender_index, passenger_class - 1, bin_fare]
        predictions_file_object.writerow([row[0], "%d" % int(prediction)])
114
Product

Resources

Company