📚 The CoCalc Library - books, templates and other resources
License: OTHER
""" Now that the user can read in a file this creates a model which uses the price, class and gender1Author : AstroDave2Date : 18th September 20123Revised : 28 March 201445"""678import csv as csv9import numpy as np1011csv_file_object = csv.reader(open('train.csv', 'rb')) # Load in the csv file12header = csv_file_object.next() # Skip the fist line as it is a header13data=[] # Create a variable to hold the data1415for row in csv_file_object: # Skip through each row in the csv file16data.append(row) # adding each row to the data variable17data = np.array(data) # Then convert from a list to an array1819# In order to analyse the price column I need to bin up that data20# here are my binning parameters, the problem we face is some of the fares are very large21# So we can either have a lot of bins with nothing in them or we can just lose some22# information by just considering that anythng over 39 is simply in the last bin.23# So we add a ceiling24fare_ceiling = 4025# then modify the data in the Fare column to = 39, if it is greater or equal to the ceiling26data[ data[0::,9].astype(np.float) >= fare_ceiling, 9 ] = fare_ceiling - 1.02728fare_bracket_size = 1029number_of_price_brackets = fare_ceiling / fare_bracket_size30number_of_classes = 3 # I know there were 1st, 2nd and 3rd classes on board.31number_of_classes = len(np.unique(data[0::,2])) # But it's better practice to calculate this from the Pclass directly:32# just take the length of an array of UNIQUE values in column index 2333435# This reference matrix will show the proportion of survivors as a sorted table of36# gender, class and ticket fare.37# First initialize it with all zeros38survival_table = np.zeros([2,number_of_classes,number_of_price_brackets],float)3940# I can now find the stats of all the women and men on board41for i in xrange(number_of_classes):42for j in xrange(number_of_price_brackets):4344women_only_stats = data[ (data[0::,4] == "female") \45& (data[0::,2].astype(np.float) == i+1) \46& (data[0:,9].astype(np.float) >= j*fare_bracket_size) \47& (data[0:,9].astype(np.float) < (j+1)*fare_bracket_size), 1]4849men_only_stats = data[ (data[0::,4] != "female") \50& (data[0::,2].astype(np.float) == i+1) \51& (data[0:,9].astype(np.float) >= j*fare_bracket_size) \52& (data[0:,9].astype(np.float) < (j+1)*fare_bracket_size), 1]5354#if i == 0 and j == 3:5556survival_table[0,i,j] = np.mean(women_only_stats.astype(np.float)) # Female stats57survival_table[1,i,j] = np.mean(men_only_stats.astype(np.float)) # Male stats5859# Since in python if it tries to find the mean of an array with nothing in it60# (such that the denominator is 0), then it returns nan, we can convert these to 061# by just saying where does the array not equal the array, and set these to 0.62survival_table[ survival_table != survival_table ] = 0.6364# Now I have my proportion of survivors, simply round them such that if <0.565# I predict they dont surivive, and if >= 0.5 they do66survival_table[ survival_table < 0.5 ] = 067survival_table[ survival_table >= 0.5 ] = 16869# Now I have my indicator I can read in the test file and write out70# if a women then survived(1) if a man then did not survived (0)71# First read in test72test_file = open('test.csv', 'rb')73test_file_object = csv.reader(test_file)74header = test_file_object.next()7576# Also open the a new file so I can write to it.77predictions_file = open("genderclassmodel.csv", "wb")78predictions_file_object = csv.writer(predictions_file)79predictions_file_object.writerow(["PassengerId", "Survived"])8081# First thing to do is bin up the price file82for row in test_file_object:83for j in xrange(number_of_price_brackets):84# If there is no fare then place the price of the ticket according to class85try:86row[8] = float(row[8]) # No fare recorded will come up as a string so87# try to make it a float88except: # If fails then just bin the fare according to the class89bin_fare = 3 - float(row[1])90break # Break from the loop and move to the next row91if row[8] > fare_ceiling: # Otherwise now test to see if it is higher92# than the fare ceiling we set earlier93bin_fare = number_of_price_brackets - 194break # And then break to the next row9596if row[8] >= j*fare_bracket_size\97and row[8] < (j+1)*fare_bracket_size: # If passed these tests then loop through98# each bin until you find the right one99# append it to the bin_fare100# and move to the next loop101bin_fare = j102break103# Now I have the binned fare, passenger class, and whether female or male, we can104# just cross ref their details with our survival table105if row[3] == 'female':106predictions_file_object.writerow([row[0], "%d" % int(survival_table[ 0, float(row[1]) - 1, bin_fare ])])107else:108predictions_file_object.writerow([row[0], "%d" % int(survival_table[ 1, float(row[1]) - 1, bin_fare])])109110# Close out the files111test_file.close()112predictions_file.close()113114