# Source: GitHub Gist by @zhengyangchoong
# (gist id cee404d7c8cf71419a3e7dc6874afedb), created May 1, 2017 06:54.
import numpy as np
import random
import csv
import collections
import matplotlib.pyplot as plt
import math
from sklearn import linear_model
from sklearn import svm
import cPickle as pickle
# Select matplotlib's 'fivethirtyeight' style sheet before any figure is drawn.
plt.style.use('fivethirtyeight')
def parseCSV(filename, default_age=30.0):
    """Read a comma-separated file into a dictionary of column lists.

    The first row is taken as the header row; every later row contributes one
    entry to each column list, in file order.

    Cell conversion rules:
      * In the "Sex" column, "male" becomes -1.0 and "female" becomes 1.0.
      * An empty "Age" cell is replaced by ``default_age``.
      * Any cell that ``float()`` accepts is stored as a float.
      * Any other non-empty cell is stored as the original string.
      * Any remaining empty cell is stored as ``None``.

    Args:
        filename: path of the CSV file to read.
        default_age: value substituted for an empty "Age" cell. Pass ``None``
            to leave missing ages as ``None``.

    Returns:
        A ``(headers, predictors)`` tuple: the list of header names and a dict
        mapping each header name to its list of column values. An empty file
        yields ``([], {})``.
    """
    import sys  # function-scope import: only needed to choose the file mode

    # The csv module wants a binary file on Python 2, but a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] < 3:
        handle = open(filename, 'rb')
    else:
        handle = open(filename, 'r', newline='')

    with handle:
        reader = csv.reader(handle, delimiter=",")
        headers = next(reader, [])  # an empty file has no header row
        predictors = {name: [] for name in headers}

        # Files without a Sex or Age column are read without recoding.
        sex_col = headers.index("Sex") if "Sex" in headers else None
        age_col = headers.index("Age") if "Age" in headers else None
        sex_codes = {"male": -1.0, "female": 1.0}

        for row in reader:
            if not row:
                # csv yields [] for a blank line; there is nothing to record.
                continue
            # Pad short rows so every column list keeps the same length.
            row = row + [''] * (len(headers) - len(row))
            for col, (name, raw) in enumerate(zip(headers, row)):
                if col == sex_col:
                    raw = sex_codes.get(raw, raw)
                elif col == age_col and raw == '' and default_age is not None:
                    raw = default_age
                try:
                    value = float(raw)
                except ValueError:
                    # Not numeric (names, tickets, cabins, empty cells).
                    value = raw
                predictors[name].append(None if value == '' else value)

    return (headers, predictors)
# headers: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
def basicstuff(predictors, plot=False):
    """Print headline counts for the data set and optionally save two charts.

    Always prints the number of passengers (length of the 'PassengerId'
    column) and the sum of the 'Survived' column, skipping ``None`` entries.

    Args:
        predictors: dict mapping column names to lists of values; must contain
            'PassengerId' and 'Survived', plus 'Age' and 'Pclass' when
            ``plot`` is true.
        plot: when true, also write "histogram_age.png" and
            "histogram_class.png" to the current directory. Defaults to
            False, which keeps the original print-only behaviour.
    """
    print("No. of people: {}".format(len(predictors['PassengerId'])))
    # Original author's note: survival fraction is 38.4%.
    recorded = [flag for flag in predictors['Survived'] if flag is not None]
    print("No. survived: {}".format(sum(recorded)))

    def plotAgeHist():
        # Histogram of the known ages; None marks a missing value.
        plt.hist([age for age in predictors['Age'] if age is not None])
        plt.xlabel("Age")
        plt.ylabel("Frequency")
        plt.savefig("histogram_age.png", bbox_inches='tight', dpi=300)
        plt.clf()

    def plotClass():
        # Bar chart of how many passengers fall in each of the three classes.
        counts = collections.Counter(predictors['Pclass'])
        print(counts)
        x = [1.0, 2.0, 3.0]
        y = [counts[pclass] for pclass in x]
        print(x)
        print(y)
        plt.ylabel("Frequency")
        plt.bar(x, y, color='C0', align='center')
        plt.xticks(x, ["First class", "Second class", "Third class"])
        plt.savefig("histogram_class.png", bbox_inches='tight', dpi=300)
        plt.clf()

    if plot:
        plotAgeHist()
        plotClass()
def crossplots(predictors, save_plots=False):
    '''Relate survival rate to passenger class, sex and (optionally) age.

    Prints the ``np.corrcoef`` matrix of survival rate against class
    (1, 2, 3) and against sex (-1 male, 1 female).

    Args:
        predictors: dict of column lists with 'Pclass', 'Sex', 'Age' and
            'Survived' entries, aligned by position.
        save_plots: when true, save the class plot as
            "hist_survivalbyclass.png" and additionally build and save the
            age plot as "agesurvival.png". Defaults to False, which keeps
            the original behaviour (nothing is written to disk).
    '''
    def _rate(outcomes):
        # Mean of the survival flags. float() avoids Python 2 integer
        # division; an empty group gives NaN instead of ZeroDivisionError.
        if not outcomes:
            return float('nan')
        return sum(outcomes) / float(len(outcomes))

    def classSurvival():
        by_class = {1.0: [], 2.0: [], 3.0: []}
        for pclass, survived in zip(predictors["Pclass"],
                                    predictors["Survived"]):
            if pclass in by_class:
                by_class[pclass].append(survived)
        x = [1.0, 2.0, 3.0]
        y = [_rate(by_class[pclass]) for pclass in x]
        print(np.corrcoef(x, y))
        plt.plot(x, y)
        plt.xticks(x, ["First class", "Second class", "Third class"])
        plt.ylabel("Probability of survival")
        if save_plots:
            plt.savefig("hist_survivalbyclass.png",
                        bbox_inches='tight', dpi=300)
        plt.clf()

    def ageSurvival():
        # Group passengers into 5-year age bins, skipping unknown ages.
        bins = collections.defaultdict(list)
        for age, survived in zip(predictors["Age"], predictors["Survived"]):
            if age is None:
                continue
            bins[math.floor(age / 5.0)].append(survived)
        ordered = sorted(bins)
        survivalwithage = [_rate(bins[key]) for key in ordered]
        # Label each bin by its upper bound, hence the "<Age" axis title.
        xes = [(key + 1) * 5 for key in ordered]
        plt.xlabel("<Age")
        plt.ylabel("Probability of survival")
        plt.plot(xes, survivalwithage)
        plt.savefig("agesurvival.png", bbox_inches='tight', dpi=300)
        plt.clf()

    def genderSurvival():
        pairs = list(zip(predictors["Sex"], predictors["Survived"]))
        m = [survived for sex, survived in pairs if sex == -1.0]
        f = [survived for sex, survived in pairs if sex == 1.0]
        y = [_rate(m), _rate(f)]
        # NOTE: with only two points the coefficient can only be +1, -1 or NaN.
        print(np.corrcoef([-1, 1], y))

    classSurvival()
    genderSurvival()
    if save_plots:
        ageSurvival()
def classify(predictors, testpredictors, model_path="model.p",
             answer_path="answer.csv", fit_logistic=False):
    """Predict survival for the test set with a pickled model and write a CSV.

    Each passenger is turned into the feature list ``[Age, Sex, Pclass]``.
    The classifier stored at ``model_path`` is unpickled, asked to predict the
    test features, and the predictions are written to ``answer_path`` under
    the header "PassengerId,Survived". The number of test rows is printed.

    The commented-out code this replaces suggests the pickle was produced
    from ``svm.SVC(kernel='poly', degree=2)`` fitted on the training split;
    that is not verified here.

    Args:
        predictors: training columns; needs 'Age', 'Sex', 'Pclass' and
            'Survived'.
        testpredictors: test columns; needs 'Age', 'Sex', 'Pclass' and
            'PassengerId'.
        model_path: pickle file holding an object with a ``predict`` method.
        answer_path: output CSV path; overwritten if it exists.
        fit_logistic: when true, first fit a logistic regression on all but
            the last 50 training rows and print how many of those 50 it
            predicts correctly, followed by the model parameters. Defaults to
            False, matching the original where this step was disabled.
    """
    def features(table):
        # Feature order must match the order the pickled model was trained on.
        return [[age, sex, pclass] for age, sex, pclass
                in zip(table['Age'], table['Sex'], table['Pclass'])]

    x = features(predictors)
    y = list(predictors['Survived'])
    realx = features(testpredictors)
    ids = list(testpredictors["PassengerId"])
    print(len(realx))

    def logisticfit():
        # Hold out the last 50 rows for scoring; max() reproduces the
        # x[:-50] / x[-50:] slices when fewer than 50 rows exist.
        split = max(len(x) - 50, 0)
        x_train, y_train = x[:split], y[:split]
        x_test, y_test = x[split:], y[split:]
        logistic = linear_model.LogisticRegression(C=1e6)
        logistic.fit(x_train, y_train)
        testresult = logistic.predict(x_test)
        print(sum(1 for guess, truth in zip(testresult, y_test)
                  if guess == truth))
        print(logistic.get_params())

    def svmfit():
        # SECURITY: unpickling can execute arbitrary code. Only load model
        # files that you created yourself.
        with open(model_path, "rb") as model_file:
            svc = pickle.load(model_file)
        final = svc.predict(realx)
        # 'with' guarantees the answer file is flushed and closed.
        with open(answer_path, "w") as out:
            out.write("PassengerId,Survived\n")
            for passenger_id, label in zip(ids, final):
                out.write("{},{}\n".format(int(passenger_id), int(label)))

    if fit_logistic:
        logisticfit()
    svmfit()
def main():
    """Parse 'train.csv' and run the survival cross-plots on it."""
    # parseCSV returns (headers, predictors); only the column dict is used.
    parsed = parseCSV('train.csv')
    crossplots(parsed[1])
    # The classification step (parse 'test.csv', then call classify) is
    # currently disabled.


main()
# (GitHub page footer) Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment