# jmsword/naive_bayes.py

## naive_bayes.py
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes classification of sex from weight data.
#
# Reading the data straight from GitHub kept failing with
# 'pandas.io.common.CParserError: Error tokenizing data. C error: Expected 1 fields in line 104, saw 3',
# so the data was copied into a csv file and is read from the local copy.
df = pd.read_csv('ideal_weight.csv')

# Strip the single quotes that wrap the column names in the raw file
df.rename(columns=lambda x: x.replace("'", ""), inplace=True)

# Strip the single quotes that wrap the values of the sex column
df['sex'] = df['sex'].map(lambda x: x.replace("'", ""))

# Plot histograms of actual & ideal weight (both drawn on the same figure)
plt.figure()
a = df['actual'].hist()
i = df['ideal'].hist()
plt.show()

# Plot histogram of the difference in weight
plt.figure()
d = df['diff'].hist()
plt.show()

# Encode sex as integer category codes so it can serve as the class label
df['sex'] = pd.Categorical(df['sex']).codes

# Summary statistics per sex; the counts show whether there are more
# females than males in the data
print(df.groupby('sex').describe())

# Split into training & testing data: first 70% of the rows train the
# model, the remaining rows are held out (rows are taken in file order,
# no shuffling)
train_set = int(len(df) * 0.7)
train = df[:train_set]
test = df[train_set:]

# Label and features for fitting. Features are every column from
# position 2 onwards, selected positionally with .iloc because
# DataFrame.ix has been removed from pandas.
# NOTE(review): presumably these columns are actual, ideal and diff (the
# sample predictions below pass three values) - confirm against the csv.
train_target = train['sex']
train_data = train.iloc[:, 2:]

# Classifier
clf = GaussianNB()
# Fit the model on the training rows
clf.fit(train_data, train_target)

# Predict on the held-out rows: 0 = female, 1 = male
test_target = test['sex']
test_data = test.iloc[:, 2:]
pred = clf.predict(test_data)

# predict() expects a 2-D array of shape (n_samples, n_features), so each
# single sample is wrapped in an outer list.
# Make first prediction
print(clf.predict([[145, 160, -15]]))
# Make second prediction
print(clf.predict([[160, 145, 15]]))
	import pandas as pd
	import matplotlib.pyplot as plt
	from sklearn.naive_bayes import GaussianNB

	# NOTE(review): this tab-indented section repeats the top-level script
	# earlier in the file statement for statement; at module level the
	# leading indent is not valid Python - confirm whether it belongs inside
	# a function body or is a leftover copy that should be removed.
	#
	# Reading the data straight from GitHub kept failing with
	# 'pandas.io.common.CParserError: Error tokenizing data. C error: Expected 1 fields in line 104, saw 3',
	# so the data was copied into a csv file and is read from the local copy.
	df = pd.read_csv('ideal_weight.csv')

	# Strip the single quotes that wrap the column names in the raw file
	df.rename(columns=lambda x: x.replace("'", ""), inplace=True)

	# Strip the single quotes that wrap the values of the sex column
	df['sex'] = df['sex'].map(lambda x: x.replace("'", ""))

	# Plot histograms of actual & ideal weight (both drawn on the same figure)
	plt.figure()
	a = df['actual'].hist()
	i = df['ideal'].hist()
	plt.show()

	# Plot histogram of the difference in weight
	plt.figure()
	d = df['diff'].hist()
	plt.show()

	# Encode sex as integer category codes so it can serve as the class label
	df['sex'] = pd.Categorical(df['sex']).codes

	# Summary statistics per sex; the counts show whether there are more
	# females than males in the data
	print(df.groupby('sex').describe())

	# Split into training & testing data: first 70% of the rows train the
	# model, the remaining rows are held out (rows are taken in file order,
	# no shuffling)
	train_set = int(len(df) * 0.7)
	train = df[:train_set]
	test = df[train_set:]

	# Label and features for fitting. Features are every column from
	# position 2 onwards, selected positionally with .iloc because
	# DataFrame.ix has been removed from pandas.
	# NOTE(review): presumably these columns are actual, ideal and diff (the
	# sample predictions below pass three values) - confirm against the csv.
	train_target = train['sex']
	train_data = train.iloc[:, 2:]

	# Classifier
	clf = GaussianNB()
	# Fit the model on the training rows
	clf.fit(train_data, train_target)

	# Predict on the held-out rows: 0 = female, 1 = male
	test_target = test['sex']
	test_data = test.iloc[:, 2:]
	pred = clf.predict(test_data)

	# predict() expects a 2-D array of shape (n_samples, n_features), so each
	# single sample is wrapped in an outer list.
	# Make first prediction
	print(clf.predict([[145, 160, -15]]))
	# Make second prediction
	print(clf.predict([[160, 145, 15]]))