# joshstrupp/NFL-win-loss-predictor-based-on-game-stats.py

## NFL-win-loss-predictor-based-on-game-stats.py

import pandas as pd
from sklearn.linear_model import LogisticRegression
# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# sklearn.cross_validation was removed in 0.20. Fall back only for old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn import metrics
from math import exp
import numpy as np
import matplotlib.pyplot as plt

# One CSV of per-game statistics for each season, 2000 through 2013.
# NOTE(review): the trailing "#W-L" notes look like a season win-loss record;
# confirm which team they refer to.
nfl2000 = pd.read_csv('nfl2000stats.csv', sep=',') #13-3
nfl2001 = pd.read_csv('nfl2001stats.csv', sep=',') #7-9
nfl2002 = pd.read_csv('nfl2002stats.csv', sep=',') #11-5
nfl2003 = pd.read_csv('nfl2003stats.csv', sep=',') #12-4
nfl2004 = pd.read_csv('nfl2004stats.csv', sep=',') #5-11
nfl2005 = pd.read_csv('nfl2005stats.csv', sep=',') #4-12
nfl2006 = pd.read_csv('nfl2006stats.csv', sep=',') #8-8
nfl2007 = pd.read_csv('nfl2007stats.csv', sep=',') #10-6
nfl2008 = pd.read_csv('nfl2008stats.csv', sep=',') #13-3
nfl2009 = pd.read_csv('nfl2009stats.csv', sep=',') #8-8
nfl2010 = pd.read_csv('nfl2010stats.csv', sep=',') #6-10
nfl2011 = pd.read_csv('nfl2011stats.csv', sep=',') #9-7
nfl2012 = pd.read_csv('nfl2012stats.csv', sep=',') #6-10
nfl2013 = pd.read_csv('nfl2013stats.csv', sep=',') #7-9


# Stack every season row-wise into a single frame. The per-file row labels are
# kept, so index labels repeat across seasons; everything below therefore
# selects rows positionally rather than by label.
nfl = pd.concat([nfl2000, nfl2001, nfl2002, nfl2003, nfl2004, nfl2005, nfl2006, nfl2007, nfl2008, nfl2009, nfl2010, nfl2011, nfl2012, nfl2013], axis=0)

# Target: 1 when ScoreOff exceeds ScoreDef, otherwise 0 (equal scores get 0).
nfl['WinLoss'] = np.where(nfl.ScoreOff > nfl.ScoreDef, 1, 0)

# Bare expression: only displays the column index in an interactive session.
nfl.columns

# The original feature list named three spellings of one column ('TotalLine',
# 'Totalline', 'Totalline '), which is what concat yields when the season files
# disagree on the header. Presumably each row is populated in only one of
# them (verify against the CSVs), so merge them into 'TotalLine' by taking the
# first non-null value per row. `.values` keeps the assignment positional.
total_line_variants = [c for c in nfl.columns if str(c).strip().lower() == 'totalline']
if total_line_variants:
    nfl['TotalLine'] = nfl[total_line_variants].bfill(axis=1).iloc[:, 0].values

# Candidate predictors. Deliberately excluded:
#   * 'WinLoss'  - it is the target itself;
#   * 'ScoreOff' / 'ScoreDef' - WinLoss is computed directly from them, so
#     either one leaks the answer into the model;
#   * 'Totalline' / 'Totalline ' - merged into 'TotalLine' above.
feature_cols = ['Date', 'FirstDownDef', 'FirstDownOff', 'FumblesDef', 'FumblesOff', 'Line', 'Opponent', 'PassAttDef', 'PassAttOff', 'PassCompDef', 'PassCompOff', 'PassIntDef', 'PassIntOff', 'PassYdsDef', 'PassYdsOff', 'PenYdsDef', 'PenYdsOff', 'PuntAvgOff', 'RushAttDef', 'RushAttOff', 'RushYdsDef', 'RushYdsOff', 'SackNumDef', 'SackNumOff', 'SackYdsDef', 'SackYdsOff', 'Site', 'TeamName', 'ThirdDownPctDef', 'ThirdDownPctOff', 'TimePossDef', 'TimePossOff', 'TotalLine']

# LogisticRegression accepts only numeric, NaN-free input. Keep the numeric
# candidates (any text columns, e.g. names or dates read as strings, are
# dropped here) and record which features actually reach the model.
X = nfl[feature_cols].select_dtypes(include=[np.number])
feature_cols = list(X.columns)

# Drop games with any missing predictor; the boolean mask is a plain array so
# X and y stay row-aligned despite the repeated index labels.
complete_rows = X.notnull().all(axis=1).values
X = X[complete_rows]
y = nfl.WinLoss[complete_rows]

# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 2)

# Fit model -- logistic regression is used because the dependent variable is
# categorical (win = 1 / not a win = 0); linear regression would treat it as a
# continuous quantity.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

	import pandas as pd
	from sklearn.linear_model import LogisticRegression
	# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
	# sklearn.cross_validation was removed in 0.20. Fall back only for old installs.
	try:
		from sklearn.model_selection import train_test_split
	except ImportError:
		from sklearn.cross_validation import train_test_split
	from sklearn import metrics
	from math import exp
	import numpy as np
	import matplotlib.pyplot as plt

	# One CSV of per-game statistics for each season, 2000 through 2013.
	# NOTE(review): the trailing "#W-L" notes look like a season win-loss record;
	# confirm which team they refer to.
	nfl2000 = pd.read_csv('nfl2000stats.csv', sep=',') #13-3
	nfl2001 = pd.read_csv('nfl2001stats.csv', sep=',') #7-9
	nfl2002 = pd.read_csv('nfl2002stats.csv', sep=',') #11-5
	nfl2003 = pd.read_csv('nfl2003stats.csv', sep=',') #12-4
	nfl2004 = pd.read_csv('nfl2004stats.csv', sep=',') #5-11
	nfl2005 = pd.read_csv('nfl2005stats.csv', sep=',') #4-12
	nfl2006 = pd.read_csv('nfl2006stats.csv', sep=',') #8-8
	nfl2007 = pd.read_csv('nfl2007stats.csv', sep=',') #10-6
	nfl2008 = pd.read_csv('nfl2008stats.csv', sep=',') #13-3
	nfl2009 = pd.read_csv('nfl2009stats.csv', sep=',') #8-8
	nfl2010 = pd.read_csv('nfl2010stats.csv', sep=',') #6-10
	nfl2011 = pd.read_csv('nfl2011stats.csv', sep=',') #9-7
	nfl2012 = pd.read_csv('nfl2012stats.csv', sep=',') #6-10
	nfl2013 = pd.read_csv('nfl2013stats.csv', sep=',') #7-9


	# Stack every season row-wise into a single frame. The per-file row labels are
	# kept, so index labels repeat across seasons; everything below therefore
	# selects rows positionally rather than by label.
	nfl = pd.concat([nfl2000, nfl2001, nfl2002, nfl2003, nfl2004, nfl2005, nfl2006, nfl2007, nfl2008, nfl2009, nfl2010, nfl2011, nfl2012, nfl2013], axis=0)

	# Target: 1 when ScoreOff exceeds ScoreDef, otherwise 0 (equal scores get 0).
	nfl['WinLoss'] = np.where(nfl.ScoreOff > nfl.ScoreDef, 1, 0)

	# Bare expression: only displays the column index in an interactive session.
	nfl.columns

	# The original feature list named three spellings of one column ('TotalLine',
	# 'Totalline', 'Totalline '), which is what concat yields when the season files
	# disagree on the header. Presumably each row is populated in only one of
	# them (verify against the CSVs), so merge them into 'TotalLine' by taking the
	# first non-null value per row. `.values` keeps the assignment positional.
	total_line_variants = [c for c in nfl.columns if str(c).strip().lower() == 'totalline']
	if total_line_variants:
		nfl['TotalLine'] = nfl[total_line_variants].bfill(axis=1).iloc[:, 0].values

	# Candidate predictors. Deliberately excluded:
	#   * 'WinLoss'  - it is the target itself;
	#   * 'ScoreOff' / 'ScoreDef' - WinLoss is computed directly from them, so
	#     either one leaks the answer into the model;
	#   * 'Totalline' / 'Totalline ' - merged into 'TotalLine' above.
	feature_cols = ['Date', 'FirstDownDef', 'FirstDownOff', 'FumblesDef', 'FumblesOff', 'Line', 'Opponent', 'PassAttDef', 'PassAttOff', 'PassCompDef', 'PassCompOff', 'PassIntDef', 'PassIntOff', 'PassYdsDef', 'PassYdsOff', 'PenYdsDef', 'PenYdsOff', 'PuntAvgOff', 'RushAttDef', 'RushAttOff', 'RushYdsDef', 'RushYdsOff', 'SackNumDef', 'SackNumOff', 'SackYdsDef', 'SackYdsOff', 'Site', 'TeamName', 'ThirdDownPctDef', 'ThirdDownPctOff', 'TimePossDef', 'TimePossOff', 'TotalLine']

	# LogisticRegression accepts only numeric, NaN-free input. Keep the numeric
	# candidates (any text columns, e.g. names or dates read as strings, are
	# dropped here) and record which features actually reach the model.
	X = nfl[feature_cols].select_dtypes(include=[np.number])
	feature_cols = list(X.columns)

	# Drop games with any missing predictor; the boolean mask is a plain array so
	# X and y stay row-aligned despite the repeated index labels.
	complete_rows = X.notnull().all(axis=1).values
	X = X[complete_rows]
	y = nfl.WinLoss[complete_rows]

	# Train test split
	X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 2)

	# Fit model -- logistic regression is used because the dependent variable is
	# categorical (win = 1 / not a win = 0); linear regression would treat it as a
	# continuous quantity.
	logreg = LogisticRegression()
	logreg.fit(X_train, y_train)