jvrsgsty/util.py

## util.py
# -*- coding: utf-8 -*-
# Javier Sagastuy
# Last revision: Feb 21, 2014

import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
from statsmodels.stats.anova import anova_lm
import statsmodels.formula.api as sm

def influenceM(fitted):
	'''
	Prints a table with a collection of influence measures for the fitted model
	given
	IN
	fitted: a statsmodels.regression.linear_model.RegressionResultsWrapper
	object
	'''
	influence = fitted.get_influence()
	# includes dffits, Hat diag, Cooks and studentized residuals
	# (internally and externally)
	print influence.summary_table()
	print "dfbetas 1       dfbetas x"
	print influence.dfbetas
	print influence.cov_ratio

def influenceG(fitted):
	'''
	Plots four residual analysis graph for the fitted model given:
	Residuals vs. fitted
	qqplot
	Scale-location
	Residuals vs. Leverage
	IN
	fitted: a statsmodels.regression.linear_model.RegressionResultsWrapper
	object
	'''
	resid = fitted.resid
	respuesta = fitted.fittedvalues
	std_resid = fitted.norm_resid()
	influence = fitted.get_influence()

	# Residuals vs. fitted
	plt.plot(respuesta, resid, 'bo')
	plt.xlabel('Respuesta ajustada')
	plt.ylabel('Residuales')
	plt.title("Analisis de residuales")
	plt.show()

	# qqplot
	stats.probplot(resid,dist = 'norm', plot=plt)
	plt.show()

	# Scale-location
	plt.plot(respuesta, abs(std_resid)**0.5, 'bo')
	plt.xlabel('Respuesta ajustada')
	plt.ylabel('Residuales Estandarizados')
	plt.title("Analisis de residuales")
	plt.show()

	# Residuals vs. leverage
	plt.plot(influence.hat_matrix_diag, std_resid, 'bo')
	plt.xlabel('Respuesta ajustada')
	plt.ylabel('Residuales Estandarizados')
	plt.title("Analisis de residuales")
	plt.show()

def anova_JSB(fitted):
	'''
	Prints the anova_lm table for the fitted model given and writes a title
	IN
	fitted: a statsmodels.regression.linear_model.RegressionResultsWrapper
	object
	'''
	print "\n                                    ANOVA"
	print "=============================================================================="
	print anova_lm(fitted)

def aicSteps(model, dat, k):
	'''
	Function that calculates k aicSteps to construct the model which contains
	the k regressors whose addition into the model minimizes the AIC
	IN
	model: a statsmodels.regression.linear_model.OLS object
	dat: the original data frame that was given to the model
	k: an integer specifying the number of aicSteps to take
	'''
	names = model.data.xnames
	names.remove('Intercept')
	inModel = []
	formula = 'y ~ 1'
	print 'Start:  ' + formula
	for i in range(k):
		(formula, inModel, names) = aicStepInt(formula, inModel, names, dat)
		print 'Step '+ str(i+1) + ': ' + formula


def aicStepInt(formula, inModel, names, dat):
	'''
	Function used internally to calculate the AIC step. Must be called initially
	with the parameters it is called in aicSteps
	IN
	formula: a string with the formula of the adjustted model
	inModel: a list containing the regressors already in the model
	names: a list containing the names which are not yet in the model
	dat: the original data frame that was given to the model
	'''
	AIC = []
	for regressor in names:
		model2 = sm.ols(formula=formula + ' + ' + regressor, data = dat)
		fitted2 = model2.fit()
		AIC.append(fitted2.aic)
	AIC = np.array(AIC)
	k = AIC.argmin()
	formula += ' + ' + names[k]
	inModel.append(names[k])
	names.remove(names[k])
	return (formula, inModel, names)
	# -- coding: utf-8 --
	# Javier Sagastuy
	# Last revision: Feb 21, 2014

	import numpy as np
	import matplotlib.pyplot as plt
	import scipy.stats as stats
	from statsmodels.stats.anova import anova_lm
	import statsmodels.formula.api as sm

	def influenceM(fitted):
	'''
	Prints a table with a collection of influence measures for the fitted model
	given
	IN
	fitted: a statsmodels.regression.linear_model.RegressionResultsWrapper
	object
	'''
	influence = fitted.get_influence()
	# includes dffits, Hat diag, Cooks and studentized residuals
	# (internally and externally)
	print influence.summary_table()
	print "dfbetas 1 dfbetas x"
	print influence.dfbetas
	print influence.cov_ratio

	def influenceG(fitted):
	'''
	Plots four residual analysis graph for the fitted model given:
	Residuals vs. fitted
	qqplot
	Scale-location
	Residuals vs. Leverage
	IN
	fitted: a statsmodels.regression.linear_model.RegressionResultsWrapper
	object
	'''
	resid = fitted.resid
	respuesta = fitted.fittedvalues
	std_resid = fitted.norm_resid()
	influence = fitted.get_influence()

	# Residuals vs. fitted
	plt.plot(respuesta, resid, 'bo')
	plt.xlabel('Respuesta ajustada')
	plt.ylabel('Residuales')
	plt.title("Analisis de residuales")
	plt.show()

	# qqplot
	stats.probplot(resid,dist = 'norm', plot=plt)
	plt.show()

	# Scale-location
	plt.plot(respuesta, abs(std_resid)**0.5, 'bo')
	plt.xlabel('Respuesta ajustada')
	plt.ylabel('Residuales Estandarizados')
	plt.title("Analisis de residuales")
	plt.show()

	# Residuals vs. leverage
	plt.plot(influence.hat_matrix_diag, std_resid, 'bo')
	plt.xlabel('Respuesta ajustada')
	plt.ylabel('Residuales Estandarizados')
	plt.title("Analisis de residuales")
	plt.show()

	def anova_JSB(fitted):
	'''
	Prints the anova_lm table for the fitted model given and writes a title
	IN
	fitted: a statsmodels.regression.linear_model.RegressionResultsWrapper
	object
	'''
	print "\n ANOVA"
	print "=============================================================================="
	print anova_lm(fitted)

	def aicSteps(model, dat, k):
	'''
	Function that calculates k aicSteps to construct the model which contains
	the k regressors whose addition into the model minimizes the AIC
	IN
	model: a statsmodels.regression.linear_model.OLS object
	dat: the original data frame that was given to the model
	k: an integer specifying the number of aicSteps to take
	'''
	names = model.data.xnames
	names.remove('Intercept')
	inModel = []
	formula = 'y ~ 1'
	print 'Start: ' + formula
	for i in range(k):
	(formula, inModel, names) = aicStepInt(formula, inModel, names, dat)
	print 'Step '+ str(i+1) + ': ' + formula


	def aicStepInt(formula, inModel, names, dat):
	'''
	Function used internally to calculate the AIC step. Must be called initially
	with the parameters it is called in aicSteps
	IN
	formula: a string with the formula of the adjustted model
	inModel: a list containing the regressors already in the model
	names: a list containing the names which are not yet in the model
	dat: the original data frame that was given to the model
	'''
	AIC = []
	for regressor in names:
	model2 = sm.ols(formula=formula + ' + ' + regressor, data = dat)
	fitted2 = model2.fit()
	AIC.append(fitted2.aic)
	AIC = np.array(AIC)
	k = AIC.argmin()
	formula += ' + ' + names[k]
	inModel.append(names[k])
	names.remove(names[k])
	return (formula, inModel, names)