Anand P V AdroitAnandAI

## biv3.py
# Draw one box plot of 'crmrte' per column named in `categories`,
# side by side on a single 1x3 figure.
categories = ['west', 'central', 'urban']
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
plt.suptitle('Box Plot of Location vs Crime Rate', fontsize=18)

for idx, category in enumerate(categories):
    # The x column is the loop variable; the previous `x=cat` referenced an
    # undefined name and raised NameError on the first iteration.
    sns.boxplot(x=category, y='crmrte', data=crimeData,
                ax=axes[idx]).set_title(category + ' vs Crime Rate')
plt.show()

## biv4.py
# Draw one violin plot of 'crmrte' per column named in `categories`
# (the list is defined by the box-plot snippet), on a single 1x3 figure.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
plt.suptitle('Violin Plot of Location vs Crime Rate', fontsize=18)

for idx, category in enumerate(categories):
    # The x column is the loop variable; the previous `x=cat` referenced an
    # undefined name and raised NameError on the first iteration.
    # NOTE(review): `size=8` is forwarded unchanged -- confirm the installed
    # seaborn version still accepts it for violinplot.
    sns.violinplot(x=category, y='crmrte', data=crimeData, size=8,
                   ax=axes[idx]).set_title(category + ' vs Crime Rate')
plt.show()

## fit.py
# An absolute correlation above 0.4 with 'crmrte' is taken as the threshold
# for a strong feature. 'crmrte' itself (correlation 1.0) passes the filter.
crimeData_corr = crimeData.corr()['crmrte']
selected_features_list = crimeData_corr[abs(crimeData_corr) > 0.4].sort_values(ascending=False)

# Plot each selected feature against crime rate with a fitted regression line.
features = list(selected_features_list.index)
# Ceiling division: round(n / 3) yields one row too few whenever n % 3 == 1,
# which silently dropped the last feature from the grid.
n_rows = max(1, (len(features) + 2) // 3)
fig, ax = plt.subplots(n_rows, 3, figsize=(18, 12))
for i, ax in enumerate(fig.axes):
    # The grid may have more axes than features; leave the extras empty.
    if i < len(features):
        # The call was previously left unterminated; column names passed as
        # strings need `data`, and `ax` targets the current subplot.
        sns.regplot(x=features[i], y='crmrte', data=crimeData, ax=ax)

## feat.py
# Heat map of the pairwise correlation matrix of crimeData, drawn with
# square cells and the colour scale capped at 0.8.
sns.heatmap(
    crimeData.corr(),
    vmax=0.8,
    square=True,
)

## heatmap.py
# Annotated correlation heat map of the six columns most correlated with
# 'crmrte'. The ranking is done on the full correlation matrix computed here:
# `crimeData_corr` is assigned the single 'crmrte' column earlier in this
# file, and a Series can neither be ranked by a column name nor indexed to
# produce another Series.
cols = crimeData.corr().nlargest(6, 'crmrte')['crmrte'].index
# np.corrcoef expects variables in rows, hence the transpose.
cm = np.corrcoef(crimeData[cols].values.T)
sns.heatmap(cm, vmax=.8, linewidths=0.01, square=True, annot=True, cmap='viridis',
            linecolor="white", xticklabels=cols.values,
            annot_kws={'size': 12}, yticklabels=cols.values)

## clean.py
# Data cleaning driven by the EDA findings.
#
# Rows are kept only when all of the following hold:
#   * county is not 185 (EDA: very high wser & probability of conviction)
#   * county is not 115 (EDA: probability of arrest > 1)
#   * 'prbarr' and 'prbconv' are both strictly below 1
crimeData = crimeData[
    (crimeData.county != 185)
    & (crimeData.county != 115)
    & (crimeData['prbarr'] < 1)
    & (crimeData['prbconv'] < 1)
]

## model.py
import statsmodels.api as sm

# Baseline model: ordinary least squares of crime rate on density only.
y = crimeData['crmrte']
# add_constant supplies the intercept column next to 'density'.
X = sm.add_constant(crimeData['density'])
model = sm.OLS(endog=y, exog=X).fit()
model.summary()

## model3.py
# Two-predictor model: ordinary least squares of crime rate on
# 'density' and 'urban'.
y = crimeData['crmrte']
# add_constant supplies the intercept column next to the two predictors.
X = sm.add_constant(crimeData[['density', 'urban']])
model = sm.OLS(endog=y, exog=X).fit()
model.summary()

## model4.py
# Full model: regress crime rate on every other column of crimeData.
y = crimeData['crmrte']
X = crimeData.drop('crmrte', axis=1)

# Without a constant column the fit is forced through the origin,
# so add one to estimate an intercept.
X = sm.add_constant(X)

# statsmodels takes the response first: OLS(endog, exog).
model = sm.OLS(y, X).fit()

# Print out the statistics (the statement was missing after this comment).
print(model.summary())

## model5.py
# Reduced model: regress crime rate on all columns except 'urban' and 'county'.
y = crimeData['crmrte']

# Feature 'urban' was found to worsen the model in the analysis above.
# Intuitively, 'county' should not contribute to the prediction, and its
# p-value is also high.
X = crimeData.drop(['crmrte', 'urban', 'county'], axis=1)

# Without a constant column the fit is forced through the origin,
# so add one to estimate an intercept.
X = sm.add_constant(X)

# statsmodels takes the response first: OLS(endog, exog).
# The fit and the summary were missing here; they mirror the full model.
model = sm.OLS(y, X).fit()
print(model.summary())
	# Draw one box plot of 'crmrte' per column named in `categories`,
	# side by side on a single 1x3 figure.
	categories = ['west', 'central', 'urban']
	fig, axes = plt.subplots(1, 3, figsize=(15, 5))
	plt.suptitle('Box Plot of Location vs Crime Rate', fontsize=18)

	for idx, category in enumerate(categories):
		# The loop body is now indented, and the x column is the loop
		# variable; the previous `x=cat` referenced an undefined name.
		sns.boxplot(x=category, y='crmrte', data=crimeData,
			ax=axes[idx]).set_title(category + ' vs Crime Rate')
	plt.show()
	# Draw one violin plot of 'crmrte' per column named in `categories`,
	# on a single 1x3 figure.
	fig, axes = plt.subplots(1, 3, figsize=(15, 5))
	plt.suptitle('Violin Plot of Location vs Crime Rate', fontsize=18)

	for idx, category in enumerate(categories):
		# The loop body is now indented, and the x column is the loop
		# variable; the previous `x=cat` referenced an undefined name.
		# NOTE(review): `size=8` is forwarded unchanged -- confirm the
		# installed seaborn version still accepts it for violinplot.
		sns.violinplot(x=category, y='crmrte', data=crimeData, size=8,
			ax=axes[idx]).set_title(category + ' vs Crime Rate')
	plt.show()
	# An absolute correlation above 0.4 with 'crmrte' is taken as the
	# threshold for a strong feature. 'crmrte' itself (correlation 1.0)
	# passes the filter.
	crimeData_corr = crimeData.corr()['crmrte']
	selected_features_list = crimeData_corr[abs(crimeData_corr) > 0.4].sort_values(ascending=False)

	# Plot each selected feature against crime rate with a fitted regression line.
	features = list(selected_features_list.index)
	# Ceiling division: round(n / 3) yields one row too few whenever
	# n % 3 == 1, which silently dropped the last feature from the grid.
	n_rows = max(1, (len(features) + 2) // 3)
	fig, ax = plt.subplots(n_rows, 3, figsize=(18, 12))
	for i, ax in enumerate(fig.axes):
		# The grid may have more axes than features; leave the extras empty.
		if i < len(features):
			# The call was previously left unterminated; column names passed
			# as strings need `data`, and `ax` targets the current subplot.
			sns.regplot(x=features[i], y='crmrte', data=crimeData, ax=ax)
	# Annotated correlation heat map of the six columns most correlated with
	# 'crmrte'. The ranking is done on the full correlation matrix computed
	# here: `crimeData_corr` is assigned the single 'crmrte' column earlier
	# in this file, and a Series cannot be ranked by a column name.
	cols = crimeData.corr().nlargest(6, 'crmrte')['crmrte'].index
	# np.corrcoef expects variables in rows, hence the transpose.
	cm = np.corrcoef(crimeData[cols].values.T)
	sns.heatmap(cm, vmax=.8, linewidths=0.01, square=True, annot=True, cmap='viridis',
		linecolor="white", xticklabels=cols.values,
		annot_kws={'size': 12}, yticklabels=cols.values)
	# Data cleaning driven by the EDA findings.
	#
	# Rows are kept only when all of the following hold:
	#   * county is not 185 (EDA: very high wser & probability of conviction)
	#   * county is not 115 (EDA: probability of arrest > 1)
	#   * 'prbarr' and 'prbconv' are both strictly below 1
	crimeData = crimeData[
		(crimeData.county != 185)
		& (crimeData.county != 115)
		& (crimeData['prbarr'] < 1)
		& (crimeData['prbconv'] < 1)
	]
	import statsmodels.api as sm

	# Baseline model: ordinary least squares of crime rate on density only.
	y = crimeData['crmrte']
	# add_constant supplies the intercept column next to 'density'.
	X = sm.add_constant(crimeData['density'])
	model = sm.OLS(endog=y, exog=X).fit()
	model.summary()
	# Two-predictor model: ordinary least squares of crime rate on
	# 'density' and 'urban'.
	y = crimeData['crmrte']
	# add_constant supplies the intercept column next to the two predictors.
	X = sm.add_constant(crimeData[['density', 'urban']])
	model = sm.OLS(endog=y, exog=X).fit()
	model.summary()
	# Full model: regress crime rate on every other column of crimeData.
	y = crimeData['crmrte']
	X = crimeData.drop('crmrte', axis=1)

	# Without a constant column the fit is forced through the origin,
	# so add one to estimate an intercept.
	X = sm.add_constant(X)

	# statsmodels takes the response first: OLS(endog, exog).
	model = sm.OLS(y, X).fit()

	# Print out the statistics (the statement was missing after this comment).
	print(model.summary())
	# Reduced model: regress crime rate on all columns except 'urban' and 'county'.
	y = crimeData['crmrte']

	# Feature 'urban' was found to worsen the model in the analysis above.
	# Intuitively, 'county' should not contribute to the prediction, and its
	# p-value is also high.
	X = crimeData.drop(['crmrte', 'urban', 'county'], axis=1)

	# Without a constant column the fit is forced through the origin,
	# so add one to estimate an intercept.
	X = sm.add_constant(X)

	# statsmodels takes the response first: OLS(endog, exog).
	# The fit and the summary were missing here; they mirror the full model.
	model = sm.OLS(y, X).fit()
	print(model.summary())