Skip to content

Instantly share code, notes, and snippets.

View AdroitAnandAI's full-sized avatar

Anand P V AdroitAnandAI

View GitHub Profile
# Box plots of crime rate ('crmrte') against each location indicator column,
# drawn side by side on a single row of axes.
categories = ['west', 'central', 'urban']
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
plt.suptitle('Box Plot of Location vs Crime Rate', fontsize=18)
for idx, category in enumerate(categories):
    # The x column must be the loop variable; `cat` is not defined anywhere.
    sns.boxplot(x=category, y='crmrte', data=crimeData,
                ax=axes[idx]).set_title(category + ' vs Crime Rate')
# Render once, after every panel has been drawn.
plt.show()
# Violin plots of crime rate ('crmrte') against each location indicator column
# listed in `categories`, drawn side by side on a single row of axes.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
plt.suptitle('Violin Plot of Location vs Crime Rate', fontsize=18)
for idx, category in enumerate(categories):
    # The x column must be the loop variable; `cat` is not defined anywhere.
    # `size=8` was dropped: violinplot has no such parameter, and the figure
    # size is already set through `figsize` above.
    sns.violinplot(x=category, y='crmrte', data=crimeData,
                   ax=axes[idx]).set_title(category + ' vs Crime Rate')
# Render once, after every panel has been drawn.
plt.show()
# A feature counts as "strong" when the absolute value of its correlation
# with the crime rate exceeds 0.4; the survivors are sorted from highest to
# lowest correlation.
crimeData_corr = crimeData.corr()['crmrte']
selected_features_list = (
    crimeData_corr
    .loc[crimeData_corr.abs() > 0.4]
    .sort_values(ascending=False)
)
# To plot data and linear regression model fit.
# One axes per selected feature, laid out three per row.
features = list(selected_features_list.index)
# Ceiling division so every feature gets an axes: round() under-allocates
# (e.g. 4 features -> 1 row, 1 feature -> 0 rows). Always keep at least one row.
fig, ax = plt.subplots(max(1, (len(features) + 2) // 3), 3, figsize=(18, 12))
for i, ax in enumerate(fig.axes):
if i < len(features):
sns.regplot(x=features[i],y='crmrte',
# Heat map of the full pairwise correlation matrix with square cells;
# the colour scale is capped at 0.8.
sns.heatmap(
    crimeData.corr(),
    vmax=0.8,
    square=True,
)
@AdroitAnandAI
AdroitAnandAI / heatmap.py
Created December 15, 2018 00:31
heatmap
# Annotated heat map of the six columns with the largest correlation to the
# crime rate ('crmrte').
# The correlations are computed here from crimeData directly: elsewhere
# `crimeData_corr` is bound to the single 'crmrte' column of the correlation
# matrix (a Series), on which the DataFrame-style nlargest(6, 'crmrte') call
# does not work.
cols = crimeData.corr()['crmrte'].nlargest(6).index
# Rows of the transposed value matrix are variables, as np.corrcoef expects.
cm = np.corrcoef(crimeData[cols].values.T)
sns.heatmap(
    cm,
    vmax=.8,
    linewidths=0.01,
    square=True,
    annot=True,
    cmap='viridis',
    linecolor="white",
    xticklabels=cols.values,
    annot_kws={'size': 12},
    yticklabels=cols.values,
)
# Data cleaning based on the EDA observations. A row is kept only when all of
# the following hold:
#   - county is not 185 (wage outlier: very high wser & prob of conviction)
#   - county is not 115 (prob of arrest > 1)
#   - probability of arrest ('prbarr') is below 1
#   - probability of conviction ('prbconv') is below 1
crimeData = crimeData[
    (crimeData['county'] != 185)
    & (crimeData['county'] != 115)
    & (crimeData['prbarr'] < 1)
    & (crimeData['prbconv'] < 1)
]
import statsmodels.api as sm
# Ordinary least squares fit of crime rate on population density alone.
y = crimeData['crmrte']
# add_constant supplies the intercept column for the model.
X = sm.add_constant(crimeData['density'])
model = sm.OLS(y, X).fit()
model.summary()
@AdroitAnandAI
AdroitAnandAI / model3.py
Created December 15, 2018 00:35
model3
# Ordinary least squares fit of crime rate on density and the urban indicator.
y = crimeData['crmrte']
# add_constant supplies the intercept column for the model.
X = sm.add_constant(crimeData[['density', 'urban']])
model = sm.OLS(y, X).fit()
model.summary()
@AdroitAnandAI
AdroitAnandAI / model4.py
Created December 15, 2018 00:36
model4
# Ordinary least squares fit of crime rate on every other column of crimeData.
y = crimeData['crmrte']
X = crimeData.drop(columns='crmrte')
# Without a constant the model would be forced through the origin, so add an
# intercept column.
X = sm.add_constant(X)
# Note the argument order: response first, then the design matrix.
model = sm.OLS(y, X).fit()
# Print out the statistics
model.summary()
@AdroitAnandAI
AdroitAnandAI / model5.py
Created December 15, 2018 00:37
model5
# Response and design matrix with 'urban' and 'county' excluded.
# 'urban' was found to worsen the model in the analysis above; intuitively
# 'county' should not contribute to the prediction, and its p-value is high.
y = crimeData['crmrte']
# Without a constant the model would be forced through the origin, so an
# intercept column is added to the remaining features.
X = sm.add_constant(
    crimeData.drop(columns=['crmrte', 'urban', 'county'])
)
# Note the difference in argument order