Survey.csv (Best groups)
S.No., Group Name, Group Link, Number of Members (in thousands), Social Media, Average Rating
3, Data Science - R & Python, https://www.facebook.com/groups/AnalyticsEdge/, 217, Facebook, 88.7
5, Data Science, https://www.facebook.com/groups/DataScienceGroup/, 137, Facebook , 88.6
15, Data Science with Python, https://www.facebook.com/groups/1006538092836222/, 56.5, Facebook, 84.5
6, Machine Learning and Data Science, https://www.linkedin.com/groups/4298680/, 128, Linkedin, 68.34
13, Python Data Science and Machine Learning, https://www.linkedin.com/groups/4388870/, 63, Linkedin,
#Cross Validation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
import seaborn as sns
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet, Ridge, MultiTaskLasso, LassoLars, OrthogonalMatchingPursuit
from sklearn.model_selection import train_test_split
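# A minimal sketch of how these imports could be used for repeated k-fold cross
# validation; the estimator choice, the 10x3 fold setup and the use of x/y from the
# scaling snippet below are assumptions, not part of the original gist.
model = LinearRegression()
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(model, x, y, scoring='neg_mean_squared_error', cv=cv)
print("Mean CV MSE: %.3f (std %.3f)" % (-scores.mean(), scores.std()))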
#Regularization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet, Ridge, MultiTaskLasso, LassoLars, OrthogonalMatchingPursuit
from sklearn.model_selection import train_test_split
from sklearn import metrics
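# A minimal sketch of how regularization shrinks coefficients: fit OLS, Ridge, Lasso
# and ElasticNet on the same data and compare coefficient magnitudes. The alpha values
# and the use of x/y from the scaling snippet below are assumptions, not part of the
# original gist.
for name, model in [('Linear', LinearRegression()),
                    ('Ridge', Ridge(alpha=1.0)),
                    ('Lasso', Lasso(alpha=0.1)),
                    ('ElasticNet', ElasticNet(alpha=0.1, l1_ratio=0.5))]:
    model.fit(x, y)
    print("%-10s sum of |coef| = %.3f" % (name, np.abs(model.coef_).sum()))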
#Test Train Split
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
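# A minimal sketch of an 80/20 train/test split evaluated with the imported metrics;
# the split ratio, random_state and the use of x/y from the scaling snippet below are
# assumptions, not part of the original gist.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
lr = LinearRegression().fit(x_train, y_train)
y_pred = lr.predict(x_test)
print("MAE :", metrics.mean_absolute_error(y_test, y_pred))
print("RMSE:", np.sqrt(metrics.mean_squared_error(y_test, y_pred)))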
class predit:
    def bestFitLine(self, x, y):
        #Log Transformation to remove skewness from the target and from skewed features
        y = np.log1p(y)
        for col in x.columns:
            if np.abs(x[col].skew()) > 0.3:
                x[col] = np.log1p(x[col])
        return x, y
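# Hypothetical usage of bestFitLine, assuming x (features) and y (MEDV) have been
# built in the scaling snippet below; reassigning the returned frames is an
# assumption about how the method is meant to be used.
x, y = predit().bestFitLine(x, y)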
#Scale all the attributes except RAD, CHAS, ZN
#data is assumed to be the Boston housing DataFrame with MEDV as the target
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
header_new = ['CRIM', 'INDUS', 'NOX', 'RM', 'AGE', 'DIS', 'TAX', 'PTRATIO', 'B', 'LSTAT']
x = data.loc[:, header_new]
y = data['MEDV']
x = pd.DataFrame(data=min_max_scaler.fit_transform(x), columns=header_new)
#Find Percentage of outliers in every column
for k, v in data.items():
    q1 = v.quantile(0.25)
    q3 = v.quantile(0.75)
    IQR = q3 - q1
    v_col = v[(v <= q1 - 1.5 * IQR) | (v >= q3 + 1.5 * IQR)]
    percentage = np.shape(v_col)[0] * 100.0 / np.shape(data)[0]
    print("Column %s outliers = %.2f%%" % (k, percentage))
#HeatMap to find the correlated features
plt.figure(figsize=(25, 10))
sns.heatmap(data.corr().abs(), annot=True)
#Visualize the scale of the data: horizontal box plots for all columns on a log axis
import seaborn as sns
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(15, 7))
ax.set_xscale("log")
sns.boxplot(data=data, orient='h', ax=ax)
plt.grid()
plt.tight_layout(pad=0.5, w_pad=0.7, h_pad=5.0)
#Visualize Box plots for outliers
fig, axs = plt.subplots(ncols=5, nrows=3, figsize=(20, 10))
index = 0
axs = axs.flatten()
for k, v in data.items():
    sns.boxplot(x=k, data=data, ax=axs[index], color="orange")
    index += 1
plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=5.0)
#Drop records where MEDV is capped at 50.0 (censored values in the Boston dataset)
data = data[~(data['MEDV'] >= 50.0)]
print(np.shape(data))