Skip to content

Instantly share code, notes, and snippets.

@ShrashtiSinghal
ShrashtiSinghal / Survey.csv
Last active August 30, 2020 10:27
Best groups
We can make this file beautiful and searchable if this error is corrected: It looks like row 6 should actually have 6 columns, instead of 5. in line 5.
S.No., Group Name, Group Link, Number of Members in K*, Social Media, Average Rating
3, Data Science - R & Python, https://www.facebook.com/groups/AnalyticsEdge/, 217, Facebook, 88.7
5, Data Science, https://www.facebook.com/groups/DataScienceGroup/, 137, Facebook, 88.6
15, Data Science with Python, https://www.facebook.com/groups/1006538092836222/, 56.5, Facebook, 84.5
6, Machine Learning and Data Science, https://www.linkedin.com/groups/4298680/, 128, Linkedin, 68.34
13, Python Data Science and Machine Learning, https://www.linkedin.com/groups/4388870/, 63, L
#Cross Validation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
import seaborn as sns
from sklearn.linear_model import LinearRegression, Lasso,ElasticNet, Ridge, MultiTaskLasso, LassoLars, OrthogonalMatchingPursuit
from sklearn.model_selection import train_test_split
# 7.c. Identify Skewed Data
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm
class predit:
def bestFitLine(self):
size=np.array([1491,1526,1533,1680,1680,1869,1890,1920,1936,1950,1953,2016,2117,3072,3182,3196]).reshape(-1,1)
#7.b. Identify Outliers in a dataset
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
class predit:
def bestFitLine(self):
size=np.array([1300,1491,1526,1533,1680,1680,1869,1890,1920,1936,1950,1953,2016,2117,3072,3182,3196,3842,5925,7879,9000])
#Regularization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Lasso,ElasticNet, Ridge, MultiTaskLasso, LassoLars, OrthogonalMatchingPursuit
from sklearn.model_selection import train_test_split
from sklearn import metrics
#Test Train Split
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
class predit:
def bestFitLine(self):
@ShrashtiSinghal
ShrashtiSinghal / 5.py
Last active August 8, 2020 11:45
Medium article 1
import numpy as np
from sklearn.linear_model import LinearRegression
class predit:
def bestFitLine(self,data):
size=np.array([500,650,700,780,900,1100,1150,2000,2200,2500]).reshape(-1,1)
price=np.array([1000,1500,1600,1770,2200,3000,3500,4400,4600,6000]).reshape(-1,1)
regressionLine=LinearRegression().fit(size,price)
pred=regressionLine.predict(size)
# Log transformation to reduce skewness in the target and in skewed features.
# np.log1p computes log(1 + v), so zero values map to 0 instead of -inf.
# NOTE(review): x is expected to be a pandas DataFrame and y a numeric
# Series/array defined earlier in the script - confirm against the caller.
y = np.log1p(y)
for col in x.columns:
    # Only transform columns whose absolute skewness exceeds the 0.3 cut-off;
    # roughly symmetric columns are left untouched.
    if np.abs(x[col].skew()) > 0.3:
        x[col] = np.log1p(x[col])
# Scale all the attributes except RAD, CHAS, ZN with a min-max scaler
# (MinMaxScaler's default output range is [0, 1]).
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
# Columns to rescale; RAD, CHAS and ZN are intentionally absent from this list.
header_new = ['CRIM', 'INDUS', 'NOX', 'RM', 'AGE', 'DIS', 'TAX', 'PTRATIO', 'B', 'LSTAT']
x = data.loc[:, header_new]
y = data['MEDV']
# fit_transform returns a plain array by default, so the row labels are lost.
# Re-attach the original index: without it x would get a fresh 0..n-1 index
# and stop lining up with y whenever `data` has a non-default index
# (e.g. after rows were filtered out).
x = pd.DataFrame(data=min_max_scaler.fit_transform(x), columns=header_new, index=x.index)
# Find the percentage of outliers in every column using the 1.5 * IQR rule.
# A value is an outlier when it lies strictly outside the fences
# [q1 - 1.5 * IQR, q3 + 1.5 * IQR]. The comparisons are strict on purpose:
# with inclusive bounds, a column whose IQR is 0 has q1 == q3 == both fences,
# so every value equal to the quartiles would be counted and a mostly-constant
# column would be reported as 100% outliers.
# NOTE(review): assumes `data` is a pandas DataFrame with numeric columns only.
for k, v in data.items():
    q1 = v.quantile(0.25)
    q3 = v.quantile(0.75)
    IQR = q3 - q1
    v_col = v[(v < q1 - 1.5 * IQR) | (v > q3 + 1.5 * IQR)]
    # Share of this column's flagged rows relative to the total row count.
    percentage = np.shape(v_col)[0] * 100.0 / np.shape(data)[0]
    print("Column %s outliers = %.2f%%" % (k, percentage))