This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import math | |
import numpy as np | |
# get the overall cost of the model | |
def compute_cost(X, y, coeff): | |
''' | |
inputs: | |
* 'X': features matrix (independent variables) | |
* 'y': target values (dependent variable) | |
* 'coeff': regression coefficients |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def normalize_features(df): | |
""" | |
Normalize the features in the data set. | |
Returns the normalized values, mean and standard deviation for each feature | |
""" | |
mu = df.mean() | |
sigma = df.std() | |
if (sigma == 0).any(): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def find_r2_score(labels_test, predicted_outputs):
    """
    Compute, print and return the R^2 score of a set of predictions.

    inputs:
    * labels_test: true target values
    * predicted_outputs: values predicted by the model, aligned with 'labels_test'
    output:
    * the R^2 (coefficient of determination) score computed by scikit-learn
    """
    from sklearn.metrics import r2_score

    # r2_score takes the ground truth first and the predictions second
    score = r2_score(labels_test, predicted_outputs)
    print('the value of r2 is: ', score)
    # hand the value back so callers can use it without parsing stdout
    return score
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# returns the dependent variable (y axis) value which the model assigns to a certain independent variable (x axis) value | |
def predict_output(feature_matrix, coefficients): | |
''' | |
inputs: | |
* feature_matrix: two-dimensional array of the data points, where each column is a feature and each row a point | |
* coefficients: one-dimensional array of estimated feature coefficients | |
output: | |
* one-dimensional array of predictions | |
''' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def linear_regression_via_scikit(X_train, y_train):
    """
    Fit a scikit-learn LinearRegression model on the training data.

    inputs:
    * X_train: object exposing '.values' (e.g. a pandas Series for a single
      feature, or a DataFrame with one column per feature)
    * y_train: object exposing '.values' holding one target per row of X_train
    output:
    * the fitted LinearRegression estimator
    """
    from sklearn import linear_model

    features = X_train.values
    if features.ndim == 1:
        # a single feature arrives as a flat array; fit() needs one column per feature
        features = features.reshape((len(X_train), 1))

    # the targets are kept as a single column, so predictions come back with shape (n, 1)
    targets = y_train.values.reshape((len(X_train), 1))

    linear_reg = linear_model.LinearRegression()
    linear_reg.fit(features, targets)
    return linear_reg
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_corr_coeff(dataframe):
    """
    Plot a heatmap of the pairwise correlation coefficients between the columns
    of a dataframe and return the correlation matrix.

    inputs:
    * dataframe: dataframe holding the attributes to correlate
      (assumes every column is numeric -- np.corrcoef is applied to all of them)
    output:
    * the correlation matrix, with rows and columns ordered like dataframe.columns
    """
    import matplotlib.pyplot as plt
    import numpy as np
    import seaborn as sns

    # np.corrcoef treats each row as one variable, hence the transpose
    cm = np.corrcoef(dataframe.values.T)

    sns.set(font_scale=1.5)
    sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f',
                annot_kws={'size': 10},
                yticklabels=dataframe.columns, xticklabels=dataframe.columns)
    plt.show()

    # returned so the coefficients can be used programmatically, not only read off the plot
    return cm
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def imputeMissingValues(dataframe, desired_strategy, attributes_to_impute):
    """
    Fill the missing (NaN) values of the given columns, one column at a time.

    inputs:
    * dataframe: dataframe to impute; the listed columns are overwritten in place
    * desired_strategy: imputation strategy forwarded to scikit-learn's
      SimpleImputer (e.g. 'mean', 'median', 'most_frequent')
    * attributes_to_impute: iterable of column names to impute
    output:
    * the same dataframe object, with the listed columns imputed
    """
    import numpy as np
    # sklearn.preprocessing.Imputer was removed in scikit-learn 0.22.
    # SimpleImputer replaces it and always imputes column-wise (the former axis=0).
    from sklearn.impute import SimpleImputer

    for attr in attributes_to_impute:
        # the imputer expects a 2-D (n_samples, 1) array
        values_ = dataframe[attr].values.reshape(-1, 1)
        imp = SimpleImputer(missing_values=np.nan, strategy=desired_strategy)
        transformed_values = imp.fit_transform(values_)
        # flatten back to 1-D so the assignment targets exactly one column
        dataframe.loc[:, attr] = transformed_values.ravel()
    return dataframe
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# source: https://stackoverflow.com/questions/18016495/get-subset-of-most-frequent-dummy-variables-in-pandas | |
# func that returns a dummified DataFrame of significant dummies in a given column | |
def dum_sign(dummy_col, threshold=0.1): | |
import pandas as pd | |
import numpy as np | |
# removes the bind | |
dummy_col = dummy_col.copy() | |
# what is the ratio of a dummy in whole column |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def profile_dataframe(dataframe, output_file="df_profiling_report.html"):
    """
    Generate a pandas-profiling report for a dataframe and save it as HTML.

    inputs:
    * dataframe: dataframe to profile
    * output_file: path of the HTML report to write; defaults to
      'df_profiling_report.html'
    output:
    * None (the report is written to disk)
    """
    import pandas_profiling as pp

    profile = pp.ProfileReport(dataframe)
    # The path is passed positionally because the keyword differs between releases:
    # 'outputfile' in pandas-profiling 1.x, 'output_file' from 2.x on.
    profile.to_file(output_file)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def loadDataFromDrive(dataLink, fileName): | |
''' | |
dataLink: link obtained from the right button option 'get shareable link' in drive | |
fileName: name of the file in drive | |
''' | |
# Code to read csv file into Colaboratory: | |
get_ipython().system('pip install -U -q PyDrive') | |
from pydrive.auth import GoogleAuth | |
from pydrive.drive import GoogleDrive | |
from google.colab import auth |
OlderNewer