Germán Martínez GermanCM

## gradient_descent_optimization.py
import math
import numpy as np

# get the overall cost of the model
def compute_cost(X, y, coeff):
    '''
    inputs:
      * 'X': features matrix (independent variables)
      * 'y': target values (dependent variable)
      * 'coeff': regression coefficients

## features_normalization.py
def normalize_features(df):
    """
    Normalize the features in the data set.

    Returns the normalized values, mean and standard deviation for each feature
    """
    mu = df.mean()
    sigma = df.std()

    if (sigma == 0).any():

## r2_score.py
def find_r2_score(labels_test, predicted_outputs):
    """
    Compute, print and return the R^2 score of a set of predictions.

    inputs:
        * labels_test: true target values
        * predicted_outputs: values predicted for the same points

    output:
        * the value returned by sklearn.metrics.r2_score
    """
    # imported locally so the dependency is only needed when the function is called
    from sklearn.metrics import r2_score

    r2_value = r2_score(labels_test, predicted_outputs)
    print('the value of r2 is: ', r2_value)
    # returned as well as printed, so callers can use the score programmatically
    return r2_value

## normalize_and_predict.py
# returns the dependent variable (y axis) value which the model assigns to a certain independent variable (x axis) value
def predict_output(feature_matrix, coefficients):
    '''
    inputs:
        * feature_matrix: two-dimensional array of the data points, where each column is a feature and each row is a point
        * coefficients: one-dimensional array of estimated feature coefficients

    output:
        * one-dimensional array of predictions
    '''

## scikit_linear_regression.py
def linear_regression_via_scikit(X_train, y_train):
    """
    Fit a scikit-learn LinearRegression model on the training data.

    inputs:
        * X_train: object exposing `.values` and `len()` (e.g. a pandas Series
          holding one feature, or a DataFrame with one column per feature)
        * y_train: object exposing `.values`, with one target per row of X_train

    output:
        * the fitted LinearRegression estimator
    """
    from sklearn import linear_model

    n_samples = len(X_train)
    # -1 lets numpy infer the number of feature columns: a single feature still
    # becomes an (n_samples, 1) matrix, and several features are accepted too
    features = X_train.values.reshape((n_samples, -1))
    targets = y_train.values.reshape((n_samples, 1))

    linear_reg = linear_model.LinearRegression()
    linear_reg.fit(features, targets)
    return linear_reg

## correlation_coeff_matrix.py
def get_corr_coeff(dataframe):
    """
    Show an annotated heatmap of the correlation coefficients between
    every pair of columns of the given dataframe.
    """
    import matplotlib.pyplot as plt
    import numpy as np
    import seaborn as sns

    column_names = dataframe.columns
    # np.corrcoef treats each row as one variable, hence the transpose
    correlations = np.corrcoef(dataframe[column_names].values.T)

    sns.set(font_scale=1.5)
    heatmap_options = dict(
        cbar=True, annot=True, square=True, fmt='.2f',
        annot_kws={'size': 10},
        yticklabels=column_names, xticklabels=column_names,
    )
    sns.heatmap(correlations, **heatmap_options)
    plt.show()

## attributes_imputer.py
def imputeMissingValues(dataframe, desired_strategy, attributes_to_impute):
    """
    Fill the missing values of the given columns, in place.

    The fill value is computed per column from its non-missing entries.
    This relies on pandas only: the previously used
    sklearn.preprocessing.Imputer no longer exists in current scikit-learn
    releases, which made the function fail on import.

    inputs:
        * dataframe: pandas DataFrame to modify
        * desired_strategy: 'mean', 'median' or 'most_frequent'
        * attributes_to_impute: iterable of column names to process

    output:
        * the same dataframe object, with the requested columns filled

    raises:
        * ValueError if desired_strategy is not one of the supported names
    """
    fill_value_by_strategy = {
        'mean': lambda column: column.mean(),
        'median': lambda column: column.median(),
        # mode() ignores missing values; min() picks the smallest value on ties
        # and yields NaN for a column with no values at all
        'most_frequent': lambda column: column.mode().min(),
    }
    try:
        compute_fill_value = fill_value_by_strategy[desired_strategy]
    except KeyError:
        raise ValueError(
            "unknown strategy %r, expected one of %s"
            % (desired_strategy, sorted(fill_value_by_strategy))
        ) from None

    for attr in attributes_to_impute:
        column = dataframe[attr]
        # a column with no values yields a NaN fill value, which leaves it unchanged
        dataframe[attr] = column.fillna(compute_fill_value(column))
    return dataframe

## one_hot_encoding_stratified.py
# source: https://stackoverflow.com/questions/18016495/get-subset-of-most-frequent-dummy-variables-in-pandas
# func that returns a dummified DataFrame of significant dummies in a given column
def dum_sign(dummy_col, threshold=0.1):
    import pandas as pd
    import numpy as np

    # removes the bind
    dummy_col = dummy_col.copy()

    # what is the ratio of a dummy in whole column

## profile_dataframe.py
def profile_dataframe(dataframe):
    """
    Generate a pandas_profiling report for the dataframe and write it to
    'df_profiling_report.html' in the current working directory.

    inputs:
        * dataframe: the pandas DataFrame to profile
    """
    import pandas_profiling as pp

    profile = pp.ProfileReport(dataframe)
    # The path is passed positionally because the keyword was renamed between
    # releases (`outputfile` in 1.x, `output_file` from 2.x on)
    profile.to_file("df_profiling_report.html")

## get_drive_file_to_dataframe.py
def loadDataFromDrive(dataLink, fileName):
  '''
    dataLink: link obtained from the right button option 'get shareable link' in drive
    fileName: name of the file in drive
  '''
  # Code to read csv file into Colaboratory:
  get_ipython().system('pip install -U -q PyDrive')
  from pydrive.auth import GoogleAuth
  from pydrive.drive import GoogleDrive
  from google.colab import auth
	import math
	import numpy as np

	# get the overall cost of the model
	def compute_cost(X, y, coeff):
	'''
	inputs:
	* 'X': features matrix (independent variables)
	* 'y': target values (dependent variable)
	* 'coeff': regression coefficients
	def normalize_features(df):
	"""
	Normalize the features in the data set.

	Returns the normalized values, mean and standard deviation for each feature
	"""
	mu = df.mean()
	sigma = df.std()

	if (sigma == 0).any():
	def find_r2_score(labels_test, predicted_outputs):
	    """
	    Compute, print and return the R^2 score of a set of predictions.

	    inputs:
	        * labels_test: true target values
	        * predicted_outputs: values predicted for the same points

	    output:
	        * the value returned by sklearn.metrics.r2_score
	    """
	    # imported locally so the dependency is only needed when the function is called
	    from sklearn.metrics import r2_score

	    r2_value = r2_score(labels_test, predicted_outputs)
	    print('the value of r2 is: ', r2_value)
	    # returned as well as printed, so callers can use the score programmatically
	    return r2_value
	# returns the dependent variable (y axis) value which the model assigns to a certain independent variable (x axis) value
	def predict_output(feature_matrix, coefficients):
	'''
	inputs:
	* feature_matrix: two-dimensions array of the data points, where each columns is a feature and a row a point
	* coefficients: one-dimension array of estimated feature coefficients

	output:
	* one-dimension array of predictions
	'''
	def linear_regression_via_scikit(X_train, y_train):
	    """
	    Fit a scikit-learn LinearRegression model on the training data.

	    inputs:
	        * X_train: object exposing `.values` and `len()` (e.g. a pandas Series
	          holding one feature, or a DataFrame with one column per feature)
	        * y_train: object exposing `.values`, with one target per row of X_train

	    output:
	        * the fitted LinearRegression estimator
	    """
	    from sklearn import linear_model

	    n_samples = len(X_train)
	    # -1 lets numpy infer the number of feature columns: a single feature still
	    # becomes an (n_samples, 1) matrix, and several features are accepted too
	    features = X_train.values.reshape((n_samples, -1))
	    targets = y_train.values.reshape((n_samples, 1))

	    linear_reg = linear_model.LinearRegression()
	    linear_reg.fit(features, targets)
	    return linear_reg
	def get_corr_coeff(dataframe):
	    """
	    Show an annotated heatmap of the correlation coefficients between
	    every pair of columns of the given dataframe.
	    """
	    import matplotlib.pyplot as plt
	    import numpy as np
	    import seaborn as sns

	    column_names = dataframe.columns
	    # np.corrcoef treats each row as one variable, hence the transpose
	    correlations = np.corrcoef(dataframe[column_names].values.T)

	    sns.set(font_scale=1.5)
	    heatmap_options = dict(
	        cbar=True, annot=True, square=True, fmt='.2f',
	        annot_kws={'size': 10},
	        yticklabels=column_names, xticklabels=column_names,
	    )
	    sns.heatmap(correlations, **heatmap_options)
	    plt.show()
	def imputeMissingValues(dataframe, desired_strategy, attributes_to_impute):
	    """
	    Fill the missing values of the given columns, in place.

	    The fill value is computed per column from its non-missing entries.
	    This relies on pandas only: the previously used
	    sklearn.preprocessing.Imputer no longer exists in current scikit-learn
	    releases, which made the function fail on import.

	    inputs:
	        * dataframe: pandas DataFrame to modify
	        * desired_strategy: 'mean', 'median' or 'most_frequent'
	        * attributes_to_impute: iterable of column names to process

	    output:
	        * the same dataframe object, with the requested columns filled

	    raises:
	        * ValueError if desired_strategy is not one of the supported names
	    """
	    fill_value_by_strategy = {
	        'mean': lambda column: column.mean(),
	        'median': lambda column: column.median(),
	        # mode() ignores missing values; min() picks the smallest value on ties
	        # and yields NaN for a column with no values at all
	        'most_frequent': lambda column: column.mode().min(),
	    }
	    try:
	        compute_fill_value = fill_value_by_strategy[desired_strategy]
	    except KeyError:
	        raise ValueError(
	            "unknown strategy %r, expected one of %s"
	            % (desired_strategy, sorted(fill_value_by_strategy))
	        ) from None

	    for attr in attributes_to_impute:
	        column = dataframe[attr]
	        # a column with no values yields a NaN fill value, which leaves it unchanged
	        dataframe[attr] = column.fillna(compute_fill_value(column))
	    return dataframe
	# source: https://stackoverflow.com/questions/18016495/get-subset-of-most-frequent-dummy-variables-in-pandas
	# func that returns a dummified DataFrame of significant dummies in a given column
	def dum_sign(dummy_col, threshold=0.1):
	import pandas as pd
	import numpy as np

	# removes the bind
	dummy_col = dummy_col.copy()

	# what is the ratio of a dummy in whole column
	def profile_dataframe(dataframe):
	    """
	    Generate a pandas_profiling report for the dataframe and write it to
	    'df_profiling_report.html' in the current working directory.

	    inputs:
	        * dataframe: the pandas DataFrame to profile
	    """
	    import pandas_profiling as pp

	    profile = pp.ProfileReport(dataframe)
	    # The path is passed positionally because the keyword was renamed between
	    # releases (`outputfile` in 1.x, `output_file` from 2.x on)
	    profile.to_file("df_profiling_report.html")
	def loadDataFromDrive(dataLink, fileName):
	'''
	dataLink: link obtained from the right button option 'get shareable link' in drive
	fileName: name of the file in drive
	'''
	# Code to read csv file into Colaboratory:
	get_ipython().system('pip install -U -q PyDrive')
	from pydrive.auth import GoogleAuth
	from pydrive.drive import GoogleDrive
	from google.colab import auth