Asad Mumtaz finlytics-hub

## LR_stochastic_GD.py
'''
NORMAL EQUATION
'''
# Synthetic regression data: 500 single-feature observations drawn uniformly and scaled by 5
X = np.random.rand(500, 1) * 5
# Target follows y = 10 + 2x, with standard-normal noise added to every observation
y = (2 * X + 10) + np.random.randn(*X.shape)
# Design matrix: a leading column of ones (the bias term X0) placed next to the feature column
X_2d = np.hstack((np.ones_like(X), X))
# calculate theta that minimizes MSE through Normal Equation

## LR_batch_GD.py
'''
NORMAL EQUATION
'''
# Independent variable: a (500, 1) column of uniform random draws, scaled by 5
X = np.random.rand(500, 1) * 5
# Dependent variable: linear in X (slope 2, intercept 10) plus one standard-normal noise term per row
y = 2 * X + 10 + np.random.randn(500, 1)
# Put a constant 1 (X0) in front of every observation so the intercept is part of the design matrix
X_2d = np.column_stack((np.ones(len(X)), X))
# calculate theta that minimizes MSE through Normal Equation

## LR_norm_eq_pred.py
# Two query points, 0 and 5, deliberately chosen as the minimum and maximum possible X values:
# they are enough to plot the fitted straight line
X_new = np.array([0, 5]).reshape(-1, 1)
# Prepend the column of ones so the new points have the same layout as the training design matrix
X_new_2d = np.hstack((np.ones((2, 1)), X_new))
# Predictions are the design matrix multiplied by the fitted parameter vector
y_predict = np.dot(X_new_2d, theta_best)
y_predict

## LR_norm_eq.py
# generate some random numbers (independent variable): 500 uniform draws scaled by 5
X = 5 * np.random.rand(500, 1)
# calculate linearly related (plus some noise) target variable in the form y = 10 + 2x + noise
y = 10 + 2 * X + np.random.randn(500, 1)
# add ones to X for each observation (X0) so the intercept is estimated like any other coefficient
X_2d = np.c_[np.ones((500, 1)), X]
# calculate theta that minimizes MSE through the Normal Equation (X^T X) theta = X^T y.
# The linear system is solved directly rather than forming inv(X^T X) explicitly: theta is the
# same up to rounding, without the extra rounding error and cost of an explicit matrix inverse.
# A singular X^T X still raises numpy.linalg.LinAlgError, exactly as inv() would.
theta_best = np.linalg.solve(X_2d.T.dot(X_2d), X_2d.T.dot(y))
# bare expression: shows the fitted parameters when run in a notebook / interactive session
theta_best

## ODI_app.py
# import all the required libraries
from flask import Flask, request, url_for, render_template
from sklearn.linear_model import LogisticRegression
import pandas as pd
import pickle
import numpy as np

# instantiate a Flask app, passing this module's name to the Flask constructor
app = Flask(__name__)
# load our saved model

## ODI_model_training_validation.py
# Build the feature matrix in one expression: first discard the columns rejected during the
# Feature Selection steps, then one-hot encode the remaining categorical columns
# (drop_first=True omits the first level of each encoded column)
X = pd.get_dummies(
    final_data.drop(columns=[
        'Ground', 'Match Date', 'Result', 'Toss Won?', 'Match Month',
        'Country Total Bowling Rank', 'Country Total Batting Rank',
        'Opposition Total Bowling Rank', 'Opposition Total Batting Rank',
        'Country Average Bowling Rank', 'Country Average Batting Rank',
        'Opposition Average Bowling Rank', 'Opposition Average Batting Rank',
        'Country Median Bowling Rank', 'Country Median Batting Rank',
        'Opposition Median Bowling Rank', 'Opposition Median Batting Rank',
    ]),
    columns=['Country', 'Opposition', 'Home/Away'],
    drop_first=True,
)
# Binary target: a 'Won' result is encoded as 1 and a 'Lost' result as 0
y = final_data['Result'].replace({'Won': 1, 'Lost': 0})
# Logistic Regression classifier; class weights are set to 'balanced' as a precaution and the
# iteration cap is raised to 10000
reg = LogisticRegression(class_weight='balanced', max_iter=10000)
# define the cros

## ODI_feature_selection.py
# the player names are not useful for our feature selecting and model training steps, so every
# column from 'Country_Player_1' through 'Opposition_Player_12' is dropped in place
# NOTE(review): the original comment mentions creating a backup of the DF first; no backup is made in the lines shown here - confirm it is taken earlier
final_data.drop(columns = final_data.loc[:, 'Country_Player_1': 'Opposition_Player_12'].columns, inplace = True)


# Categorical Feature Selection
# features are all columns except the target; 'columns' already identifies the axis, so the redundant 'axis = 1' is omitted
X = final_data.drop(columns = ['Result'])
# target column
y = final_data['Result']

# define an empty dictionary to store chi-test results
chi2_check = {}

## ODI_feature_engineering.py
# create a new Match Month column holding the month taken from each Match Date via the .dt accessor
final_data['Match Month'] = final_data['Match Date'].dt.month


## win/loss ratio
# instantiate an empty list; presumably filled by the per-country loop that follows - TODO confirm, the loop body is not visible here
win_loss_ratio = []
# loop over the final_data DF for each playing Country in it
for c in final_data['Country'].unique():
    # slice all matches with 'c' country in the 'Country' column

## ODI_data_munging_2.py
# Reshape ODI_matches from wide to long format: the six match-level columns are kept as
# identifiers and every other column is unpivoted into a single 'Players' value column
melted_data = (
    ODI_matches
    .melt(id_vars=['Country', 'Opposition', 'Home/Away', 'Ground', 'Match Date', 'Result'], value_name='Players')
    # order the rows by match date, then by country
    .sort_values(by=['Match Date', 'Country'])
    # renumber the rows from zero and discard the previous index
    .reset_index(drop=True)
    # the 'variable' column produced by the melt is not kept
    .drop(columns=['variable'])
)

# WEB SCRAPING FOR RANKINGS
# instantiate an empty list to store the date-wise DFs
df_list = []

for d in ODI_matches['Match Date'].unique():
    # convert each unique date to a timestamp
    date = pd.Timestamp(d)

## ODI_data_munging_1.py
# ODI_results DF
# create a copy of the DF so the cleaning steps below do not modify ODI_results itself
ODI_results_clean = ODI_results.copy()
# split Match column into two separate columns (values are assumed to look like 'A v B' - TODO confirm).
# The separator is a standalone 'v' surrounded by whitespace and at most one split is made:
# splitting on every bare 'v' would also cut team names that contain the letter, producing more
# than two pieces that cannot be assigned to the two target columns
ODI_results_clean[['Country 1', 'Country 2']] = ODI_results_clean['Match'].str.split(r'\s+v\s+', n = 1, expand = True)
# strip out all the leading and trailing whitespaces
ODI_results_clean['Country 1'] = ODI_results_clean['Country 1'].str.strip()
ODI_results_clean['Country 2'] = ODI_results_clean['Country 2'].str.strip()
# create the Opposition column using information from the 2 newly created columns:
# when 'Country 1' is the row's own Country the opposition is 'Country 2', otherwise it is 'Country 1'
ODI_results_clean['Opposition'] = np.where(
    ODI_results_clean['Country 1'] == ODI_results_clean['Country'],
    ODI_results_clean['Country 2'],
    ODI_results_clean['Country 1'],
)
	'''
	NORMAL EQUATION
	'''
	# Synthetic regression data: 500 single-feature observations drawn uniformly and scaled by 5
	X = np.random.rand(500, 1) * 5
	# Target follows y = 10 + 2x, with standard-normal noise added to every observation
	y = 2 * X + 10 + np.random.randn(500, 1)
	# Design matrix: a leading column of ones (the bias term X0) placed next to the feature column
	X_2d = np.concatenate((np.ones((len(X), 1)), X), axis=1)
	# calculate theta that minimizes MSE through Normal Equation
	# Two query points, 0 and 5, deliberately chosen as the minimum and maximum possible X values:
	# they are enough to plot the fitted straight line
	X_new = np.array([0, 5]).reshape(-1, 1)
	# Prepend the column of ones so the new points have the same layout as the training design matrix
	X_new_2d = np.hstack((np.ones((2, 1)), X_new))
	# Predictions are the design matrix multiplied by the fitted parameter vector
	y_predict = np.dot(X_new_2d, theta_best)
	y_predict
	# import all the required libraries
	from flask import Flask, request, url_for, render_template
	from sklearn.linear_model import LogisticRegression
	import pandas as pd
	import pickle
	import numpy as np

	# instantiate a Flask app, passing this module's name to the Flask constructor
	app = Flask(__name__)
	# load our saved model
	# Build the feature matrix in one expression: first discard the columns rejected during the
	# Feature Selection steps, then one-hot encode the remaining categorical columns
	# (drop_first=True omits the first level of each encoded column)
	X = pd.get_dummies(
		final_data.drop(columns=[
			'Ground', 'Match Date', 'Result', 'Toss Won?', 'Match Month',
			'Country Total Bowling Rank', 'Country Total Batting Rank',
			'Opposition Total Bowling Rank', 'Opposition Total Batting Rank',
			'Country Average Bowling Rank', 'Country Average Batting Rank',
			'Opposition Average Bowling Rank', 'Opposition Average Batting Rank',
			'Country Median Bowling Rank', 'Country Median Batting Rank',
			'Opposition Median Bowling Rank', 'Opposition Median Batting Rank',
		]),
		columns=['Country', 'Opposition', 'Home/Away'],
		drop_first=True,
	)
	# Binary target: a 'Won' result is encoded as 1 and a 'Lost' result as 0
	y = final_data['Result'].replace({'Won': 1, 'Lost': 0})
	# Logistic Regression classifier; class weights are set to 'balanced' as a precaution and the
	# iteration cap is raised to 10000
	reg = LogisticRegression(class_weight='balanced', max_iter=10000)
	# define the cros
	# the player names are not useful for our feature selecting and model training steps, so every
	# column from 'Country_Player_1' through 'Opposition_Player_12' is dropped in place
	# NOTE(review): the original comment mentions creating a backup of the DF first; no backup is made in the lines shown here - confirm it is taken earlier
	final_data.drop(columns = final_data.loc[:, 'Country_Player_1': 'Opposition_Player_12'].columns, inplace = True)


	# Categorical Feature Selection
	# features are all columns except the target; 'columns' already identifies the axis, so the redundant 'axis = 1' is omitted
	X = final_data.drop(columns = ['Result'])
	# target column
	y = final_data['Result']

	# define an empty dictionary to store chi-test results
	chi2_check = {}
	# create a new Match Month column holding the month taken from each Match Date via the .dt accessor
	final_data['Match Month'] = final_data['Match Date'].dt.month


	## win/loss ratio
	# instantiate an empty list; presumably filled by the per-country loop that follows - TODO confirm, the loop body is not visible here
	win_loss_ratio = []
	# loop over the final_data DF for each playing Country in it
	for c in final_data['Country'].unique():
	# slice all matches with 'c' country in the 'Country' column
	# Reshape ODI_matches from wide to long format: the six match-level columns are kept as
	# identifiers and every other column is unpivoted into a single 'Players' value column
	melted_data = (
		ODI_matches
		.melt(id_vars=['Country', 'Opposition', 'Home/Away', 'Ground', 'Match Date', 'Result'], value_name='Players')
		# order the rows by match date, then by country
		.sort_values(by=['Match Date', 'Country'])
		# renumber the rows from zero and discard the previous index
		.reset_index(drop=True)
		# the 'variable' column produced by the melt is not kept
		.drop(columns=['variable'])
	)

	# WEB SCRAPING FOR RANKINGS
	# instantiate an empty list to store the date-wise DFs
	df_list = []

	for d in ODI_matches['Match Date'].unique():
	# convert each unique date to a timestamp
	date = pd.Timestamp(d)
	# ODI_results DF
	# create a copy of the DF so the cleaning steps below do not modify ODI_results itself
	ODI_results_clean = ODI_results.copy()
	# split Match column into two separate columns (values are assumed to look like 'A v B' - TODO confirm).
	# The separator is a standalone 'v' surrounded by whitespace and at most one split is made:
	# splitting on every bare 'v' would also cut team names that contain the letter, producing more
	# than two pieces that cannot be assigned to the two target columns
	ODI_results_clean[['Country 1', 'Country 2']] = ODI_results_clean['Match'].str.split(r'\s+v\s+', n = 1, expand = True)
	# strip out all the leading and trailing whitespaces
	ODI_results_clean['Country 1'] = ODI_results_clean['Country 1'].str.strip()
	ODI_results_clean['Country 2'] = ODI_results_clean['Country 2'].str.strip()
	# create the Opposition column using information from the 2 newly created columns:
	# when 'Country 1' is the row's own Country the opposition is 'Country 2', otherwise it is 'Country 1'
	ODI_results_clean['Opposition'] = np.where(
		ODI_results_clean['Country 1'] == ODI_results_clean['Country'],
		ODI_results_clean['Country 2'],
		ODI_results_clean['Country 1'],
	)