Aiden V Johnson AVJdataminer

## git_url.py
def github_raw_url(url):
    """Return the raw.githubusercontent.com URL for a GitHub file-view URL.

    Only the first ``/blob/`` path segment and the first ``github.com``
    occurrence are rewritten, so later path components that merely start
    with ``blob`` (for example ``/blobs/``) are left untouched.
    Surrounding whitespace from a pasted URL is removed.

    Args:
        url: URL of a file as shown in the GitHub web UI.

    Returns:
        The URL with the ``/blob`` segment removed and the host switched to
        ``raw.githubusercontent.com``.
    """
    without_blob = url.strip().replace('/blob/', '/', 1)
    return without_blob.replace('github.com', 'raw.githubusercontent.com', 1)


if __name__ == "__main__":
    input_url = input("Enter the github url: ")
    x = github_raw_url(input_url)
    print("raw url: ", x )

## Arima_functions.py
# Make a function to find the MSE of a single ARIMA model
def evaluate_arima_model(data, arima_order):
    """Evaluate one ARIMA order on a chronological 80/20 split of ``data``.

    The first 80% of ``data`` seeds the history (``past``) and the remaining
    20% is the test segment; an ``ARIMA`` model is constructed from the
    history once per test step.

    NOTE(review): this snippet is cut off right after the model is
    constructed -- the fit/forecast step, the update of ``past`` and
    ``predictions`` and the MSE computation announced above are not visible
    here. ``ARIMA`` is not imported in the visible code; presumably it is
    the statsmodels class -- confirm.

    Args:
        data: Sliceable sequence of observations (assumed to be in time
            order -- TODO confirm).
        arima_order: Passed unchanged as ``order=`` to ``ARIMA``.
    """
    # Needs to be an integer because it is later used as an index.
    split=int(len(data) * 0.8)
    train, test = data[0:split], data[split:len(data)]
    # Copy the training values into a plain list.
    past=[x for x in train]
    # make predictions
    predictions = list()
    for i in range(len(test)):#timestep-wise comparison between test data and one-step prediction ARIMA model.
        model = ARIMA(past, order=arima_order)

## image-resize-500px.py
import PIL
from PIL import Image
import os
import sys

# Directory to scan for PNG files.
path = "/path/to/file"

# Names of the entries directly inside `path` that end in '.png'.
dirs = [entry for entry in os.listdir(path) if entry.endswith('.png')]

#pixelsize = 500;

## Train_test_date_split
def new_dt_split(date_col, X, y, input_date):
    """Split ``X`` and ``y`` into train/test parts around a cutoff date.

    Rows dated on or before ``input_date`` go to the training set; rows
    dated strictly after it go to the test set, so no row lands in both
    sets.

    Args:
        date_col: Series named ``'date'`` whose index lines up with the rows
            of ``X`` and ``y``; parsed with ``pd.to_datetime``.
        X: Feature data accepted by ``pd.DataFrame``.
        y: Target data accepted by ``pd.DataFrame``.
        input_date: Cutoff date (anything ``pd.to_datetime`` accepts).

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` of NumPy arrays. The
        ``y`` arrays are two-dimensional because ``y`` is wrapped in a
        DataFrame before splitting.
    """
    date_col = pd.to_datetime(date_col)
    cutoff = pd.to_datetime(input_date)

    def _split(values):
        # Attach the dates by index so each row can be compared to the cutoff.
        dated = pd.DataFrame(values).merge(date_col, left_index=True, right_index=True)
        rows = dated.drop(['date'], axis=1)
        # `<=` / `>` keeps the two sides disjoint at the boundary date.
        return (rows.loc[dated['date'] <= cutoff].values,
                rows.loc[dated['date'] > cutoff].values)

    X_train, X_test = _split(X)
    y_train, y_test = _split(y)
    return X_train, X_test, y_train, y_test

## stratify_train_test.py
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows as the test set, stratifying on df['Country'].
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=df['Country']
)

## date_train_test_split.py
from datetime import datetime,timedelta
def dt_splitter(date_col, X, y, test_size):
    """Split ``X`` and ``y`` chronologically by a fraction of the date span.

    The split date is placed so that the last ``test_size`` fraction of the
    time span between the earliest and latest date forms the test window.
    Rows dated on or before the split date are training rows; rows dated
    strictly after it are test rows.

    Args:
        date_col: Series named ``'date'`` whose index lines up with the rows
            of ``X`` and ``y``; parsed with ``pd.to_datetime``.
        X: Feature data accepted by ``pd.DataFrame``.
        y: Target data accepted by ``pd.DataFrame``.
        test_size: Fraction (0-1) of the date span reserved for testing.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` of NumPy arrays. The
        ``y`` arrays are two-dimensional because ``y`` is wrapped in a
        DataFrame before splitting.
    """
    date_col = pd.to_datetime(date_col)

    xw_date = pd.DataFrame(X).merge(date_col, left_index=True, right_index=True)
    first_date = xw_date['date'].min()
    last_date = xw_date['date'].max()
    # The training window covers the first (1 - test_size) share of the span.
    split_date = first_date + (last_date - first_date) * (1 - test_size)

    X_rows = xw_date.drop(['date'], axis=1)
    X_train = X_rows.loc[xw_date['date'] <= split_date].values
    X_test = X_rows.loc[xw_date['date'] > split_date].values

    yw_date = pd.DataFrame(y).merge(date_col, left_index=True, right_index=True)
    y_rows = yw_date.drop(['date'], axis=1)
    y_train = y_rows.loc[yw_date['date'] <= split_date].values
    y_test = y_rows.loc[yw_date['date'] > split_date].values

    return X_train, X_test, y_train, y_test

## new_date_func
def ext_date(df, column):
    """Rewrite ``df[column]`` in place from ``YYYY-MM-DD`` to ``YYYYMMDD`` text.

    The column is parsed with the ``%Y-%m-%d`` format and then formatted
    back to strings as ``%Y%m%d``. ``df`` is modified in place and nothing
    is returned.
    """
    parsed = pd.to_datetime(df[column], format="%Y-%m-%d")
    df[column] = parsed.dt.strftime('%Y%m%d')

## Read_files_R.R
# Read every file in the working directory whose name ends in ".txt".
# list.files() treats `pattern` as a regular expression, so the dot is
# escaped and the match is anchored to the end of the file name.
filelist <- list.files(pattern = "\\.txt$")
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
datalist <- lapply(filelist, function(x) read.table(x, header = TRUE))

#assuming the same header/columns for all files
datafr <- do.call("rbind", datalist)

## PCA_variance_plot.py
import sklearn.decomposition
# Fit a PCA with default settings on X and keep the explained-variance
# ratio reported for each component.
pca = sklearn.decomposition.PCA().fit(X)
variances = pca.explained_variance_ratio_
def select_n_components(var_ratio, goal_var: float) -> int:
    """Walk ``var_ratio`` while tracking a running total and a component count.

    NOTE(review): this snippet is truncated right after the loop header, so
    the accumulation, the comparison against ``goal_var`` and the return
    statement are not visible. Presumably it returns the number of
    components needed to reach ``goal_var`` -- confirm against the full
    source.

    Args:
        var_ratio: Iterable of explained-variance values.
        goal_var: Target amount of explained variance (not used in the
            visible part of the body).
    """
    # Running sum of explained variance seen so far.
    total_variance = 0.0
    # Count of components, starting from zero.
    n_components = 0
    # For the explained variance of each feature:
    for explained_variance in var_ratio:
        # Add the explained variance to the total

## ConfusionMatrix.py
import itertools
import numpy as np
import matplotlib.pyplot as plt

from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(cm, classes,
                          normalize=False,
	def github_raw_url(url):
	    """Return the raw.githubusercontent.com URL for a GitHub file-view URL.

	    Only the first ``/blob/`` path segment and the first ``github.com``
	    occurrence are rewritten, so later path components that merely start
	    with ``blob`` (for example ``/blobs/``) are left untouched.
	    Surrounding whitespace from a pasted URL is removed.

	    Args:
	        url: URL of a file as shown in the GitHub web UI.

	    Returns:
	        The URL with the ``/blob`` segment removed and the host switched
	        to ``raw.githubusercontent.com``.
	    """
	    without_blob = url.strip().replace('/blob/', '/', 1)
	    return without_blob.replace('github.com', 'raw.githubusercontent.com', 1)


	if __name__ == "__main__":
	    input_url = input("Enter the github url: ")
	    x = github_raw_url(input_url)
	    print("raw url: ", x )
	# Make a function to find the MSE of a single ARIMA model
	# NOTE(review): the body below has lost its indentation, and the snippet
	# is cut off right after the model is constructed; the fit/forecast step
	# and the MSE computation are not visible here. ARIMA is not imported in
	# the visible code.
	def evaluate_arima_model(data, arima_order):
	# Needs to be an integer because it is later used as an index.
	split=int(len(data) * 0.8)
	# First 80% of data is the training part, the rest is the test part.
	train, test = data[0:split], data[split:len(data)]
	# Copy the training values into a plain list.
	past=[x for x in train]
	# make predictions
	predictions = list()
	for i in range(len(test)):#timestep-wise comparison between test data and one-step prediction ARIMA model.
	model = ARIMA(past, order=arima_order)
	import PIL
	from PIL import Image
	import os
	import sys

	# Directory to scan for PNG files.
	path = "/path/to/file"

	# Names of the entries directly inside `path` that end in '.png'.
	dirs = [entry for entry in os.listdir(path) if entry.endswith('.png')]

	#pixelsize = 500;
	def new_dt_split(date_col, X, y, input_date):
	    """Split ``X`` and ``y`` into train/test parts around a cutoff date.

	    Rows dated on or before ``input_date`` go to the training set; rows
	    dated strictly after it go to the test set, so no row lands in both
	    sets.

	    Args:
	        date_col: Series named ``'date'`` whose index lines up with the
	            rows of ``X`` and ``y``; parsed with ``pd.to_datetime``.
	        X: Feature data accepted by ``pd.DataFrame``.
	        y: Target data accepted by ``pd.DataFrame``.
	        input_date: Cutoff date (anything ``pd.to_datetime`` accepts).

	    Returns:
	        Tuple ``(X_train, X_test, y_train, y_test)`` of NumPy arrays. The
	        ``y`` arrays are two-dimensional because ``y`` is wrapped in a
	        DataFrame before splitting.
	    """
	    date_col = pd.to_datetime(date_col)
	    cutoff = pd.to_datetime(input_date)

	    def _split(values):
	        # Attach the dates by index so each row can be compared to the cutoff.
	        dated = pd.DataFrame(values).merge(date_col, left_index=True, right_index=True)
	        rows = dated.drop(['date'], axis=1)
	        # `<=` / `>` keeps the two sides disjoint at the boundary date.
	        return (rows.loc[dated['date'] <= cutoff].values,
	                rows.loc[dated['date'] > cutoff].values)

	    X_train, X_test = _split(X)
	    y_train, y_test = _split(y)
	    return X_train, X_test, y_train, y_test
	from sklearn.model_selection import train_test_split
	# Hold out 25% of the rows as the test set, stratifying on df['Country'].
	X_train, X_test, y_train, y_test = train_test_split(
	    X, y, test_size=0.25, stratify=df['Country']
	)
	from datetime import datetime,timedelta
	def dt_splitter(date_col, X, y, test_size):
	    """Split ``X`` and ``y`` chronologically by a fraction of the date span.

	    The split date is placed so that the last ``test_size`` fraction of
	    the time span between the earliest and latest date forms the test
	    window. Rows dated on or before the split date are training rows;
	    rows dated strictly after it are test rows.

	    Args:
	        date_col: Series named ``'date'`` whose index lines up with the
	            rows of ``X`` and ``y``; parsed with ``pd.to_datetime``.
	        X: Feature data accepted by ``pd.DataFrame``.
	        y: Target data accepted by ``pd.DataFrame``.
	        test_size: Fraction (0-1) of the date span reserved for testing.

	    Returns:
	        Tuple ``(X_train, X_test, y_train, y_test)`` of NumPy arrays. The
	        ``y`` arrays are two-dimensional because ``y`` is wrapped in a
	        DataFrame before splitting.
	    """
	    date_col = pd.to_datetime(date_col)

	    xw_date = pd.DataFrame(X).merge(date_col, left_index=True, right_index=True)
	    first_date = xw_date['date'].min()
	    last_date = xw_date['date'].max()
	    # The training window covers the first (1 - test_size) share of the span.
	    split_date = first_date + (last_date - first_date) * (1 - test_size)

	    X_rows = xw_date.drop(['date'], axis=1)
	    X_train = X_rows.loc[xw_date['date'] <= split_date].values
	    X_test = X_rows.loc[xw_date['date'] > split_date].values

	    yw_date = pd.DataFrame(y).merge(date_col, left_index=True, right_index=True)
	    y_rows = yw_date.drop(['date'], axis=1)
	    y_train = y_rows.loc[yw_date['date'] <= split_date].values
	    y_test = y_rows.loc[yw_date['date'] > split_date].values

	    return X_train, X_test, y_train, y_test
	def ext_date(df, column):
	    """Rewrite ``df[column]`` in place from ``YYYY-MM-DD`` to ``YYYYMMDD`` text.

	    The column is parsed with the ``%Y-%m-%d`` format and then formatted
	    back to strings as ``%Y%m%d``. ``df`` is modified in place and
	    nothing is returned.
	    """
	    parsed = pd.to_datetime(df[column], format="%Y-%m-%d")
	    df[column] = parsed.dt.strftime('%Y%m%d')
	# Read every file in the working directory whose name ends in ".txt".
	# list.files() treats `pattern` as a regular expression, so the dot is
	# escaped and the match is anchored to the end of the file name.
	filelist <- list.files(pattern = "\\.txt$")
	# TRUE is spelled out because T is an ordinary variable that can be reassigned.
	datalist <- lapply(filelist, function(x) read.table(x, header = TRUE))

	#assuming the same header/columns for all files
	datafr <- do.call("rbind", datalist)
	import sklearn.decomposition
	# Fit a PCA with default settings on X and keep the explained-variance
	# ratio reported for each component.
	pca = sklearn.decomposition.PCA().fit(X)
	variances = pca.explained_variance_ratio_
	# NOTE(review): the body below has lost its indentation and the snippet is
	# truncated right after the loop header; the accumulation, the comparison
	# against goal_var and the return statement are not visible here.
	def select_n_components(var_ratio, goal_var: float) -> int:
	# Running sum of explained variance seen so far.
	total_variance = 0.0
	# Count of components, starting from zero.
	n_components = 0
	# For the explained variance of each feature:
	for explained_variance in var_ratio:
	# Add the explained variance to the total
	import itertools
	import numpy as np
	import matplotlib.pyplot as plt

	from sklearn import svm, datasets
	from sklearn.model_selection import train_test_split
	from sklearn.metrics import confusion_matrix

	def plot_confusion_matrix(cm, classes,
	normalize=False,