thistleknot's GitHub Gists
Turning out data tricks since 2006!
thistleknot / cluster.R
Last active May 8, 2021 16:10
Sorted linked list using factoextra's dist
library("factoextra",lib.loc = "/mnt/distvol/R-4.0.5/library")
library("FactoMineR",lib.loc = "/mnt/distvol/R-4.0.5/library")
library("reshape2",lib.loc = "/mnt/distvol/R-4.0.5/library")
library("data.table",lib.loc = "/mnt/distvol/R-4.0.5/library")
data <- read.csv("/mnt/distvol/pca_dist_scaled.csv", row.names=1)
data2 <- read.csv("/mnt/distvol/states.csv", row.names=1)
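The R snippet above feeds a precomputed distance matrix into a sorted pair list. The same idea is easy to sketch in Python, with scipy's pdist standing in for factoextra's dist and synthetic data standing in for the CSVs (a rough sketch, not the gist's actual pipeline):

import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist, squareform

rng = np.random.default_rng(0)
states = pd.DataFrame(rng.normal(size=(10, 4)),
                      index=[f'state_{i}' for i in range(10)])

# square distance matrix, then keep the upper triangle and sort the pairs
dist = pd.DataFrame(squareform(pdist(states)),
                    index=states.index, columns=states.index)
pairs = (dist.where(np.triu(np.ones(dist.shape, dtype=bool), k=1))
             .stack()
             .sort_values()
             .rename('distance'))
print(pairs.head())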
thistleknot / box-cox-transform.py
Last active May 8, 2021 23:31
Box-Cox transforms
from scipy import stats

#power = PowerTransformer(method='box-cox')
def testNormal(x):
    # D'Agostino-Pearson test; null hypothesis: x comes from a normal distribution
    k2, p = stats.normaltest(x)
    alpha = .001
    if p < alpha:
        return False  # reject the null: x does not look normal
    return True
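A minimal usage sketch for the commented-out PowerTransformer line, on synthetic right-skewed data (Box-Cox requires strictly positive inputs; the lognormal sample here is an assumption):

import numpy as np
from sklearn.preprocessing import PowerTransformer

rng = np.random.default_rng(0)
skewed = rng.lognormal(size=500).reshape(-1, 1)  # strictly positive

power = PowerTransformer(method='box-cox')
transformed = power.fit_transform(skewed)

print(testNormal(skewed[:, 0]), testNormal(transformed[:, 0]))  # likely False, True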
thistleknot / KDE
Last active May 20, 2021 00:10
KDE Cumulative Distribution Function
from sklearn.neighbors import KernelDensity
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import LeaveOneOut
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def make_data(N, f=0.3, rseed=1):
    # standard-normal sample with a shifted second cluster
    rand = np.random.RandomState(rseed)
    x = rand.randn(N)
    x[int(f * N):] += 5  # restored tail of the truncated preview
    return x
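The gist title points at a KDE-based CDF; a minimal sketch of the usual pattern behind these imports, choosing the bandwidth by leave-one-out cross-validated likelihood and then integrating the fitted density (the grid ranges are assumptions):

x = make_data(100)

bandwidths = 10 ** np.linspace(-1, 1, 30)
grid = GridSearchCV(KernelDensity(kernel='gaussian'),
                    {'bandwidth': bandwidths},
                    cv=LeaveOneOut())
grid.fit(x[:, None])
kde = grid.best_estimator_

# approximate CDF: cumulative sum of the density over a fine grid
grid_x = np.linspace(x.min() - 1, x.max() + 1, 1000)
dens = np.exp(kde.score_samples(grid_x[:, None]))
cdf = np.cumsum(dens) * (grid_x[1] - grid_x[0])
plt.plot(grid_x, cdf)
plt.show()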
from ipywidgets import interactive
from IPython.display import display

# all_data (a DataFrame) and filter_ (a list of its column names) are assumed
# to be defined earlier in the notebook
def f3(Y):
    # move the selected column to the front, then summarize
    internalFilter = filter_.copy()
    internalFilter.remove(Y)
    all_data_ = pd.concat([all_data[Y], all_data[internalFilter]], axis=1)
    display(all_data_.describe())
    return all_data_

out = interactive(f3, Y=filter_)
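The widget only appears once out is rendered; in a notebook the closing line is typically:

display(out)  # dropdown over filter_; f3 reruns on each selection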
thistleknot / quantilRegressionBackwardsStep.py
Created May 29, 2021 10:55
Backwards Step Quantile Regression
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.regression.quantile_regression as srq
from sklearn.preprocessing import StandardScaler

# no ZCA until the end
# standardize the transformed frame; transformed_yj, transformed, and all_data
# are assumed to come from earlier preprocessing cells
t_ = pd.DataFrame(StandardScaler().fit_transform(transformed_yj))
t_.columns = transformed.columns
t_.index = all_data.index
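A minimal sketch of the backward-step routine the filename describes, assuming t_ also carries a response column named y (a placeholder; formula terms must be valid Python identifiers):

def backward_step_quantreg(df, target, q=0.5, alpha=0.05):
    # drop the least significant predictor until every p-value clears alpha
    predictors = [c for c in df.columns if c != target]
    while predictors:
        formula = f"{target} ~ " + " + ".join(predictors)
        res = smf.quantreg(formula, df).fit(q=q)
        pvals = res.pvalues.drop('Intercept')
        worst = pvals.idxmax()
        if pvals[worst] <= alpha:
            return res
        predictors.remove(worst)
    return None

#model = backward_step_quantreg(t_, 'y', q=0.5)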
#from scipy.stats import chi
import numpy as np

def split_sequences(sequences, n_steps_in, n_steps_out):
    # slice a sequence array into overlapping input/output windows
    X, y = list(), list()
    for i in range(len(sequences)):
        # find the end of this pattern
        end_ix = i + n_steps_in
        out_end_ix = end_ix + n_steps_out
        # check if we are beyond the dataset
        if out_end_ix > len(sequences):
            break
        # gather input and output parts of the pattern
        # (restored tail of the truncated preview)
        X.append(sequences[i:end_ix])
        y.append(sequences[end_ix:out_end_ix])
    return np.array(X), np.array(y)
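A quick shape check on a toy sequence:

seq = np.arange(10).reshape(-1, 1)            # 10 time steps, 1 feature
X, y = split_sequences(seq, n_steps_in=3, n_steps_out=2)
print(X.shape, y.shape)                       # (6, 3, 1) (6, 2, 1)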
thistleknot / highlightRow.py
Created July 9, 2021 02:11
Conditionally highlight rows
import imgkit

def highlight_greaterthan(x):
    # row-wise styler for df.style.apply(axis=1); the 12 repeats assume a
    # 12-column frame
    if abs(x['Anomaly']) >= .6:
        return ['background-color: purple'] * 12
    elif abs(x['Anomaly']) >= .55:
        return ['background-color: red'] * 12
    elif abs(x['Anomaly']) >= .5:
        return ['background-color: yellow'] * 12
    else:
        return [''] * 12  # no highlight; restored from the truncated preview
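A plausible way the function meets the imgkit import (df and the output path are assumptions; Styler.to_html needs pandas 1.3+):

styled = df.style.apply(highlight_greaterthan, axis=1)
imgkit.from_string(styled.to_html(), 'highlighted.png')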
thistleknot / pcorr_significance.py
Last active December 4, 2021 01:54
Partial Correlation significance using kfolds
#notebook:
#https://github.com/thistleknot/python-ml/blob/master/code/pcorr-significance.ipynb
import pandas as pd
import numpy as np
from scipy import stats # For in-built method to get PCC
import scipy
from sklearn.model_selection import KFold
import pingouin as pg
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

def fit_linear_reg(X, Y, train_i, test_i):
    # Fit linear regression model and return RSS and R squared values
    model_k = linear_model.LinearRegression(fit_intercept=True)
    model_k.fit(X.iloc[train_i], Y.iloc[train_i])
    # RSS on the held-out fold: test MSE times the number of test rows
    # (the preview scaled by len(Y), the full sample, which overstates it)
    RSS = mean_squared_error(Y.iloc[test_i], model_k.predict(X.iloc[test_i])) * len(test_i)
    R_squared = model_k.score(X.iloc[test_i], Y.iloc[test_i])
    return RSS, R_squared
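For the significance measurement the gist's title names, a minimal sketch that repeats pingouin's partial correlation across k-fold subsets and collects r and p-values (df and the column names are placeholders):

def pcorr_over_folds(df, x, y, covars, n_splits=5, seed=42):
    rows = []
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    for train_i, _ in kf.split(df):
        res = pg.partial_corr(data=df.iloc[train_i], x=x, y=y, covar=covars)
        rows.append(res[['r', 'p-val']])
    return pd.concat(rows, ignore_index=True)

#example: pcorr_over_folds(df, 'x1', 'x2', ['x3'])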
# ransac regression on a dataset with outliers
#https://towardsdatascience.com/pipelines-custom-transformers-in-scikit-learn-the-step-by-step-guide-with-python-code-4a7d9b068156
#https://github.com/HCGrit/MachineLearning-iamJustAStudent/blob/master/PipelineFoundation/Pipeline_Experiment.ipynb
import pandas as pd
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score, RepeatedKFold, train_test_split
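A minimal sketch tying these imports back to the opening comment, RANSAC regression on outlier-contaminated data inside a pipeline (the synthetic data is a stand-in for the gist's dataset):

import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RANSACRegressor

rng = np.random.default_rng(1)
X = rng.normal(size=(200, 1))
y = 3 * X.ravel() + rng.normal(scale=0.5, size=200)
y[:20] += 15  # inject outliers

pipe = make_pipeline(StandardScaler(), RANSACRegressor())
scores = cross_val_score(pipe, X, y, cv=KFold(n_splits=5, shuffle=True, random_state=1))
print(scores.mean())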