thistleknot's GitHub Gists
Turning out data tricks since 2006!
thistleknot / cluster.R
Last active May 8, 2021 16:10
Sorted linked list using factoextra's dist
library("factoextra",lib.loc = "/mnt/distvol/R-4.0.5/library")
library("FactoMineR",lib.loc = "/mnt/distvol/R-4.0.5/library")
library("reshape2",lib.loc = "/mnt/distvol/R-4.0.5/library")
library("data.table",lib.loc = "/mnt/distvol/R-4.0.5/library")
data <- read.csv("/mnt/distvol/pca_dist_scaled.csv", row.names=1)
data2 <- read.csv("/mnt/distvol/states.csv", row.names=1)
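The R snippet above feeds a precomputed distance matrix into a sorted pair list. The same idea is easy to sketch in Python, with scipy's pdist standing in for factoextra's dist and synthetic data standing in for the CSVs (a rough sketch, not the gist's actual pipeline):

import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist, squareform

rng = np.random.default_rng(0)
states = pd.DataFrame(rng.normal(size=(10, 4)),
                      index=[f'state_{i}' for i in range(10)])

# square distance matrix, then keep the upper triangle and sort the pairs
dist = pd.DataFrame(squareform(pdist(states)),
                    index=states.index, columns=states.index)
pairs = (dist.where(np.triu(np.ones(dist.shape, dtype=bool), k=1))
             .stack()
             .sort_values()
             .rename('distance'))
print(pairs.head())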
thistleknot / box-cox-transform.py
Last active May 8, 2021 23:31
Box-Cox transforms
from scipy import stats

#power = PowerTransformer(method='box-cox')
def testNormal(x):
    # D'Agostino-Pearson test; null hypothesis: x comes from a normal distribution
    k2, p = stats.normaltest(x)
    alpha = .001
    if p < alpha:
        return False  # reject the null: x does not look normal
    return True
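A minimal usage sketch for the commented-out PowerTransformer line, on synthetic right-skewed data (Box-Cox requires strictly positive inputs; the lognormal sample here is an assumption):

import numpy as np
from sklearn.preprocessing import PowerTransformer

rng = np.random.default_rng(0)
skewed = rng.lognormal(size=500).reshape(-1, 1)  # strictly positive

power = PowerTransformer(method='box-cox')
transformed = power.fit_transform(skewed)

print(testNormal(skewed[:, 0]), testNormal(transformed[:, 0]))  # likely False, True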
thistleknot / KDE
Last active May 20, 2021 00:10
KDE Cumulative Distribution Function
from sklearn.neighbors import KernelDensity
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import LeaveOneOut
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def make_data(N, f=0.3, rseed=1):
    # standard-normal sample with a shifted second cluster
    rand = np.random.RandomState(rseed)
    x = rand.randn(N)
    x[int(f * N):] += 5  # restored tail of the truncated preview
    return x
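The gist title points at a KDE-based CDF; a minimal sketch of the usual pattern behind these imports, choosing the bandwidth by leave-one-out cross-validated likelihood and then integrating the fitted density (the grid ranges are assumptions):

x = make_data(100)

bandwidths = 10 ** np.linspace(-1, 1, 30)
grid = GridSearchCV(KernelDensity(kernel='gaussian'),
                    {'bandwidth': bandwidths},
                    cv=LeaveOneOut())
grid.fit(x[:, None])
kde = grid.best_estimator_

# approximate CDF: cumulative sum of the density over a fine grid
grid_x = np.linspace(x.min() - 1, x.max() + 1, 1000)
dens = np.exp(kde.score_samples(grid_x[:, None]))
cdf = np.cumsum(dens) * (grid_x[1] - grid_x[0])
plt.plot(grid_x, cdf)
plt.show()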
from ipywidgets import interactive
from IPython.display import display

# all_data (a DataFrame) and filter_ (a list of its column names) are assumed
# to be defined earlier in the notebook
def f3(Y):
    # move the selected column to the front, then summarize
    internalFilter = filter_.copy()
    internalFilter.remove(Y)
    all_data_ = pd.concat([all_data[Y], all_data[internalFilter]], axis=1)
    display(all_data_.describe())
    return all_data_

out = interactive(f3, Y=filter_)
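The widget only appears once out is rendered; in a notebook the closing line is typically:

display(out)  # dropdown over filter_; f3 reruns on each selection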
thistleknot / quantilRegressionBackwardsStep.py
Created May 29, 2021 10:55
Backwards Step Quantile Regression
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.regression.quantile_regression as srq
from sklearn.preprocessing import StandardScaler

# no ZCA until the end
# standardize the transformed frame; transformed_yj, transformed, and all_data
# are assumed to come from earlier preprocessing cells
t_ = pd.DataFrame(StandardScaler().fit_transform(transformed_yj))
t_.columns = transformed.columns
t_.index = all_data.index
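A minimal sketch of the backward-step routine the filename describes, assuming t_ also carries a response column named y (a placeholder; formula terms must be valid Python identifiers):

def backward_step_quantreg(df, target, q=0.5, alpha=0.05):
    # drop the least significant predictor until every p-value clears alpha
    predictors = [c for c in df.columns if c != target]
    while predictors:
        formula = f"{target} ~ " + " + ".join(predictors)
        res = smf.quantreg(formula, df).fit(q=q)
        pvals = res.pvalues.drop('Intercept')
        worst = pvals.idxmax()
        if pvals[worst] <= alpha:
            return res
        predictors.remove(worst)
    return None

#model = backward_step_quantreg(t_, 'y', q=0.5)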
#from scipy.stats import chi
import numpy as np

def split_sequences(sequences, n_steps_in, n_steps_out):
    # slice a sequence array into overlapping input/output windows
    X, y = list(), list()
    for i in range(len(sequences)):
        # find the end of this pattern
        end_ix = i + n_steps_in
        out_end_ix = end_ix + n_steps_out
        # check if we are beyond the dataset
        if out_end_ix > len(sequences):
            break
        # gather input and output parts of the pattern
        # (restored tail of the truncated preview)
        X.append(sequences[i:end_ix])
        y.append(sequences[end_ix:out_end_ix])
    return np.array(X), np.array(y)
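A quick shape check on a toy sequence:

seq = np.arange(10).reshape(-1, 1)            # 10 time steps, 1 feature
X, y = split_sequences(seq, n_steps_in=3, n_steps_out=2)
print(X.shape, y.shape)                       # (6, 3, 1) (6, 2, 1)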
thistleknot / highlightRow.py
Created July 9, 2021 02:11
Conditionally highlight rows
import imgkit

def highlight_greaterthan(x):
    # row-wise styler for df.style.apply(axis=1); the 12 repeats assume a
    # 12-column frame
    if abs(x['Anomaly']) >= .6:
        return ['background-color: purple'] * 12
    elif abs(x['Anomaly']) >= .55:
        return ['background-color: red'] * 12
    elif abs(x['Anomaly']) >= .5:
        return ['background-color: yellow'] * 12
    else:
        return [''] * 12  # no highlight; restored from the truncated preview
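A plausible way the function meets the imgkit import (df and the output path are assumptions; Styler.to_html needs pandas 1.3+):

styled = df.style.apply(highlight_greaterthan, axis=1)
imgkit.from_string(styled.to_html(), 'highlighted.png')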
thistleknot / pcorr_significance.py
Last active December 4, 2021 01:54
Partial Correlation significance using kfolds
#notebook:
#https://github.com/thistleknot/python-ml/blob/master/code/pcorr-significance.ipynb
import pandas as pd
import numpy as np
from scipy import stats # For in-built method to get PCC
import scipy
from sklearn.model_selection import KFold
import pingouin as pg
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

def fit_linear_reg(X, Y, train_i, test_i):
    # Fit linear regression model and return RSS and R squared values
    model_k = linear_model.LinearRegression(fit_intercept=True)
    model_k.fit(X.iloc[train_i], Y.iloc[train_i])
    # RSS on the held-out fold: test MSE times the number of test rows
    # (the preview scaled by len(Y), the full sample, which overstates it)
    RSS = mean_squared_error(Y.iloc[test_i], model_k.predict(X.iloc[test_i])) * len(test_i)
    R_squared = model_k.score(X.iloc[test_i], Y.iloc[test_i])
    return RSS, R_squared
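For the significance measurement the gist's title names, a minimal sketch that repeats pingouin's partial correlation across k-fold subsets and collects r and p-values (df and the column names are placeholders):

def pcorr_over_folds(df, x, y, covars, n_splits=5, seed=42):
    rows = []
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    for train_i, _ in kf.split(df):
        res = pg.partial_corr(data=df.iloc[train_i], x=x, y=y, covar=covars)
        rows.append(res[['r', 'p-val']])
    return pd.concat(rows, ignore_index=True)

#example: pcorr_over_folds(df, 'x1', 'x2', ['x3'])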
# ransac regression on a dataset with outliers
#https://towardsdatascience.com/pipelines-custom-transformers-in-scikit-learn-the-step-by-step-guide-with-python-code-4a7d9b068156
#https://github.com/HCGrit/MachineLearning-iamJustAStudent/blob/master/PipelineFoundation/Pipeline_Experiment.ipynb
import pandas as pd
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score, RepeatedKFold, train_test_split
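A minimal sketch tying these imports back to the opening comment, RANSAC regression on outlier-contaminated data inside a pipeline (the synthetic data is a stand-in for the gist's dataset):

import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RANSACRegressor

rng = np.random.default_rng(1)
X = rng.normal(size=(200, 1))
y = 3 * X.ravel() + rng.normal(scale=0.5, size=200)
y[:20] += 15  # inject outliers

pipe = make_pipeline(StandardScaler(), RANSACRegressor())
scores = cross_val_score(pipe, X, y, cv=KFold(n_splits=5, shuffle=True, random_state=1))
print(scores.mean())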