Skip to content

Instantly share code, notes, and snippets.

View bbstats's full-sized avatar

Nathan Walker bbstats

View GitHub Profile
import numpy as np
from numba import njit, prange
from sklearn.base import BaseEstimator
from sklearn.metrics import log_loss, mean_squared_error
import time
import warnings
warnings.filterwarnings('ignore')
# --- OPTIMIZED KERNELS (NO LOGIC CHANGES) ---
import numpy as np
from sklearn.linear_model import RidgeCV
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.model_selection import check_cv
from sklearn.utils.validation import check_X_y, check_array
class RidgeCVPlus(BaseEstimator, RegressorMixin):
def __init__(self, alphas=(0.1, 1.0, 10.0), fit_intercept=True, scoring=None,
cv=None, gcv_mode=None, store_cv_values=False, alpha_per_target=False,
preprocessor=None):
@bbstats
bbstats / parallelizer.py
Last active October 3, 2023 14:06
A joblib Parallel wrapper that takes an iterable of (key, value) tuples and returns a dict that keeps the same keys
from typing import List, Tuple
from functools import partial
import joblib
class Parallelizer:
def __init__(self, fn, fn_args: List, fn_kwargs: dict = None, parallel_kwargs: dict = None):
self.fn = fn
# this fn must take as its final positional argument a (k,v) tuple
# and return a (k, result) tuple
@bbstats
bbstats / quantile_regressor_cv.py
Created June 14, 2023 19:25
subclassed QuantileRegressor with auto CV added (by GPT-4)
import numpy as np
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.linear_model import QuantileRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_pinball_loss
class QuantileRegressorCV(BaseEstimator, RegressorMixin):
def __init__(self, alphas, quantile=0.5, n_splits=5, random_state=None, solver='highs', max_iter=1000, tol=1e-4):
self.alphas = alphas
@bbstats
bbstats / basketball_reference_toy_data.csv
Created August 23, 2022 13:49
basketball reference toy data
We can't make this file beautiful and searchable because it's too large.
Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,ORtg,DRtg,adj_fta/tsa,2pa/tsa,3pa/tsa,starter,MPG
1,Mahmoud Abdul-Rauf,PG,31,VAN,41,0,486,12.9,26.5,0.488,0.4,1.5,0.286,12.5,25.0,0.5,2.4,3.1,0.759,0.5,2.2,2.7,8.2,1.0,0.1,2.8,5.4,28.7,104.0,111.0,0.04895205282802182,0.8972150445018662,0.05383290267011197,0,11.853658536585366
2,Tariq Abdul-Wahad,SG,26,DEN,29,12,420,5.3,13.7,0.387,0.5,1.2,0.4,4.8,12.5,0.386,2.6,4.4,0.583,1.7,5.6,7.3,2.7,1.7,1.6,4.2,6.7,13.7,79.0,106.0,0.1238168329496035,0.7994371962138654,0.07674597083653108,0,14.482758620689655
3,Shareef Abdur-Rahim,SF,24,VAN,81,81,3241,9.8,20.7,0.472,0.2,1.0,0.188,9.6,19.6,0.487,7.2,8.6,0.834,2.8,9.0,11.9,4.0,1.5,1.2,3.7,3.8,26.9,106.0,104.0,0.1545499101454011,0.8005227903937266,0.04084299950988401,1,40.01234567901235
4,Cory Alexander,PG,27,ORL,26,0,227,4.1,12.7,0.321,0.9,3.6,0.25,3.2,9.1,0.35,2.7,4.1,0.667,0.0,5.7,5.7,8.2,3.6,0.0,5.7,6.6,11.8,77.0,100.0,0.12437948152233866,0.6274131274131274,0.2482073
@bbstats
bbstats / weighted_chaterjee.py
Last active April 29, 2022 16:07
weighted_chaterjee.py
from scipy.stats import rankdata
import numpy as np
def weighted_chaterjee_correlation(x, y, sample_weight=None):
"""x and y must be 1d, probably"""
n = x.size
rk_x = rankdata(x, method="average")
rk_y = rankdata(y, method="average")
@bbstats
bbstats / lr_dt.py
Created March 8, 2022 06:16
linear decision trees (to be used as a base estimator)
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
class LinearDecisionTreeRegressor(RegressorMixin, BaseEstimator):
def __init__(self, lr_params=None, dt_params=None):
self.lr_params = {} if lr_params is None else lr_params
self.dt_params = {} if dt_params is None else dt_params
self.lr = LinearRegression(**self.lr_params)
@bbstats
bbstats / modeldata.py
Created January 5, 2022 18:40
The ModelData class makes organizing tabular data for data science work significantly easier and more robust
from dataclasses import dataclass
from typing import List
import pandas as pd
@dataclass
class ModelData:
dataframe: pd.DataFrame
features: List
target_col: str
@bbstats
bbstats / hebo_patch.py
Created January 4, 2022 21:25
Simple patch of HEBO to support sample weights (via fit_params argument)
# requirements to pip install:
#
# numpy
# pandas
# sklearn
# HEBO
#
#
import numpy as np
import pandas as pd
@bbstats
bbstats / out_of_bag_search.py
Last active August 14, 2021 18:03
Hyperparameter optimization (HPO) using out-of-bag scores rather than cross-validation
from sklearn.base import clone
from sklearn.model_selection._search import ParameterSampler
def py_argmax(array):
    """Return the position of the largest item in ``array``.

    The largest item is found with ``max`` and then located with the
    sequence's own ``index`` method, so when the maximum occurs more than
    once the first matching position is returned. ``array`` must therefore
    provide an ``index`` method (as lists and tuples do); ``max`` raises
    ``ValueError`` when ``array`` is empty.
    """
    largest = max(array)
    position = array.index(largest)
    return position
class RandomizedSearchOOB:
def __init__(self, estimator, param_distributions, n_iter=10, n_jobs=None, refit=True, random_state=None):
"""RandomizedSearchCV replacement for any sklearn estimator which has out of bag scores:
RandomForestRegressor, RandomForestClassifier, BaggingRegressor, BaggingClassifier"""