Skip to content

Instantly share code, notes, and snippets.

View jaradc's full-sized avatar

jaradc

View GitHub Profile
@jaradc
jaradc / entropy_calculation_in_python.py
Last active April 3, 2024 18:16
Four different ways to calculate entropy in Python
import numpy as np
from scipy.stats import entropy
from math import log, e
import pandas as pd
import timeit
def entropy1(labels, base=None):
    """Return the Shannon entropy of the label distribution in ``labels``.

    Each distinct label is counted with ``np.unique`` and the counts are
    handed to ``scipy.stats.entropy``, which normalises them to
    probabilities before computing the entropy.

    Args:
        labels: Array-like of class labels.
        base: Logarithm base forwarded to ``scipy.stats.entropy``;
            ``None`` selects the natural logarithm.

    Returns:
        The entropy of the empirical label distribution.
    """
    # Only the per-label counts matter; the unique values are discarded.
    _, counts = np.unique(labels, return_counts=True)
    return entropy(counts, base=base)
@jaradc
jaradc / sklearn_custom_scorer_example.py
Last active October 6, 2017 17:52
Create a custom scorer in sklearn
def specificity_scorer(y_true, y_pred):
    """Return the specificity (true-negative rate): TN / (TN + FP).

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.

    Returns:
        The fraction of actual negatives that were predicted negative.

    Notes:
        ``metrics`` is not imported in this excerpt; presumably it is
        ``sklearn.metrics`` -- confirm against the full file.
        The four-way unpack only succeeds when the flattened confusion
        matrix has exactly four cells, i.e. for a binary problem.
        The denominator is not guarded: with no actual negatives
        (``tn + fp == 0``) the division is 0 / 0.
    """
    # Flattened 2x2 confusion matrix, row-major: tn, fp, fn, tp.
    tn, fp, fn, tp = metrics.confusion_matrix(y_true, y_pred).ravel()
    return tn / (tn + fp)
# Wrap the metric function as a scorer object; a higher specificity is better.
specificity = metrics.make_scorer(
    specificity_scorer,
    greater_is_better=True,
)

# Example 1: hand the scorer to cross_val_score via `scoring`.
cv = cross_val_score(
    xgb,
    dt.features_.values,
    y,
    scoring=specificity,
    cv=5,
    verbose=2,
)

# use in GridSearchCV (example not included in this excerpt)
@jaradc
jaradc / binomial_hypothesis_tests.py
Last active October 11, 2017 23:00
Test if two binomial distributions are statistically different from each other
import statsmodels.api as sm
import numpy as np
import rpy2.robjects.packages as rpackages
import rpy2.robjects as robjects
# Import R's 'stats' package through rpy2 so it can be used from Python.
rstats = rpackages.importr('stats')
# NOTE(review): presumably s1 = successes and n1 = trials for the first
# sample -- confirm against the rest of the script, which is not visible here.
s1 = 1556
n1 = 2455
# Presumably the success count for the second sample; its trial count is
# not part of this excerpt.
s2 = 1671
@jaradc
jaradc / convert_correlation_matrix_to_table.py
Created November 2, 2017 19:23
Shows how to turn a correlation matrix into a table
>>> corr
Impressions Clicks CTR Avg. CPC Cost Avg. position \
Impressions 1.000000 0.599646 NaN 0.301556 0.568137 0.197353
Clicks 0.599646 1.000000 NaN 0.566357 0.987073 0.627268
CTR NaN NaN NaN NaN NaN NaN
Avg. CPC 0.301556 0.566357 NaN 1.000000 0.663789 0.809944
Cost 0.568137 0.987073 NaN 0.663789 1.000000 0.707918
Avg. position 0.197353 0.627268 NaN 0.809944 0.707918 1.000000
Conversions 0.558450 0.927165 NaN 0.717237 0.962034 0.746493
import pandas as pd
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
def test_stationarity(timeseries):
'''
timeseries has a date index and a single column of numeric values
'''
#Determine rolling statistics
@jaradc
jaradc / good_example_of_fillna_inplace_not_working.py
Created November 22, 2017 04:57
This is a good example of `inplace=True` not working as expected
# Two columns that each contain missing values.
df = pd.DataFrame(
    {
        'Label': [np.nan, np.nan, 'Label1', 'Label2'],
        'URL': [
            'https://www.website.com/where-to-buy',
            np.nan,
            np.nan,
            'https://www.website.com/store',
        ],
    },
    columns=['Label', 'URL'],
)

# Pitfall: selecting with a list of columns produces a new DataFrame, so the
# in-place fill is applied to that temporary object and df keeps its NaNs.
df[['Label', 'URL']].fillna('', inplace=True)

# To actually replace the values, assign the result back or pass a dict.
@jaradc
jaradc / random_colormap.py
Last active November 30, 2017 05:56 — forked from jgomezdans/random.py
Random colormap for matplotlib
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
# A random colormap for matplotlib: 256 entries, each a random RGB triple.
rand_cmap = ListedColormap(np.random.rand(256, 3))

for i in range(5):
    # scatter() only applies `cmap` to scalar values supplied through `c`;
    # without `c` the colormap is ignored. Give every point a random value
    # in [0, 1) and pin vmin/vmax so all five calls share the same mapping.
    plt.scatter(
        [1, 2, 3],
        np.random.randn(3),
        s=10,
        c=np.random.rand(3),
        cmap=rand_cmap,
        vmin=0,
        vmax=1,
    )
@jaradc
jaradc / curve_fitting_like_excel.py
Created December 1, 2017 21:11
Curve fitting for Polynomial, Logarithmic, and Power
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
df = pd.DataFrame({
'y': [0.996559203, 0.99161362, 0.9925214090000001, 0.986498352,
0.9826329420000001, 0.977550635, 0.9542758440000001, 0.941359915,
0.933388103, 0.929990698, 0.920058004, 0.90789857, 0.909764261,
0.8944469829999999, 0.912682288, 0.913135466, 0.913485262,
@jaradc
jaradc / make_moving_avg_plot.py
Last active December 21, 2017 09:44
This lets you create subplots of n rows and 1 column of a metric within a time series.
def ma_subplots(df, window, title=None):
fig, ax = plt.subplots(df.shape[1], 1, figsize=(12, 7))
ax = ax.ravel()
for i,col in enumerate(df):
mean = df[col].mean()
ma = df[col].rolling(window).mean()
mstd = df[col].rolling(window).std()
ax[i].plot(df.index, df[col], color='k', label=col)
ax[i].plot(ma.index, ma, 'b', label='Moving Avg.')
ax[i].fill_between(mstd.index, ma - 2*mstd, ma + 2*mstd, color='b', alpha=0.2)
@jaradc
jaradc / scale_between_two_numbers.py
Created January 5, 2018 21:36
This function scales a numpy array between two values (ex: 0.05, 1.25)
def scale_between(series, min_amt, max_amt):
    """Linearly rescale ``series`` so its values span ``[min_amt, max_amt]``.

    The smallest input value maps to ``min_amt`` and the largest to
    ``max_amt``; everything in between is interpolated linearly.

    Args:
        series: Object exposing ``.min()`` / ``.max()`` and element-wise
            arithmetic (e.g. a numpy array or pandas Series).
        min_amt: Lower bound of the target range.
        max_amt: Upper bound of the target range.

    Returns:
        A rescaled object of the same shape as ``series``. If every value
        in ``series`` is identical, all outputs equal ``min_amt``.
    """
    series_min = series.min()
    series_max = series.max()
    span = series_max - series_min

    if span == 0:
        # Constant input: the general formula would compute 0 / 0 (NaN).
        # Map every element to the lower bound instead; multiplying by 0.0
        # keeps the container type and yields a float result.
        return (series - series_min) * 0.0 + min_amt

    return ((max_amt - min_amt) * (series - series_min)) / span + min_amt