Skip to content

Instantly share code, notes, and snippets.

@dutc
Last active August 18, 2022 03:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dutc/f94619c10872a8aa6929e4e21295115d to your computer and use it in GitHub Desktop.
Save dutc/f94619c10872a8aa6929e4e21295115d to your computer and use it in GitHub Desktop.
Plotting Version Numbers
from pandas import MultiIndex, date_range, Series, Categorical, merge, get_dummies, date_range, DataFrame
from itertools import product
from numpy import arange, zeros, newaxis, tile, log2
from numpy.random import default_rng
from string import ascii_lowercase
from contextlib import contextmanager
from time import perf_counter
from matplotlib.cm import get_cmap
from matplotlib.pyplot import subplots, show
from sys import exit
import sys; sys.breakpointhook = exit
# from pandas import set_option; set_option('display.max_rows', None)
@contextmanager
def timed(msg):
    """Time the enclosed ``with`` body and print the elapsed wall-clock time.

    Args:
        msg: Label printed in front of the timing, left-aligned and padded
            to 36 characters.

    The report is printed from a ``finally`` clause, so it is emitted even
    when the body raises; the exception then propagates unchanged.
    """
    # Take the start time before entering ``try`` so the ``finally`` clause
    # can never run with ``start`` unbound.
    start = perf_counter()
    try:
        yield
    finally:
        stop = perf_counter()
        print(f'{msg:<36} \N{mathematical bold capital delta}t: {stop - start:.4f}s')
# Seeded generator: every random draw below is reproducible, so the order of
# the ``rng`` calls in this file matters.
rng = default_rng(0)

# 25,000 entity names: draw a (25_000, 8) grid of single letters, then
# reinterpret each row of eight 1-character cells as one 8-character string.
letter_grid = rng.choice(list(ascii_lowercase), size=(25_000, 8))
entities = letter_grid.view('<U8').ravel()
entities.sort()
entities = Categorical(entities)

# Two years of daily dates.
dates = date_range('2020-01-01', periods=365 * 2)

# 48 distinct "major.minor.patch" labels picked from the 2 x 20 x 20 grid of
# candidates and kept in ascending version order.
version_pool = list(product(range(4, 5 + 1), range(20), range(20)))
chosen = rng.choice(arange(len(version_pool)), size=48, replace=False)
versions = [
    '.'.join(str(part) for part in version_pool[pick])
    for pick in sorted(chosen)
]

# Four slots per date; exactly ``len(versions) - 1`` of them are flagged 1.
updates = zeros(shape=len(dates) * 4, dtype=int)
updates[:len(versions) - 1] = 1
# Build ``rv`` (one row per date, one column per version, each row holding the
# share of entities on that version) or load a previously pickled copy.
# Flip the literal to ``True`` to regenerate and overwrite the cache.
if False:
    # Dense (entity, date) index: every entity paired with every date.
    with timed('.from_product'):
        idx = MultiIndex.from_product([entities, dates], names=['entity', 'date'], sortorder=0)
    with timed('Series'):
        # Per entity: shuffle the 0/1 ``updates`` flags, take the running sum
        # (a non-decreasing version counter) and keep every 4th value so the
        # result has one entry per date.
        s = Series(index=idx, data=0).groupby('entity').transform(
            lambda g: updates[rng.choice(arange(len(updates)), size=len(updates), replace=False)].cumsum()[::4],
        )
        # Map each counter value to its version label (categorical), keeping
        # the original (entity, date) index.
        s = Series(versions, dtype='category').loc[s.values].set_axis(s.index)
    # with timed('.sort_index'):
    #     s = s.sort_index()
    # Keep only a random fraction of the observations, in index order.
    with timed('.sample'):
        ss = s.sample(frac=len(versions) / len(dates), random_state=rng).rename('sampled').sort_index()
    # The dense series is no longer needed; drop it before rebuilding.
    del idx, s
    # Rebuild a dense index spanning the sampled entities and the full date
    # range between the earliest and latest sampled dates.
    with timed('.from_product'):
        idx = MultiIndex.from_product([
            ss.index.levels[0],
            date_range(ss.index.get_level_values('date').min(), ss.index.get_level_values('date').max()),
        ], names=['entity', 'date'], sortorder=0)
    # Start from an all-missing series and write in the sampled observations.
    with timed('assign'):
        rv = Series(index=idx, data=None, dtype=ss.dtype).sort_index()
        rv.loc[ss.index] = ss.values
    # Carry each entity's last observed version forward; rows before an
    # entity's first observation stay missing and are dropped.
    with timed('.groupby.ffill'):
        rv = rv.groupby(['entity']).ffill()
        rv = rv.dropna()
        # rv = rv.groupby(['entity']).bfill()
    # One indicator column per version.
    with timed('get_dummies'):
        rv = get_dummies(rv)
    # Count entities per version per date, then divide each row by its total
    # so every date's row sums to 1.
    with timed('.groupby.sum'):
        rv = rv.groupby('date').sum().apply(
            lambda s: s / s.sum(),
            axis='columns'
        )
    rv.to_pickle('/tmp/rv.pkl')
else:
    # NOTE(review): unpickling executes arbitrary code and /tmp is typically
    # world-writable; only load this file if this script produced it.
    from pandas import read_pickle
    rv = read_pickle('/tmp/rv.pkl')
# Debug dump, disabled by default: prints the sizes and first rows of the
# sampled series and of the share table, separated by a horizontal rule.
# ``ss`` is only bound when the generation branch above has run.
if False:
    rule = '\N{box drawings light horizontal}' * 40
    sizes = '\n'.join([
        # f'{s.size = :,}',
        f'{ss.size = :,}',
        f'{rv.size = :,}',
    ])
    print(
        sizes,
        # s.head(10),
        ss.head(10),
        rv.head(10),
        sep='\n{}\n'.format(rule),
    )
# Y tick labels: walk the date index from newest to oldest with a negative
# stride chosen so that at most 24 dates are kept.
tick_step = -len(rv.index) // 24
yaxis = rv.index[::tick_step]

# X order: version labels sorted numerically component by component
# (so '4.10.0' comes after '4.9.0'), not lexically.
xaxis = sorted(
    rv.columns,
    key=lambda label: tuple(int(part) for part in label.split('.')),
)
def _shannon_bits(shares):
    """Return -sum(p * log2(p)) over the strictly positive entries of *shares*."""
    positive = shares[shares > 0]
    return (-(positive * log2(positive))).sum()


# Entropy (in bits) of each date's version-share row, ordered newest first.
with timed('entropy'):
    entropy_by_date = rv.apply(_shannon_bits, axis='columns')
    entropy = entropy_by_date.iloc[::-1]
def _deployment_span(shares, columns, threshold):
    """Locate when each version's share first and last exceeds *threshold*.

    Args:
        shares: DataFrame with one row per date and one column per version.
        columns: Version labels in plotting order; a label's position in
            this sequence becomes its row label in the result.
        threshold: Share that a version must strictly exceed on a date.

    Returns:
        DataFrame indexed by column position, with columns ``'first'`` and
        ``'last'`` holding the positions of those dates within the
        *reversed* date index of *shares*. Versions that never exceed
        *threshold* are left out instead of raising ``IndexError``.
    """
    reversed_position = shares.index[::-1].get_loc
    spans = {}
    for position, label in enumerate(columns):
        series = shares[label]
        above = series[series > threshold]
        if above.empty:
            # Never crossed the threshold: nothing to mark for this version.
            continue
        spans[position] = (above.index[0], above.index[-1])
    return DataFrame(spans, index=['first', 'last']).T.applymap(reversed_position)


# Convert row positions into the heat map's vertical coordinates, which run
# from 0 to len(yaxis).
with timed('deployed'):
    p99 = _deployment_span(rv, xaxis, 0.01) / len(rv.index) * len(yaxis)
    p90 = _deployment_span(rv, xaxis, 0.10) / len(rv.index) * len(yaxis)
# Two panels sharing the y axis: a wide heat map and a narrow entropy curve.
fig, [main, side] = subplots(
    nrows=1,
    ncols=2,
    sharey=True,
    gridspec_kw={'wspace': 0, 'width_ratios': [5, 1]},
)

# Entropy curve, with its y values rescaled to the heat map's 0..len(yaxis)
# vertical range.
entropy_y = arange(len(entropy.index)) * len(yaxis) / len(entropy.index)
side.plot(entropy.values, entropy_y)

# Reposition the side panel: start at the main panel's right edge, keep its
# width, and shrink/shift it vertically by the hand-tuned factors below.
main_box = main.get_position()
side_box = side.get_position()
side_height = side_box.y1 - side_box.y0
side.set_position([
    main_box.x1,
    side_box.y0 + side_height / 3.15,
    side_box.x1 - side_box.x0,
    side_height / 2.75,
])
side.axis('off')

for spine in ('top', 'right'):
    main.spines[spine].set_visible(False)
main.set_xticks(arange(len(xaxis)))
main.set_xticklabels(xaxis, rotation=45, ha='right')
main.set_yticks(arange(len(yaxis)))
main.set_yticklabels(yaxis.strftime('%Y-%b'), ha='right')

# One single-column image per version, one unit wide and centred on the
# version's x position, spanning the full height of the axes.
for column, version in enumerate(xaxis):
    strip = rv[version].values[..., newaxis]
    main.imshow(
        strip,
        extent=(column - .5, column + .5, 0, len(yaxis)),
        cmap='Blues',
    )

# Dotted markers at the first/last threshold crossings of each version,
# drawn for both thresholds with colours looked up in the Blues colormap.
blues = get_cmap('Blues')
marker_styles = [
    (p90, .25, blues(0)),
    (p99, .15, blues(float('inf'))),
]
for deployed, alpha, color in marker_styles:
    for edge in ('first', 'last'):
        main.hlines(
            deployed[edge].values,
            deployed[edge].index - .5,
            deployed[edge].index + .5,
            linestyle='dotted',
            color=color, alpha=alpha,
        )

show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment