Last active
August 18, 2022 03:38
-
-
Save dutc/f94619c10872a8aa6929e4e21295115d to your computer and use it in GitHub Desktop.
Plotting Version Numbers
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pandas import MultiIndex, date_range, Series, Categorical, merge, get_dummies, date_range, DataFrame | |
from itertools import product | |
from numpy import arange, zeros, newaxis, tile, log2 | |
from numpy.random import default_rng | |
from string import ascii_lowercase | |
from contextlib import contextmanager | |
from time import perf_counter | |
from matplotlib.cm import get_cmap | |
from matplotlib.pyplot import subplots, show | |
from sys import exit | |
import sys; sys.breakpointhook = exit | |
# from pandas import set_option; set_option('display.max_rows', None) | |
@contextmanager
def timed(msg):
    """Print the wall-clock duration of the managed block, labelled by *msg*.

    The label is left-aligned in a 36-character column so consecutive
    reports line up. The report is emitted from a ``finally`` clause, so it
    is printed even when the managed block raises.
    """
    start = perf_counter()
    try:
        yield
    finally:
        elapsed = perf_counter() - start
        print(f'{msg:<36} \N{mathematical bold capital delta}t: {elapsed:.4f}s')
# Seeded generator: every random draw below is reproducible, and the order
# of the draws matters for reproducing the same data.
rng = default_rng(0)

# 25,000 entity names: draw a (25_000, 8) grid of single letters, then
# reinterpret each row of eight 1-character cells as one 8-character string.
letter_grid = rng.choice(list(ascii_lowercase), size=(25_000, 8))
entities = letter_grid.view('<U8').ravel()
entities.sort()
entities = Categorical(entities)

# Two years of daily timestamps (365 * 2 periods starting 2020-01-01).
dates = date_range('2020-01-01', periods=365 * 2)

# Candidate versions are every (major, minor, patch) with major in {4, 5} and
# minor/patch in 0..19; keep 48 of them, in ascending order, as dotted strings.
candidate_versions = [*product(range(4, 5 + 1), range(20), range(20))]
chosen = sorted(rng.choice(arange(len(candidate_versions)), size=48, replace=False))
versions = ['.'.join(map(str, candidate_versions[pick])) for pick in chosen]

# Four slots per day; the leading len(versions) - 1 slots are flagged as
# "an update happens" (the flags are shuffled per entity further down).
updates = zeros(shape=len(dates) * 4, dtype=int)
updates[:len(versions) - 1] = 1
# Manual toggle: flip to True to regenerate the data and refresh the pickle
# cache; left False, the previously cached result is loaded instead.
if False:
    with timed('.from_product'):
        full_index = MultiIndex.from_product(
            [entities, dates],
            names=['entity', 'date'],
            sortorder=0,
        )

    with timed('Series'):
        def _version_steps(group):
            # Shuffle the update flags, accumulate them into a running
            # version ordinal, and keep one reading per day (every 4th slot).
            shuffled = rng.choice(arange(len(updates)), size=len(updates), replace=False)
            return updates[shuffled].cumsum()[::4]

        history = Series(index=full_index, data=0).groupby('entity').transform(_version_steps)
        # Translate the ordinals into categorical version strings, keeping
        # the (entity, date) index.
        history = Series(versions, dtype='category').loc[history.values].set_axis(history.index)

    # with timed('.sort_index'):
    #     s = s.sort_index()

    with timed('.sample'):
        sample_fraction = len(versions) / len(dates)
        ss = (
            history
            .sample(frac=sample_fraction, random_state=rng)
            .rename('sampled')
            .sort_index()
        )

    # Drop the dense simulated history before building the next large index.
    del full_index, history

    with timed('.from_product'):
        observed_dates = ss.index.get_level_values('date')
        idx = MultiIndex.from_product(
            [
                ss.index.levels[0],
                date_range(observed_dates.min(), observed_dates.max()),
            ],
            names=['entity', 'date'],
            sortorder=0,
        )

    with timed('assign'):
        # Empty dense frame over every (entity, date), then drop the sampled
        # observations into their slots.
        rv = Series(index=idx, data=None, dtype=ss.dtype).sort_index()
        rv.loc[ss.index] = ss.values

    with timed('.groupby.ffill'):
        # Carry each entity's last observed version forward; rows before an
        # entity's first observation stay missing and are dropped.
        rv = rv.groupby(['entity']).ffill()
        rv = rv.dropna()
        # rv = rv.groupby(['entity']).bfill()

    with timed('get_dummies'):
        rv = get_dummies(rv)

    with timed('.groupby.sum'):
        # Per-date count of entities on each version, normalised so every
        # row sums to 1.
        rv = rv.groupby('date').sum().apply(
            lambda row: row / row.sum(),
            axis='columns',
        )

    rv.to_pickle('/tmp/rv.pkl')
else:
    from pandas import read_pickle

    # NOTE(review): unpickling can execute arbitrary code; only load a cache
    # file that this script itself wrote.
    rv = read_pickle('/tmp/rv.pkl')
# Disabled debugging aid: sizes and the first rows of the sampled and the
# aggregated data, separated by a horizontal rule. It reads `ss`, which only
# exists when the regeneration branch above has run.
if False:
    rule = '\N{box drawings light horizontal}' * 40
    sizes = '\n'.join([
        # f'{s.size = :,}',
        f'{ss.size = :,}',
        f'{rv.size = :,}',
    ])
    print(
        sizes,
        # s.head(10),
        ss.head(10),
        rv.head(10),
        sep='\n{}\n'.format(rule),
    )
# Y tick dates: walk the date index backwards from its last entry. Unary
# minus binds tighter than //, so the step is floor(-len / 24).
tick_step = (-len(rv.index)) // 24
yaxis = rv.index[::tick_step]

# X axis: version strings ordered numerically by their dotted components.
xaxis = sorted(rv.columns, key=lambda version: tuple(map(int, version.split('.'))))


def _row_entropy(shares):
    """Sum of -p * log2(p) over the strictly positive entries of *shares*."""
    positive = shares[shares > 0]
    return (-(positive * log2(positive))).sum()


with timed('entropy'):
    # One value per date, reversed so the latest date comes first.
    entropy = rv.apply(_row_entropy, axis='columns').iloc[::-1]
def _deployment_window(threshold):
    """Find when each version's share first and last exceeds *threshold*.

    Reads the module-level ``rv`` (per-date share of each version), ``xaxis``
    (ordered version labels) and ``yaxis`` (tick dates).

    Returns a DataFrame with one row per position in ``xaxis`` and columns
    ``'first'`` / ``'last'``. Each value is the date's position in the
    reversed ``rv.index``, rescaled from ``len(rv.index)`` rows to
    ``len(yaxis)`` units so it can be drawn on the plot's y axis.

    Raises IndexError if some version's share never exceeds *threshold*.
    """
    newest_first = rv.index[::-1]

    bounds = {}
    for position, version in enumerate(xaxis):
        share = rv[version]
        above = share[share > threshold]
        bounds[position] = (above.index[0], above.index[-1])

    window_dates = DataFrame(bounds, index=['first', 'last']).T
    # Element-wise date -> row lookup. Column-wise Series.map is used instead
    # of DataFrame.applymap, which is deprecated in recent pandas releases.
    window_rows = window_dates.apply(lambda column: column.map(newest_first.get_loc))
    return window_rows / len(rv.index) * len(yaxis)


with timed('deployed'):
    # Same computation at two share thresholds: above 1% and above 10%.
    p99 = _deployment_window(0.01)
    p90 = _deployment_window(0.10)
# Two panels sharing the y axis: the wide heat-map on the left and a narrow
# entropy trace glued to its right edge.
fig, (main, side) = subplots(
    nrows=1,
    ncols=2,
    sharey=True,
    gridspec_kw={'wspace': 0, 'width_ratios': [5, 1]},
)

# Entropy per date, with row numbers rescaled to the same y units as the
# heat-map (len(yaxis) units tall).
entropy_rows = arange(len(entropy.index)) * len(yaxis) / len(entropy.index)
side.plot(entropy.values, entropy_rows)

# Re-place the side panel flush against the main panel, shifted up and
# shortened relative to the box it was originally given.
side_box = side.get_position()
side_height = side_box.y1 - side_box.y0
side.set_position([
    main.get_position().x1,
    side_box.y0 + side_height / 3.15,
    side_box.x1 - side_box.x0,
    side_height / 2.75,
])
side.axis('off')

for spine in ('top', 'right'):
    main.spines[spine].set_visible(False)
main.set_xticks(arange(len(xaxis)))
main.set_yticks(arange(len(yaxis)))
main.set_xticklabels(xaxis, rotation=45, ha='right')
main.set_yticklabels(yaxis.strftime('%Y-%b'), ha='right')

# One single-column image per version, each one unit wide and centred on its
# x tick, spanning the full y range.
for column, version in enumerate(xaxis):
    strip = rv[version].values[..., newaxis]
    main.imshow(
        strip,
        extent=(column - .5, column + .5, 0, len(yaxis)),
        cmap='Blues',
    )

# Dotted markers for the first/last rows of each deployment window, coloured
# from the 'Blues' colormap sampled at 0 and at +inf.
# NOTE(review): matplotlib.cm.get_cmap is deprecated in newer matplotlib
# releases; confirm the installed version still provides it.
blues = get_cmap('Blues')
overlays = [
    (p90, .25, blues(0)),
    (p99, .15, blues(float('inf'))),
]
for deployed, alpha, color in overlays:
    for edge in ('first', 'last'):
        rows = deployed[edge]
        main.hlines(
            rows.values,
            rows.index - .5,
            rows.index + .5,
            linestyle='dotted',
            color=color, alpha=alpha,
        )

show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment