Created
January 6, 2021 14:03
-
-
Save hugo1005/71dc232b0316974206b07597cda8ded7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import pymc3 as pm | |
import matplotlib.pyplot as plt | |
import numpy as np | |
from tqdm import tqdm | |
# Load the breakpoint observations and build dense integer index maps for
# the sector -> industry hierarchy used by the hierarchical model below.
model_df = pd.read_csv('./breakpoints.csv')

sectors = model_df['equity_sector'].unique()
industry = model_df['equity_industry'].unique()
n_sectors = len(sectors)
n_industries = len(industry)

# BUG FIX: `max_quarter` was used here without ever being defined, which
# raises a NameError before the model is ever built.  Derive it from the
# observed data instead.
# NOTE(review): this assumes 'quarter' holds 1-based quarter numbers so
# that range(n_quarters) covers every observed quarter index used by the
# counting loops below -- confirm the indexing convention in the CSV.
max_quarter = model_df['quarter'].max()
n_quarters = int(max_quarter)

# id -> name lookups and parent/child relations for the hierarchy.
sector_map = {}                     # sector id -> sector name
industry_map = {}                   # industry id -> industry name
sector_industry_count = {}          # sector id -> number of industries in it
industry_to_sector_map = {}         # industry id -> owning sector id
sector_to_industry_map = {i: [] for i in range(n_sectors)}  # sector id -> [industry ids]
industry_idx = 0
sector_idx = 0
# Assign dense integer ids: sectors in first-appearance order, and within
# each sector its industries in first-appearance order.  All four maps
# created above are filled in place.
for sector in sectors:
    inds_in_sector = model_df.loc[model_df['equity_sector'] == sector, 'equity_industry'].unique()
    sector_map[sector_idx] = sector
    sector_industry_count[sector_idx] = len(inds_in_sector)
    # Record each industry's id, its parent sector, and the reverse link.
    for industry in inds_in_sector:
        industry_map[industry_idx] = industry
        industry_to_sector_map[industry_idx] = sector_idx
        sector_to_industry_map[sector_idx].append(industry_idx)
        industry_idx += 1
    sector_idx += 1
# Name arrays ordered by integer id (object dtype, matching the original
# DataFrame round-trip), plus name -> id inverse lookup tables.
sorted_sectors = pd.Series(sector_map).values
sorted_industries = pd.Series(industry_map).values
sector_map_inv = {name: idx for idx, name in sector_map.items()}
industry_map_inv = {name: idx for idx, name in industry_map.items()}
# Attach the integer ids to every observation row.  The mappings are total
# (built from the same columns' uniques), so .map matches .replace here.
model_df['sector_idx'] = model_df['equity_sector'].map(sector_map_inv)
model_df['industry_idx'] = model_df['equity_industry'].map(industry_map_inv)
# Build per-quarter event counts at industry level and aggregate them up
# to sector and whole-market level.  Quarter index q is the list position.
observed_industry_counts = {}
observed_sector_counts = {s: [0] * n_quarters for s in range(n_sectors)}
observed_market_counts = [0] * n_quarters

for ind in range(n_industries):
    quarter_counts = model_df.loc[model_df['industry_idx'] == ind, 'quarter'].value_counts().sort_index()
    # Owning sector is fixed per industry, so look it up once.
    sector_idx = industry_to_sector_map[ind]
    per_quarter = []
    for q in range(n_quarters):
        # Quarters with no events are absent from value_counts -> count 0.
        c = quarter_counts[q] if q in quarter_counts.index else 0
        per_quarter.append(c)
        observed_sector_counts[sector_idx][q] += c
        observed_market_counts[q] += c
    observed_industry_counts[ind] = per_quarter
def compute_observed_exposed(observed_counts):
    """Build exposed-to-risk series from per-quarter event counts.

    Parameters
    ----------
    observed_counts : dict of list, or list
        Event counts per quarter (row position = quarter) for one or more
        columns (industries / sectors / the whole market).

    Returns
    -------
    tuple
        ``(E_0, counts_df, central_exposed_df)`` where ``E_0`` is the
        per-column total event count (initial population exposed to risk),
        ``counts_df`` is the input as a DataFrame, and
        ``central_exposed_df`` is the central exposed-to-risk: the average
        of consecutive start-of-quarter exposures, with the first quarter
        taken as the full initial exposure.
    """
    counts_df = pd.DataFrame(observed_counts)
    # Everything that is ever observed counts toward the initial exposure.
    initial_exposed = counts_df.sum(axis=0)
    # Exposure entering quarter q = initial minus all events before q.
    events_before = counts_df.cumsum(axis=0).shift(1).fillna(0)
    start_exposed = initial_exposed - events_before
    # Central exposure: midpoint of consecutive start-of-quarter values;
    # the first row has no predecessor, so use the full start exposure.
    central = 0.5 * (start_exposed + start_exposed.shift(1))
    central.iloc[0, :] = start_exposed.iloc[0, :]
    return initial_exposed, counts_df, central
# Exposure series at each level of the sector/industry hierarchy.
E_0_s_industry, observed_industry_counts_df, industry_exposed_to_risk_df = compute_observed_exposed(observed_industry_counts)
E_0_s_sector, observed_sector_counts_df, sector_exposed_to_risk_df = compute_observed_exposed(observed_sector_counts)
E_0_s_market, observed_market_counts_df, market_exposed_to_risk_df = compute_observed_exposed(observed_market_counts)
# The market level has a single column; keep only its scalar total.
E_0_s_market = E_0_s_market[0]

# Crude (non-Bayesian) hazard estimates: events / central exposed-to-risk,
# with the integer column ids relabelled back to their names.
crude_estimates = (observed_sector_counts_df / sector_exposed_to_risk_df).rename(columns=sector_map)
crude_estimates['market'] = observed_market_counts_df / market_exposed_to_risk_df
crude_estimates.fillna(0).to_csv('crude_estimates.csv')

crude_estimates_ind = (observed_industry_counts_df / industry_exposed_to_risk_df).rename(columns=industry_map)
crude_estimates_ind.fillna(0).to_csv('crude_estimates_industry.csv')
import arviz | |
import seaborn as sns | |
def summary_fit(trace):
    """Summarise a PyMC3 trace: posterior mean and 94% HDI per variable.

    Variables whose names contain ``'log__'`` (auto-created transformed
    copies) are skipped.  Returns a DataFrame indexed by variable name
    with columns ``'mean'``, ``'hdi_3'`` and ``'hdi_97'``.
    """
    metrics = {}
    for name in trace.varnames:
        if 'log__' in name:
            continue  # skip the transformed-space duplicates
        samples = trace[name]
        lower, upper = arviz.hdi(samples, hdi_prob=0.94)
        metrics[name] = {'mean': np.mean(samples),
                         'hdi_3': lower,
                         'hdi_97': upper}
    return pd.DataFrame(metrics).T
# Specify and fit a hierarchical Poisson model independently for each
# quarter, writing parameter summaries and ELBO traces to ./output.
import os

# Robustness: the to_csv calls below fail if ./output does not exist.
os.makedirs('./output', exist_ok=True)

# BUG FIX: `traces` was assigned inside the loop (traces[q] = ...) without
# ever being initialised, raising a NameError on every quarter that the
# bare `except:` below then silently swallowed.
traces = {}

for q in tqdm(range(n_quarters)):
    try:
        with pm.Model() as management_model:
            # Let X_q be the number of management impacts in quarter q.
            # X_q ~ Poisson(Exposed[industry, q] * mu[industry, q]), i.e.
            # lambda = (number of companies exposed) * (hazard rate).

            # Shared hyperpriors for the hazard-rate mean and spread.
            sigma_m = pm.Beta('sigma_m', alpha=2, beta=5)
            mean_m = pm.Beta('mean_m', alpha=2, beta=5)

            # The model is estimated independently for each quarter but is
            # hierarchically structured market -> sector.  Observing the
            # market-level counts as a likelihood helps shape the rates
            # and avoid hierarchical "rate enveloping".

            # Market-level hazard-rate prior.
            hazard_rate_prior_m = pm.Gamma('mu_market_%s' % q,
                                           mu=mean_m,
                                           sigma=sigma_m)

            # Market-level data for this quarter.
            E_q_m = market_exposed_to_risk_df[0][q]   # exposed to risk
            D_q_m = observed_market_counts_df[0][q]   # observed events

            # Expected market event count and its Poisson likelihood.
            lambda_m = hazard_rate_prior_m * E_q_m
            likelihood_market = pm.Poisson('events_market_' + str(q),
                                           mu=lambda_m,
                                           observed=D_q_m)

            # Sector rates share the market-level mean hyperprior, each
            # with its own spread.
            hazard_rates_sectors = []
            for sec_idx in range(n_sectors):
                sector_name = '%s_%s' % (sector_map[sec_idx], q)
                sigma_s = pm.Beta('sigma_' + sector_name, alpha=2, beta=5)
                hazard_rate_prior_s = pm.Gamma('mu_' + sector_name,
                                               mu=mean_m,
                                               sigma=sigma_s)
                hazard_rates_sectors.append(hazard_rate_prior_s)

                # Sector-level data and likelihood for this quarter.
                E_q_s = sector_exposed_to_risk_df[sec_idx][q]
                D_q_s = observed_sector_counts_df[sec_idx][q]
                lambda_s = hazard_rate_prior_s * E_q_s
                likelihood_sector = pm.Poisson('events_' + sector_name,
                                               mu=lambda_s,
                                               observed=D_q_s)

            # Fit with ADVI variational inference, then summarise the
            # posterior from 2000 draws of the fitted approximation.
            advi_fit = pm.fit(method=pm.ADVI())
            traces[q] = advi_fit
            advi_trace = advi_fit.sample(2000)
            results = summary_fit(advi_trace)
            results.to_csv('./output/parameter_estimates_%s.csv' % q)

            # NOTE(review): advi_fit.hist tracks the optimisation
            # objective (the negative ELBO); the -log transform is kept
            # as in the original -- confirm the intended quantity.
            advi_elbo = pd.DataFrame(
                {'log-ELBO': -np.log(advi_fit.hist),
                 'n': np.arange(advi_fit.hist.shape[0])})
            advi_elbo.to_csv('./output/elbo_graph_%s.csv' % q)
    except Exception as exc:
        # BUG FIX: the original bare `except:` caught everything
        # (including KeyboardInterrupt) and discarded the actual error.
        print("error occurred in sampling: ", q, repr(exc))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment