Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@hugo1005
Created January 6, 2021 14:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hugo1005/71dc232b0316974206b07597cda8ded7 to your computer and use it in GitHub Desktop.
import pandas as pd
import pymc3 as pm
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
# Load breakpoint observations; expected columns include 'equity_sector',
# 'equity_industry' and 'quarter' (an integer quarter index).
model_df = pd.read_csv('./breakpoints.csv')

sectors = model_df['equity_sector'].unique()
industry = model_df['equity_industry'].unique()
n_sectors = len(sectors)
n_industries = len(industry)

# BUG FIX: `max_quarter` was referenced below but never defined (NameError).
# Derive it from the data. Quarters appear to be 0-indexed integers (the
# counting loops below run over range(n_quarters)), so the number of
# quarters is max + 1 -- TODO confirm the indexing convention of 'quarter'.
max_quarter = model_df['quarter'].max() + 1
n_quarters = int(max_quarter)

# Lookup tables describing the market -> sector -> industry hierarchy,
# using dense integer ids assigned in first-seen order.
sector_map = {}               # sector id -> sector name
industry_map = {}             # industry id -> industry name
sector_industry_count = {}    # sector id -> number of industries within it
industry_to_sector_map = {}   # industry id -> parent sector id
sector_to_industry_map = {i: [] for i in range(n_sectors)}
industry_idx = 0
sector_idx = 0
# Walk the sectors in first-seen order, assigning dense integer ids to each
# sector and to every industry nested inside it, recording the hierarchy in
# both directions (industry -> sector and sector -> [industries]).
for sector in sectors:
    in_sector = model_df['equity_sector'] == sector
    unique_industries = model_df.loc[in_sector, 'equity_industry'].unique()
    sector_map[sector_idx] = sector
    sector_industry_count[sector_idx] = len(unique_industries)
    for industry in unique_industries:
        industry_map[industry_idx] = industry
        industry_to_sector_map[industry_idx] = sector_idx
        sector_to_industry_map[sector_idx].append(industry_idx)
        industry_idx += 1
    sector_idx += 1
# Names ordered by their integer id (a Series built from the dict keeps
# insertion order, which is id order 0..n-1).
sorted_sectors = pd.Series(sector_map).values
sorted_industries = pd.Series(industry_map).values

# Invert the name <-> id lookups so names can be mapped back to dense ids.
sector_map_inv = {name: idx for idx, name in sector_map.items()}
industry_map_inv = {name: idx for idx, name in industry_map.items()}

# Attach the dense ids to each observation row.
model_df['sector_idx'] = model_df['equity_sector'].replace(sector_map_inv)
model_df['industry_idx'] = model_df['equity_industry'].replace(industry_map_inv)
# Build per-quarter event counts at every level of the hierarchy:
# industry (dict of per-quarter lists), sector and market (aggregated up).
observed_industry_counts = {}
observed_sector_counts = {s: [0 for q in range(n_quarters)] for s in range(n_sectors)}
observed_market_counts = [0 for q in range(n_quarters)]

for ind in range(n_industries):
    quarter_counts = model_df.loc[model_df['industry_idx'] == ind, 'quarter'] \
        .value_counts().sort_index()
    per_quarter = []
    for q in range(int(n_quarters)):
        # Quarters with no events are absent from value_counts; default to 0.
        count = quarter_counts.get(q, 0)
        per_quarter.append(count)
        sector_idx = industry_to_sector_map[ind]
        observed_sector_counts[sector_idx][q] += count
        observed_market_counts[q] += count
    observed_industry_counts[ind] = per_quarter
def compute_observed_exposed(observed_counts):
    """Turn per-quarter event counts into exposed-to-risk series.

    Parameters
    ----------
    observed_counts : mapping/list convertible to a DataFrame; one column
        per group, one row per quarter, values are event counts.

    Returns
    -------
    tuple of (initial exposed per group, counts DataFrame, central
    exposed-to-risk DataFrame). Exposure at the start of a quarter is the
    initial population minus all prior events; the central exposure is the
    midpoint of consecutive quarters' exposures (the first quarter keeps
    its raw exposure since there is no prior quarter).
    """
    counts_df = pd.DataFrame(observed_counts)
    # Total events per group = the initial population exposed to risk.
    initial_exposed = counts_df.sum(axis=0)
    # Survivors at the start of each quarter.
    prior_events = counts_df.cumsum(axis=0).shift(1).fillna(0)
    exposed_df = initial_exposed - prior_events
    # Midpoint of this quarter's and the previous quarter's exposure.
    central_df = (exposed_df.shift(1) + exposed_df) * 0.5
    central_df.iloc[0, :] = exposed_df.iloc[0, :]
    return initial_exposed, counts_df, central_df
# Industry level
E_0_s_industry, observed_industry_counts_df, industry_exposed_to_risk_df = compute_observed_exposed(observed_industry_counts)
# Aggregated to sector level
E_0_s_sector, observed_sector_counts_df, sector_exposed_to_risk_df = compute_observed_exposed(observed_sector_counts)
# Aggregated to market level (a plain list becomes a single column named 0)
E_0_s_market, observed_market_counts_df, market_exposed_to_risk_df = compute_observed_exposed(observed_market_counts)
E_0_s_market = E_0_s_market[0]  # collapse the one-element Series to a scalar

# Crude (non-Bayesian) hazard-rate estimates: events / central exposure.
crude_estimates = observed_sector_counts_df / sector_exposed_to_risk_df
crude_estimates.columns = [sector_map[s] for s in observed_sector_counts_df.columns]
# BUG FIX: the original assigned a one-column DataFrame to a column, which
# is unreliable (or raises) across pandas versions; divide the column-0
# Series instead so a proper Series is assigned.
crude_estimates['market'] = observed_market_counts_df[0] / market_exposed_to_risk_df[0]
crude_estimates.fillna(0).to_csv('crude_estimates.csv')

crude_estimates_ind = observed_industry_counts_df / industry_exposed_to_risk_df
crude_estimates_ind.columns = [industry_map[s] for s in observed_industry_counts_df.columns]
crude_estimates_ind.fillna(0).to_csv('crude_estimates_industry.csv')
import arviz
import seaborn as sns
def summary_fit(trace):
    """Summarise posterior samples: mean and 94% HDI per free variable.

    Skips the transformed (``log__``) copies that pymc3 adds automatically.
    Returns a DataFrame indexed by variable name with columns
    ``mean`` / ``hdi_3`` / ``hdi_97``.
    """
    rows = {}
    for name in trace.varnames:
        if 'log__' in name:
            continue  # auto-transformed duplicate; not of interest
        samples = trace[name]
        lo, hi = arviz.hdi(samples, hdi_prob=0.94)
        rows[name] = {'mean': np.mean(samples), 'hdi_3': lo, 'hdi_97': hi}
    return pd.DataFrame(rows).T
# Specify and fit one hierarchical model per quarter and write out results.
# Hierarchy: market hazard rate -> sector hazard rates, each with a Poisson
# likelihood on the observed event counts for that quarter.
#
# BUG FIX: `traces` was used below but never initialised; the resulting
# NameError was silently swallowed by the previous bare `except:`, so no
# quarter ever produced output.
traces = {}
for q in tqdm(range(n_quarters)):
    try:
        with pm.Model() as management_model:
            # Let X_q be the number of management impacts in quarter q;
            # X_q ~ Po(Exposed[industry, q] * mu[industry, q]), i.e.
            # lambda = (number of companies * probability of impact).
            # Shared hyperpriors for the hazard-rate mean and spread.
            sigma_m = pm.Beta('sigma_m', alpha=2, beta=5)
            mean_m = pm.Beta('mean_m', alpha=2, beta=5)

            # Market-level hazard rate. Conditioning on observed market
            # data helps shape the rates and avoids hierarchical
            # "rate enveloping".
            hazard_rate_prior_m = pm.Gamma('mu_market_%s' % q,
                                           mu=mean_m,
                                           sigma=sigma_m)
            E_q_m = market_exposed_to_risk_df[0][q]   # exposed to risk
            D_q_m = observed_market_counts_df[0][q]   # observed events
            lambda_m = hazard_rate_prior_m * E_q_m    # mean event count
            likelihood_market = pm.Poisson('events_market_' + str(q),
                                           mu=lambda_m,
                                           observed=D_q_m)

            # Sector-level hazard rates share the market-level mean prior.
            hazard_rates_sectors = []
            for sec_idx in range(n_sectors):
                sector_name = '%s_%s' % (sector_map[sec_idx], q)
                sigma_s = pm.Beta('sigma_' + sector_name, alpha=2, beta=5)
                hazard_rate_prior_s = pm.Gamma('mu_' + sector_name,
                                               mu=mean_m,
                                               sigma=sigma_s)
                hazard_rates_sectors.append(hazard_rate_prior_s)
                E_q_s = sector_exposed_to_risk_df[sec_idx][q]
                D_q_s = observed_sector_counts_df[sec_idx][q]
                lambda_s = hazard_rate_prior_s * E_q_s
                likelihood_sector = pm.Poisson('events_' + sector_name,
                                               mu=lambda_s,
                                               observed=D_q_s)

            # Variational inference (ADVI) fit for this quarter's model.
            advi_fit = pm.fit(method=pm.ADVI())

        traces[q] = advi_fit
        advi_trace = advi_fit.sample(2000)
        results = summary_fit(advi_trace)
        results.to_csv('./output/parameter_estimates_%s.csv' % q)
        advi_elbo = pd.DataFrame(
            {'log-ELBO': -np.log(advi_fit.hist),
             'n': np.arange(advi_fit.hist.shape[0])})
        advi_elbo.to_csv('./output/elbo_graph_%s.csv' % q)
    except Exception as exc:
        # BUG FIX: was a bare `except:` that swallowed every error (even
        # KeyboardInterrupt) with no detail; report the cause and continue.
        print("error occured in sampling: ", q, exc)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment