Skip to content

Instantly share code, notes, and snippets.

@CamDavidsonPilon
Last active January 2, 2019 19:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save CamDavidsonPilon/779e8644915caaeb9fb6bff92a241146 to your computer and use it in GitHub Desktop.
Save CamDavidsonPilon/779e8644915caaeb9fb6bff92a241146 to your computer and use it in GitHub Desktop.
from time import time
import pandas as pd
import numpy as np
from lifelines.datasets import load_rossi
from lifelines import CoxPHFitter
# Benchmark: the batch algorithm (in CTV) vs the single-iteration algorithm
# (the original in CPH).
# Grid: N (number of rows) vs fraction of unique durations (unique(T) / N).

# Row count of one copy of the rossi dataset (assumed; it is only used to
# label each result row by its total N = n_copies * ROSSI_ROWS).
ROSSI_ROWS = 432

# How many stacked copies of the dataset to fit on.
N_COPIES_GRID = (1, 5, 10, 20, 50)

# Lower fractions mean more ties.
# The original rossi dataset has 0.113.
FRACTION_GRID = (0.05, 0.25, 0.5, 0.75, 0.95, 1.00)


def make_tied_durations(n_rows, fraction):
    """Return ``n_rows`` random durations, roughly ``fraction`` of them unique.

    A pool of ``int(n_rows * fraction) + 1`` exponential draws (scale 10,
    rounded to 5 decimals) is tiled until it covers ``n_rows`` entries and
    then truncated, so values from the start of the pool repeat (ties).

    Parameters
    ----------
    n_rows : int
        Length of the returned array.
    fraction : float
        Target ratio of unique durations to rows; must be in (0, 1].

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``n_rows``.

    Raises
    ------
    ValueError
        If ``fraction`` is not in (0, 1] (``1 / fraction`` would otherwise
        divide by zero or produce a meaningless repeat count).
    """
    if not 0 < fraction <= 1:
        raise ValueError("fraction must be in (0, 1], got %r" % (fraction,))
    n_unique_durations = int(n_rows * fraction) + 1
    unique_durations = np.round(
        np.random.exponential(10, size=n_unique_durations), 5
    )
    # ceil(1 / fraction) repeats of the pool always cover at least n_rows.
    n_repeats = int(np.ceil(1 / fraction))
    return np.tile(unique_durations, n_repeats)[:n_rows]


def time_fit(df, batch_mode):
    """Fit a fresh ``CoxPHFitter`` on ``df`` and return the elapsed seconds.

    Only the ``fit`` call is timed; constructing the fitter is excluded.

    Parameters
    ----------
    df : pandas.DataFrame
        Data with a ``'week'`` duration column and an ``'arrest'`` event column.
    batch_mode : bool
        Passed through to ``CoxPHFitter(batch_mode=...)``.

    Returns
    -------
    float
        Wall-clock seconds spent inside ``fit``.
    """
    fitter = CoxPHFitter(batch_mode=batch_mode)
    start_time = time()
    fitter.fit(df, 'week', 'arrest')
    return time() - start_time


def run_benchmark(n_copies_grid=N_COPIES_GRID, fraction_grid=FRACTION_GRID):
    """Time batch vs single fitting over the (size, tie-fraction) grid.

    For every grid cell the rossi data is stacked ``n_copies`` times, its
    ``'week'`` column is replaced by synthetic durations with the requested
    fraction of unique values, and both fitting modes are timed on it.
    Progress (the grid cell and the realized unique fraction) is printed.

    Parameters
    ----------
    n_copies_grid : iterable of int
        Numbers of stacked dataset copies to try.
    fraction_grid : iterable of float
        Target fractions of unique durations to try.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``(n_copies * ROSSI_ROWS, fraction)`` with columns
        ``'batch'``, ``'single'`` (seconds) and ``'ratio'`` (batch / single).
    """
    # Loop-invariant: read the dataset once instead of once per grid cell.
    rossi = load_rossi()
    timings = {}
    for n_copies in n_copies_grid:
        for fraction in fraction_grid:
            print(n_copies, fraction)
            df = pd.concat([rossi] * n_copies)
            df['week'] = make_tied_durations(df.shape[0], fraction)
            # Realized fraction of unique durations, as a sanity check.
            print(1.0 * df['week'].unique().shape[0] / df.shape[0])

            batch_time = time_fit(df, batch_mode=True)
            single_time = time_fit(df, batch_mode=False)
            timings[(n_copies * ROSSI_ROWS, fraction)] = {
                'batch': batch_time,
                'single': single_time,
            }

    table = pd.DataFrame(timings).T.sort_index()
    table['ratio'] = table['batch'] / table['single']
    return table


if __name__ == "__main__":
    # Bound at module level so `results` stays inspectable after a script run.
    results = run_benchmark()
    print(results)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment