Last active
January 2, 2019 19:33
-
-
Save CamDavidsonPilon/779e8644915caaeb9fb6bff92a241146 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from time import perf_counter
from time import time

import numpy as np
import pandas as pd
from lifelines import CoxPHFitter
from lifelines.datasets import load_rossi
# Benchmark: compare the batch algorithm (in CTV) against the single-iteration
# algorithm (the original in CPH).
#
# Two knobs are varied:
#   * N        -- number of rows (the rossi dataset stacked `n_copies` times)
#   * fraction -- share of unique durations, i.e. unique(T) / N.
#                 Lower fractions mean more ties.


def _time_fit(df, batch_mode):
    """Fit a CoxPHFitter on ``df`` and return the elapsed fit time in seconds.

    Only the ``fit`` call is timed; constructing the fitter is excluded.
    ``perf_counter`` is used because it is monotonic and high resolution.
    """
    # NOTE(review): published lifelines docs list `batch_mode` as an argument
    # of `fit()`, not of the constructor -- confirm against the installed
    # lifelines version before relying on this call.
    cph = CoxPHFitter(batch_mode=batch_mode)
    start_time = perf_counter()
    cph.fit(df, 'week', 'arrest')
    return perf_counter() - start_time


# Load the base dataset once; it does not change between iterations.
rossi = load_rossi()
ROSSI_ROWS = rossi.shape[0]

results = {}

for n_copies in [1, 5, 10, 20, 50]:
    # lower fractions means more ties.
    # original rossi dataset has 0.113
    for fraction in [0.05, 0.25, 0.5, 0.75, 0.95, 1.00]:
        print(n_copies, fraction)

        df = pd.concat([rossi] * n_copies)
        n_rows = df.shape[0]

        # Draw a pool of candidate durations, then repeat the pool enough
        # times to cover every row and truncate to exactly `n_rows` values.
        n_unique_durations = int(n_rows * fraction) + 1
        unique_durations = np.round(
            np.random.exponential(10, size=n_unique_durations), 5
        )
        n_repeats = int(np.ceil(1 / fraction))
        df['week'] = np.tile(unique_durations, n_repeats)[:n_rows]

        # Realised fraction of unique durations, as a sanity check.
        print(1.0 * df['week'].unique().shape[0] / n_rows)

        batch_time = _time_fit(df, batch_mode=True)
        single_time = _time_fit(df, batch_mode=False)

        # Key on the actual row count rather than an assumed dataset size.
        results[(n_rows, fraction)] = {'batch': batch_time, 'single': single_time}

# One row per (N, fraction) pair; ratio < 1 means the batch algorithm was faster.
results = pd.DataFrame(results).T.sort_index()
results['ratio'] = results['batch'] / results['single']
print(results)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment