CamDavidsonPilon/cox_performance_batch_single.py

## cox_performance_batch_single.py

from time import perf_counter, time

import numpy as np
import pandas as pd
from lifelines import CoxPHFitter
from lifelines.datasets import load_rossi

# Benchmark: compare the batch algorithm (in CTV) against the single-iteration
# algorithm (original in CPH) while varying
#   N        -- number of rows (n_copies stacked copies of the rossi dataset)
#   fraction -- tie density, measured as unique(T) / N


ROSSI_ROWS = 432
results = {}


def _time_fit(df, batch_mode):
    """Return the seconds spent fitting a fresh CoxPHFitter on ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame handed to ``fit`` together with the column names 'week' and
        'arrest'.
    batch_mode : bool
        Forwarded unchanged to ``CoxPHFitter(batch_mode=...)``.

    Returns
    -------
    float
        Elapsed time of the ``fit`` call only; constructing the fitter is
        not included in the measurement.
    """
    cph = CoxPHFitter(batch_mode=batch_mode)
    # perf_counter is a monotonic, high-resolution clock; the wall clock
    # returned by time() can jump and may be too coarse for short fits.
    start_time = perf_counter()
    cph.fit(df, 'week', 'arrest')
    return perf_counter() - start_time


# load_rossi() depends on neither loop variable, so call it once up front
# instead of once per (n_copies, fraction) pair.
rossi = load_rossi()

for n_copies in [1, 5, 10, 20, 50]:

    # Lower fractions mean more ties.
    # The original rossi dataset has 0.113.
    for fraction in [0.05, 0.25, 0.5, 0.75, 0.95, 1.00]:
        print(n_copies, fraction)

        # pd.concat builds a new frame, so overwriting 'week' below leaves
        # `rossi` itself untouched for the next iteration.
        df = pd.concat([rossi] * n_copies)

        # Draw the pool of distinct durations, then repeat it often enough to
        # cover every row; the slice trims the overshoot.
        n_unique_durations = int(df.shape[0] * fraction) + 1
        unique_durations = np.round(np.random.exponential(10, size=n_unique_durations), 5)
        df['week'] = np.tile(unique_durations, int(np.ceil(1 / fraction)))[:df.shape[0]]

        # Realised share of unique durations, printed as a check on `fraction`.
        print(1.0 * df['week'].unique().shape[0] / df.shape[0])

        # Same order as before: batch fit first, then single-iteration fit.
        batch_time = _time_fit(df, batch_mode=True)
        single_time = _time_fit(df, batch_mode=False)

        results[(n_copies * ROSSI_ROWS, fraction)] = {'batch': batch_time, 'single': single_time}

# One row per (N, fraction); ratio < 1 means the batch fit took less time.
results = pd.DataFrame(results).T.sort_index()
results['ratio'] = results['batch'] / results['single']
print(results)

	from time import time
	import pandas as pd
	import numpy as np
	from lifelines.datasets import load_rossi
	from lifelines import CoxPHFitter

	# This compares the batch algorithm (in CTV) vs the single iteration algorithm (original in CPH)
	# N vs (% ties == unique(T) / N)
	#
	# Loop nesting is restored here: with every statement at the same column
	# the two `for` loops had no bodies and the snippet could not be parsed.


	ROSSI_ROWS = 432
	results = {}

	# load_rossi() depends on neither loop variable, so call it once up front.
	rossi = load_rossi()

	for n_copies in [1, 5, 10, 20, 50]:

	    # Lower fractions mean more ties.
	    # The original rossi dataset has 0.113.
	    for fraction in [0.05, 0.25, 0.5, 0.75, 0.95, 1.00]:
	        print(n_copies, fraction)

	        # pd.concat builds a new frame, so overwriting 'week' below leaves
	        # `rossi` itself untouched for the next iteration.
	        df = pd.concat([rossi] * n_copies)

	        # Draw the pool of distinct durations, then repeat it often enough
	        # to cover every row; the slice trims the overshoot.
	        n_unique_durations = int(df.shape[0] * fraction) + 1
	        unique_durations = np.round(np.random.exponential(10, size=n_unique_durations), 5)

	        df['week'] = np.tile(unique_durations, int(np.ceil(1 / fraction)))[:df.shape[0]]

	        # Realised share of unique durations, printed as a check on `fraction`.
	        print(1.0 * df['week'].unique().shape[0] / df.shape[0])

	        # Time only the fit call for the batch algorithm.
	        cph_batch = CoxPHFitter(batch_mode=True)
	        start_time = time()
	        cph_batch.fit(df, 'week', 'arrest')
	        batch_time = time() - start_time

	        # Same measurement for the single-iteration algorithm.
	        cph_single = CoxPHFitter(batch_mode=False)
	        start_time = time()
	        cph_single.fit(df, 'week', 'arrest')
	        single_time = time() - start_time

	        results[(n_copies * ROSSI_ROWS, fraction)] = {'batch': batch_time, 'single': single_time}

	# One row per (N, fraction); ratio < 1 means the batch fit took less time.
	results = pd.DataFrame(results).T.sort_index()
	results['ratio'] = results['batch'] / results['single']
	print(results)