# ispanos/logit_benchmark.py

## logit_benchmark.py
# Generated by chatgpt 3.5

# Prompt
# help me create a dataset with random values and use it to create a benchmark for  statsmodels.api.Logit.fit().
# use timeit to measure how long it takes to run
# create 40 X's and make it 1000000 observations (on the second prompt)


import numpy as np
import pandas as pd
import statsmodels.api as sm
import timeit
import csv

def generate_dataset(num_features=40, num_observations=1000000, seed=1):
    """Build a reproducible synthetic dataset for a logistic-regression benchmark.

    Features are drawn uniformly from [0, 1). A latent score is computed as
    ``0.5 + X . [1, 2, ..., num_features] + noise`` where the noise is
    Gaussian with standard deviation 0.1, and the binary response is 1 for
    rows whose score is strictly above the median score.

    Args:
        num_features: Number of feature columns, named ``x1`` .. ``xN``.
        num_observations: Number of rows to generate.
        seed: Seed for the random number generator.

    Returns:
        A ``pandas.DataFrame`` with the feature columns followed by an
        integer 0/1 column named ``y``.
    """
    # A private RandomState produces the same values that seeding the global
    # generator with ``np.random.seed(seed)`` would, but leaves NumPy's
    # global random state untouched for the rest of the program.
    rng = np.random.RandomState(seed)

    # Draw order (features first, then noise) determines the generated values.
    X = rng.uniform(low=0, high=1, size=(num_observations, num_features))
    noise = rng.normal(loc=0, scale=0.1, size=num_observations)

    coefficients = np.arange(1, num_features + 1)
    y_intercept = 0.5
    score = y_intercept + np.dot(X, coefficients) + noise

    # Threshold at the median to obtain a binary outcome.
    y = (score > np.median(score)).astype(int)

    columns = [f'x{i+1}' for i in range(num_features)]
    df = pd.DataFrame(X, columns=columns)
    df['y'] = y

    return df

# Function to benchmark statsmodels.api.Logit.fit()
def benchmark_logit_fit(data):
    """Time a single ``Logit.fit()`` call on ``data``.

    Args:
        data: DataFrame holding the response in a column named ``y``;
            every other column is used as a predictor.

    Returns:
        The number of seconds reported by ``timeit`` for one ``fit()`` call.
    """
    # Pick the response by name and drop it by name, so the predictors are
    # correct even when ``y`` is not the last column of the frame.
    y = data['y']
    X = sm.add_constant(data.drop(columns='y'))

    # The model is built outside the timed region; only fit() is measured.
    model = sm.Logit(y, X)

    # Run it only once: a single fit on a large dataset is already slow.
    return timeit.timeit(model.fit, number=1)

# Benchmark driver: time one Logit fit per seed, print each timing, and
# write all timings plus their mean to a CSV file.
NUM_RUNS = 200
NUM_FEATURES = 20
NUM_OBSERVATIONS = 1000000
OUTPUT_PATH = 'benchmark.csv'

times = []
output = [['seed', 'time']]

for s in range(NUM_RUNS):
    # A fresh dataset is generated for every seed.
    dataset = generate_dataset(
        num_features=NUM_FEATURES,
        num_observations=NUM_OBSERVATIONS,
        seed=s,
    )
    time_taken = benchmark_logit_fit(dataset)

    print(f"{10*'='} DATASET {s}")
    print(f"Time taken to fit the model: {time_taken:.4f} seconds")

    times.append(time_taken)
    output.append([s, time_taken])

# Final row holds the average fit time over all runs.
output.append(['mean', sum(times)/len(times)])

with open(OUTPUT_PATH, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(output)
	# Generated by chatgpt 3.5

	# Prompt
	# help me create a dataset with random values and use it to create a benchmark for statsmodels.api.Logit.fit().
	# use timeit to measure how long it takes to run
	# create 40 X's and make it 1000000 observations (on the second prompt)


	import numpy as np
	import pandas as pd
	import statsmodels.api as sm
	import timeit
	import csv

	def generate_dataset(num_features=40, num_observations=1000000, seed=1):
	    """Build a reproducible synthetic dataset for a logistic-regression benchmark.

	    Features are drawn uniformly from [0, 1). A latent score is computed as
	    ``0.5 + X . [1, 2, ..., num_features] + noise`` where the noise is
	    Gaussian with standard deviation 0.1, and the binary response is 1 for
	    rows whose score is strictly above the median score.

	    Args:
	        num_features: Number of feature columns, named ``x1`` .. ``xN``.
	        num_observations: Number of rows to generate.
	        seed: Seed for the random number generator.

	    Returns:
	        A ``pandas.DataFrame`` with the feature columns followed by an
	        integer 0/1 column named ``y``.
	    """
	    # A private RandomState produces the same values that seeding the global
	    # generator with ``np.random.seed(seed)`` would, but leaves NumPy's
	    # global random state untouched for the rest of the program.
	    rng = np.random.RandomState(seed)

	    # Draw order (features first, then noise) determines the generated values.
	    X = rng.uniform(low=0, high=1, size=(num_observations, num_features))
	    noise = rng.normal(loc=0, scale=0.1, size=num_observations)

	    coefficients = np.arange(1, num_features + 1)
	    y_intercept = 0.5
	    score = y_intercept + np.dot(X, coefficients) + noise

	    # Threshold at the median to obtain a binary outcome.
	    y = (score > np.median(score)).astype(int)

	    columns = [f'x{i+1}' for i in range(num_features)]
	    df = pd.DataFrame(X, columns=columns)
	    df['y'] = y

	    return df

	# Function to benchmark statsmodels.api.Logit.fit()
	def benchmark_logit_fit(data):
	    """Time a single ``Logit.fit()`` call on ``data``.

	    Args:
	        data: DataFrame holding the response in a column named ``y``;
	            every other column is used as a predictor.

	    Returns:
	        The number of seconds reported by ``timeit`` for one ``fit()`` call.
	    """
	    # Pick the response by name and drop it by name, so the predictors are
	    # correct even when ``y`` is not the last column of the frame.
	    y = data['y']
	    X = sm.add_constant(data.drop(columns='y'))

	    # The model is built outside the timed region; only fit() is measured.
	    model = sm.Logit(y, X)

	    # Run it only once: a single fit on a large dataset is already slow.
	    return timeit.timeit(model.fit, number=1)

	# Benchmark driver: time one Logit fit per seed, print each timing, and
	# write all timings plus their mean to a CSV file.
	times = []
	output = [['seed', 'time']]

	for s in range(200):
	    # A fresh dataset is generated for every seed.
	    dataset = generate_dataset(
	        num_features=20, num_observations=1000000, seed=s)
	    time_taken = benchmark_logit_fit(dataset)

	    print(f"{10*'='} DATASET {s}")
	    print(f"Time taken to fit the model: {time_taken:.4f} seconds")

	    times.append(time_taken)
	    output.append([s, time_taken])

	# Final row holds the average fit time over all runs.
	output.append(['mean', sum(times)/len(times)])

	with open('benchmark.csv', 'w', newline='') as file:
	    writer = csv.writer(file)
	    writer.writerows(output)