Skip to content

Instantly share code, notes, and snippets.

@ispanos
Last active January 16, 2024 14:14
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ispanos/dc2ab9e00d53cdf6c2751be986a6469d to your computer and use it in GitHub Desktop.
Save ispanos/dc2ab9e00d53cdf6c2751be986a6469d to your computer and use it in GitHub Desktop.
# Generated by ChatGPT 3.5
# Prompt
# help me create a dataset with random values and use it to create a benchmark for statsmodels.api.Logit.fit().
# use timeit to measure how long it takes to run
# create 40 X's and make it 1000000 observations (on the second prompt)
import numpy as np
import pandas as pd
import statsmodels.api as sm
import timeit
import csv
def generate_dataset(num_features=40, num_observations=1000000, seed=1):
    """Build a reproducible synthetic dataset with a binary response.

    Features are drawn uniformly from [0, 1). A linear score is formed with
    coefficients 1..num_features, an intercept of 0.5 and Gaussian noise
    (standard deviation 0.1); the response is 1 where that score exceeds its
    median and 0 otherwise.

    Args:
        num_features: Number of predictor columns (named ``x1`` .. ``xN``).
        num_observations: Number of rows to generate.
        seed: Seed for the random number generator.

    Returns:
        A ``pandas.DataFrame`` holding the predictor columns followed by the
        integer response column ``y``.
    """
    # A private RandomState produces the same draws as np.random.seed(seed)
    # followed by the module-level functions, but leaves NumPy's global
    # generator untouched for the rest of the program.
    rng = np.random.RandomState(seed)

    X = rng.uniform(low=0, high=1, size=(num_observations, num_features))
    coefficients = np.arange(1, num_features + 1)
    y_intercept = 0.5
    noise = rng.normal(loc=0, scale=0.1, size=num_observations)
    score = y_intercept + np.dot(X, coefficients) + noise

    # Thresholding at the median converts the continuous score to a binary
    # outcome.
    y = (score > np.median(score)).astype(int)

    columns = [f'x{i + 1}' for i in range(num_features)]
    df = pd.DataFrame(X, columns=columns)
    df['y'] = y
    return df
def benchmark_logit_fit(data):
    """Time a single ``statsmodels.api.Logit.fit()`` call on *data*.

    Args:
        data: DataFrame with the binary response in a column named ``'y'``;
            every other column is used as a predictor.

    Returns:
        Seconds taken by one ``fit()`` call, as reported by ``timeit``.
    """
    # Pick predictors by name rather than by position so the response column
    # cannot leak into the design matrix if 'y' is not the last column.
    X = sm.add_constant(data.drop(columns='y'))
    y = data['y']
    model = sm.Logit(y, X)

    # Model construction stays outside the timed call; only fit() is measured.
    # A single run is used because each fit on a large dataset is slow.
    return timeit.timeit(model.fit, number=1)
# Benchmark driver: fit one model per seed, report each timing, and save all
# timings plus their mean to a CSV file.
NUM_RUNS = 200
NUM_FEATURES = 20
NUM_OBSERVATIONS = 1000000
OUTPUT_PATH = 'benchmark.csv'

times = []
output = [['seed', 'time']]

for seed in range(NUM_RUNS):
    # A fresh dataset per seed so every run fits different data.
    dataset = generate_dataset(
        num_features=NUM_FEATURES,
        num_observations=NUM_OBSERVATIONS,
        seed=seed,
    )
    time_taken = benchmark_logit_fit(dataset)

    print(f"{10*'='} DATASET {seed}")
    print(f"Time taken to fit the model: {time_taken:.4f} seconds")

    times.append(time_taken)
    output.append([seed, time_taken])

# Final row holds the average fit time across all runs.
output.append(['mean', sum(times) / len(times)])

# newline='' lets the csv module control line endings itself.
with open(OUTPUT_PATH, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(output)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment