Last active
January 16, 2024 14:14
-
-
Save ispanos/dc2ab9e00d53cdf6c2751be986a6469d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Generated by ChatGPT 3.5.
# Prompt:
#   Help me create a dataset with random values and use it to create a
#   benchmark for statsmodels.api.Logit.fit().
#   Use timeit to measure how long it takes to run.
#   Create 40 X's and make it 1000000 observations (on the second prompt).
import numpy as np | |
import pandas as pd | |
import statsmodels.api as sm | |
import timeit | |
import csv | |
def generate_dataset(
    num_features: int = 40,
    num_observations: int = 1000000,
    seed: int = 1,
) -> pd.DataFrame:
    """Build a reproducible synthetic dataset with a binary response.

    Features are drawn uniformly from [0, 1). A latent score is computed as
    ``0.5 + X @ [1, 2, ..., num_features] + noise`` with Gaussian noise of
    standard deviation 0.1, and the response is 1 where that score exceeds
    its median and 0 otherwise.

    Args:
        num_features: Number of predictor columns (named ``x1`` .. ``xN``).
        num_observations: Number of rows to generate.
        seed: Seed for the random generator; equal seeds give equal data.

    Returns:
        A DataFrame with the feature columns followed by an integer ``y``
        column holding the 0/1 labels.
    """
    # A private legacy RandomState yields the same stream that
    # ``np.random.seed(seed)`` followed by ``np.random.*`` calls would, but
    # without reseeding the process-wide generator as a side effect.
    rng = np.random.RandomState(seed)

    # Draw order (features first, then noise) is part of reproducibility.
    X = rng.uniform(low=0, high=1, size=(num_observations, num_features))
    coefficients = np.arange(1, num_features + 1)
    y_intercept = 0.5
    noise = rng.normal(loc=0, scale=0.1, size=num_observations)
    score = y_intercept + np.dot(X, coefficients) + noise

    # Convert to a binary outcome by thresholding at the median score.
    y = (score > np.median(score)).astype(int)

    columns = [f'x{i + 1}' for i in range(num_features)]
    df = pd.DataFrame(X, columns=columns)
    df['y'] = y
    return df
# Function to benchmark statsmodels.api.Logit.fit()
def benchmark_logit_fit(data: pd.DataFrame) -> float:
    """Time a single ``Logit.fit()`` call on ``data``.

    The design matrix (all columns except ``y``, plus a constant term) and
    the model object are built before timing starts, so only the fit itself
    is measured.

    Args:
        data: DataFrame holding the predictors and a response column
            named ``y``.

    Returns:
        Seconds taken by one call to ``fit()``, as reported by ``timeit``.
    """
    # Select predictors by name rather than by position so the response is
    # excluded (and no feature is dropped) regardless of column order.
    X = sm.add_constant(data.drop(columns='y'))
    y = data['y']
    model = sm.Logit(y, X)

    # Run it only once: a single fit on a large dataset is already slow.
    return timeit.timeit(model.fit, number=1)
# Benchmark driver: time one Logit fit per seed and save the results.
times = []
output = [['seed', 'time']]

for s in range(200):
    dataset = generate_dataset(
        num_features=20, num_observations=1000000, seed=s)

    # Benchmarking
    time_taken = benchmark_logit_fit(dataset)

    # Print the results
    print(f"{10*'='} DATASET {s}")
    print(f"Time taken to fit the model: {time_taken:.4f} seconds")

    times.append(time_taken)
    output.append([s, time_taken])

# The final row holds the mean fit time across all seeds.
output.append(['mean', sum(times) / len(times)])

# newline='' leaves line-ending handling to the csv module.
with open('benchmark.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(output)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment