Last active
January 15, 2024 14:20
-
-
Save martinctc/ca588513d9db62aedcaa603bfd156277 to your computer and use it in GitHub Desktop.
Test run speeds for RF model in Python including simulation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# data cleaning and utility | |
import numpy as np | |
import pandas as pd | |
import vivainsights as vi | |
import os | |
# timing code | |
import time | |
import random | |
import sys | |
# visualizations | |
import matplotlib as mpl | |
import matplotlib.pyplot as plt | |
# machine learning | |
from sklearn.ensemble import RandomForestClassifier # scikit-learn | |
from sklearn.model_selection import train_test_split | |
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix | |
from sklearn.inspection import permutation_importance | |
# Wall-clock start for the whole script run (data load, simulation and model fit)
start_time = time.time()

# Number of copies of the raw data used to build the simulated dataset
N_COPIES = 100000

# Load the dataset from the "data" folder under the current working directory.
# os.path.join picks the right separator, so the path also resolves outside Windows.
raw_data = vi.import_query(os.path.join(os.getcwd(), "data", "Top_Performers_Dataset_v2.csv"))

# Examine the data (these expressions only display output in an interactive session)
raw_data.head()  # first 5 rows
len(raw_data)

# Create a list of N_COPIES independent copies of raw_data
df_list = [raw_data.copy() for _ in range(N_COPIES)]
# Add randomization | |
def rand_col(column, iterator, round=False):
    """Shift a numeric column by one seeded random offset and return absolute values.

    A single standard-normal draw, scaled by the column's standard deviation,
    is added to every element, so the whole column moves by the same amount.

    Args:
        column: pandas Series of numeric values.
        iterator: Non-negative integer used as the random seed; the same seed
            always yields the same offset.
        round: If True, round the shifted values to the nearest integer.
            (The name shadows the builtin but is kept for keyword callers.)

    Returns:
        The shifted column with every value made non-negative.

    Raises:
        ValueError: If ``round`` is neither True nor False.
    """
    if round not in (True, False):
        raise ValueError("Error: round must be True or False")

    # Seed a NumPy generator local to this call. Seeding the stdlib `random`
    # module has no effect on NumPy draws, so the result would not be
    # reproducible for a given iterator.
    rng = np.random.default_rng(iterator)
    out = column + rng.normal(0, 1) * column.std()

    if round:
        out = out.round()

    # No negatives
    return abs(out)
# Define a function to apply to the columns | |
def modify_column(df, iterator):
    """Tag PersonId with the iteration number and perturb the simulated columns.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame containing ``PersonId`` and the metric columns listed below.
        iterator: Iteration index; appended to ``PersonId`` and forwarded to
            ``rand_col``.

    Returns:
        The same DataFrame object, after modification.
    """
    suffix = '_' + str(iterator)
    df['PersonId'] = df['PersonId'] + suffix

    # (column name, round to whole numbers?) in the order they are perturbed
    perturbations = (
        ('Internal_network_size', True),
        ('Collaboration_hours', False),
        ('weekend_collaboration_hours', False),
        ('After_hours_call_hours', False),
        ('performance', True),
    )
    for col_name, as_whole_number in perturbations:
        df[col_name] = rand_col(df[col_name], iterator, round=as_whole_number)

    return df
# Apply the function to a column in each DataFrame in the list | |
# Perturb every copy, using its position in the list as the iteration index
for i, df in enumerate(df_list):
    df_list[i] = modify_column(df, i)

# row bind
combined_df = pd.concat(df_list, axis=0)

# Measure the frame once and reuse the value, rather than sizing the large
# combined frame twice.
size_in_bytes = sys.getsizeof(combined_df)

# One-row summary of the simulated dataset
sum_combined_df = pd.DataFrame(
    {
        "SizeInBytes": size_in_bytes,
        "SizeInMB": size_in_bytes / 1000000,
        "Rows": len(combined_df),
        "UniquePersonId": len(combined_df['PersonId'].unique()),
        # presumably excludes PersonId and performance, which raw_to_rf
        # leaves out of the predictors
        "n_predictors": len(combined_df.columns) - 2,
    },
    index=[0],
)
# Get from raw data to random forest model and outputs | |
def raw_to_rf(df, test_size=0.30, random_state=None):
    """Fit a random forest classifier on ``df`` and report metrics and importances.

    The outcome is a binary flag that is 1 where ``performance`` >= 4 and 0
    otherwise. Every column other than ``PersonId`` and ``performance`` is
    used as a predictor.

    Args:
        df: DataFrame with ``PersonId``, ``performance`` and predictor columns.
        test_size: Fraction of rows held out for evaluation.
        random_state: Optional seed passed to both the train/test split and
            the forest. None (the default) leaves both unseeded.

    Returns:
        Tuple ``(model_stats, importance_df)``. ``model_stats`` is a one-row
        DataFrame with accuracy, precision, recall and f1 (the latter three
        weighted-averaged). ``importance_df`` lists each feature with its
        importance, sorted from most to least important.
    """
    clean_data = df.drop(columns=['PersonId'])  # drop PersonId - not required for fitting

    # Binary variable where >= 4 indicates High Performance
    clean_data['perform_cat'] = np.where(clean_data['performance'] >= 4, 1, 0)

    # Split train and test data
    outc_var_df = clean_data['perform_cat']
    pred_var_df = clean_data.drop(columns=['perform_cat', 'performance'])
    x_train, x_test, y_train, y_test = train_test_split(
        pred_var_df, outc_var_df, test_size=test_size, random_state=random_state
    )

    rf = RandomForestClassifier(random_state=random_state)
    rf.fit(x_train, y_train)

    # Predict the labels for the test set
    y_pred = rf.predict(x_test)

    # Calculate metrics
    model_stats = pd.DataFrame(
        {
            "accuracy": accuracy_score(y_test, y_pred),
            "precision": precision_score(y_test, y_pred, average='weighted'),
            "recall": recall_score(y_test, y_pred, average='weighted'),
            "f1": f1_score(y_test, y_pred, average='weighted'),
        },
        index=[0],
    )

    # Pair each feature name with its importance, most important first
    importance_df = pd.DataFrame(
        {'Feature': x_train.columns, 'Importance': rf.feature_importances_}
    ).sort_values('Importance', ascending=False)

    return model_stats, importance_df
# Fit on the simulated data itself. The one-row summary frame has no PersonId
# or performance column, so passing it to raw_to_rf fails on the column drop.
model_stats, importance_df = raw_to_rf(combined_df)

# Elapsed time is measured from start_time, set at the top of the script
elapsed_time = time.time() - start_time
print(f"Elapsed time to compute RF: {elapsed_time:.3f} seconds")

# Add column to the summary data frame
sum_combined_df['elapsed_time'] = elapsed_time

# Copy `sum_combined_df` to clipboard
sum_combined_df.to_clipboard(index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment