Skip to content

Instantly share code, notes, and snippets.

@martinctc
Last active January 15, 2024 14:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save martinctc/ca588513d9db62aedcaa603bfd156277 to your computer and use it in GitHub Desktop.
Save martinctc/ca588513d9db62aedcaa603bfd156277 to your computer and use it in GitHub Desktop.
Test run speeds for RF model in Python including simulation
# data cleaning and utility
import numpy as np
import pandas as pd
import vivainsights as vi
import os
# timing code
import time
import random
import sys
# visualizations
import matplotlib as mpl
import matplotlib.pyplot as plt
# machine learning
from sklearn.ensemble import RandomForestClassifier # scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.inspection import permutation_importance
# Start the wall-clock timer. The elapsed time printed at the end of the
# script is measured from here, so it covers data loading and the simulation
# as well as the model fit.
start_time = time.time()
# Build the path to the dataset inside the `data` folder of the current
# working directory. os.path.join uses the platform's own separator; the
# hard-coded "\\" separators only resolved on Windows.
raw_data = vi.import_query(
    os.path.join(os.getcwd(), "data", "Top_Performers_Dataset_v2.csv")
)
# Examine the data (bare expressions like these only display a result in an
# interactive session such as a notebook or REPL)
raw_data.head() # first 5 rows
len(raw_data)
# Create a list of n copies of raw_data; each copy is perturbed separately
# further down before being stacked into one large frame
df_list = [raw_data.copy() for _ in range(100000)]
# Add randomization
def rand_col(column, iterator, round = False):
    """Return `column` shifted by seeded random noise, as absolute values.

    One value is drawn from a normal distribution with mean 0 and standard
    deviation 1, scaled by `column.std()`, and added to every element of
    `column`. The draw comes from a NumPy generator seeded with `iterator`,
    so the same `column` and `iterator` always produce the same result.

    Parameters
    ----------
    column
        Numeric object supporting `+`, `.std()`, `.round()` and `abs()`
        (the callers in this file pass DataFrame columns).
    iterator
        Non-negative integer used as the random seed.
    round
        If True, round the shifted values before taking absolute values.
        If False, leave them unrounded. Any other value prints an error
        message and the values are left unrounded.

    Returns
    -------
    The shifted column with negatives flipped to positive via `abs()`.
    """
    # Seed the generator that actually produces the noise: seeding the
    # stdlib `random` module has no effect on NumPy draws.
    rng = np.random.default_rng(iterator)
    out = column + rng.normal(0, 1) * column.std()
    # `==` (rather than `is`) keeps accepting 1/0 and NumPy booleans.
    if round == True:
        out = out.round()
    elif round != False:
        print("Error: round must be True or False")
    # No negatives
    return abs(out)
# Define a function to apply to the columns
def modify_column(df, iterator):
    """Perturb one copy of the data set and tag its rows with `iterator`.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    df
        DataFrame containing `PersonId` and the five metric columns
        listed below.
    iterator
        Value appended to every `PersonId` (after an underscore) and
        forwarded to `rand_col` for each metric column.

    Returns
    -------
    The same `df`, with `PersonId` suffixed and the metric columns replaced
    by their `rand_col` output.
    """
    # Make the ids of this copy distinguishable from every other copy
    id_suffix = '_' + str(iterator)
    df['PersonId'] = df['PersonId'] + id_suffix

    # (column name, whether the perturbed values are rounded), in the order
    # the columns are processed
    perturbation_plan = (
        ('Internal_network_size', True),
        ('Collaboration_hours', False),
        ('weekend_collaboration_hours', False),
        ('After_hours_call_hours', False),
        ('performance', True),
    )
    for col_name, should_round in perturbation_plan:
        df[col_name] = rand_col(df[col_name], iterator, round = should_round)
    return df
# Apply the function to a column in each DataFrame in the list
# Perturb every copy, using its position in the list as both the PersonId
# suffix and the random seed
for i, df in enumerate(df_list):
    df_list[i] = modify_column(df, i)
# row bind
combined_df = pd.concat(df_list, axis=0)
# Measure the frame once and reuse the figure: the frame is large, and
# nothing changes between the two places the size is reported.
combined_size_bytes = sys.getsizeof(combined_df)
# One-row summary describing the simulated data set
sum_combined_df = pd.DataFrame(
    {
        "SizeInBytes": combined_size_bytes,
        "SizeInMB": combined_size_bytes / 1000000,
        "Rows": len(combined_df),
        "UniquePersonId": len(combined_df['PersonId'].unique()),
        # Every column except PersonId and performance, which raw_to_rf
        # removes before fitting
        "n_predictors": len(combined_df.columns) - 2,
    }, index=[0]
)
# Get from raw data to random forest model and outputs
def raw_to_rf(df, test_size = 0.30, random_state = None):
    """Fit a random forest on `df` and report test metrics and importances.

    Parameters
    ----------
    df
        DataFrame with a `PersonId` column, a numeric `performance` column,
        and predictor columns. Every column other than `PersonId` and
        `performance` is used as a predictor.
    test_size
        Share of rows held out for evaluation; passed to
        `train_test_split`. Defaults to 0.30.
    random_state
        Seed passed to both `train_test_split` and
        `RandomForestClassifier`. The default of None leaves both
        unseeded, so results vary from run to run; pass an int to make a
        run reproducible.

    Returns
    -------
    tuple
        `(model_stats, importance_df)`: a one-row DataFrame with columns
        `accuracy`, `precision`, `recall` and `f1` computed on the held-out
        rows, and a DataFrame with columns `Feature` and `Importance`
        sorted by descending importance.
    """
    clean_data = df.drop(columns=['PersonId']) # drop PersonId - not required for fitting
    # Binary variable where >= 4 indicates High Performance
    clean_data['perform_cat'] = np.where(clean_data['performance'] >= 4, 1, 0)

    # Split train and test data. `performance` is dropped from the
    # predictors because the outcome is derived directly from it.
    outc_var_df = clean_data['perform_cat']
    pred_var_df = clean_data.drop(columns=['perform_cat', 'performance'])
    x_train, x_test, y_train, y_test = train_test_split(
        pred_var_df, outc_var_df, test_size = test_size, random_state = random_state
    )

    rf = RandomForestClassifier(random_state = random_state)
    rf.fit(x_train, y_train)

    # Predict the labels for the test set
    y_pred = rf.predict(x_test)

    # Calculate metrics on the held-out rows
    model_stats = pd.DataFrame(
        {
            "accuracy": accuracy_score(y_test, y_pred),
            "precision": precision_score(y_test, y_pred, average='weighted'),
            "recall": recall_score(y_test, y_pred, average='weighted'),
            "f1": f1_score(y_test, y_pred, average='weighted'),
        }, index=[0]
    )

    # Pair each feature name with its importance, most important first
    importance_df = pd.DataFrame(
        {'Feature': x_train.columns, 'Importance': rf.feature_importances_}
    )
    importance_df = importance_df.sort_values('Importance', ascending=False)
    return model_stats, importance_df
# Fit the model on the simulated data set. The one-row summary frame
# (`sum_combined_df`) has no PersonId or performance column, so passing it
# here would fail inside raw_to_rf.
model_stats, importance_df = raw_to_rf(combined_df)
elapsed_time = time.time() - start_time
print(f"Elapsed time to compute RF: {elapsed_time:.3f} seconds")
# Add column to the summary data frame
sum_combined_df['elapsed_time'] = elapsed_time
# Copy `sum_combined_df` to clipboard
sum_combined_df.to_clipboard(index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment