# martinctc/test-python-rf-runtime.py

## test-python-rf-runtime.py
# data cleaning and utility
import numpy as np
import pandas as pd
import vivainsights as vi
import os

# timing code
import time
import random
import sys

# visualizations
import matplotlib as mpl
import matplotlib.pyplot as plt

# machine learning
from sklearn.ensemble import RandomForestClassifier # scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.inspection import permutation_importance

start_time = time.time()

# Load the dataset from the "data" folder inside the current working
# directory. os.path.join picks the right separator for the platform, so the
# script is not tied to Windows-style "\" paths.
raw_data = vi.import_query(
    os.path.join(os.getcwd(), "data", "Top_Performers_Dataset_v2.csv")
)

# Examine the data (these bare expressions only show output in an interactive
# session; they have no effect when the file runs as a script)
raw_data.head() # first 5 rows
len(raw_data)

# Create a list of n independent copies of raw_data
df_list = [raw_data.copy() for _ in range(100000)]

# Add randomization
def rand_col(column, iterator, round = False):
    """Return `column` shifted by one seeded Gaussian offset, made non-negative.

    A single value is drawn from N(0, 1), scaled by the column's standard
    deviation and added to every element, so the whole column moves by the
    same amount.

    Parameters
    ----------
    column : pandas.Series
        Numeric column to perturb; it must provide `.std()` and `.round()`.
    iterator : int
        Seed for the noise draw. The same `column` / `iterator` pair always
        produces the same result.
    round : bool, default False
        When True the shifted values are rounded. Any value that is neither
        True nor False prints an error message and the values are left
        unrounded.

    Returns
    -------
    pandas.Series
        Absolute value of the shifted (and optionally rounded) column.
    """
    # Seed a generator local to this call: random.seed() does not influence
    # NumPy draws, and a local generator leaves NumPy's global random state
    # (used elsewhere when no random_state is given) untouched.
    rng = np.random.default_rng(iterator)
    out = column + rng.normal(0, 1) * column.std()

    # Equality (not identity) is kept so 1 / 0 behave like True / False.
    if round == True:
        out = out.round()
    elif not (round == False):
        print("Error: round must be True or False")

    # No negatives
    out = abs(out)

    return out


# Define a function to apply to the columns
def modify_column(df, iterator):
    """Tag and perturb one copy of the dataset in place, then return it.

    `iterator` is appended to every `PersonId` (after an underscore) and is
    handed to `rand_col` for each metric column. The returned object is the
    same `df` that was passed in.
    """
    # Metric column -> whether its perturbed values are rounded
    rounding_by_column = {
        'Internal_network_size': True,
        'Collaboration_hours': False,
        'weekend_collaboration_hours': False,
        'After_hours_call_hours': False,
        'performance': True,
    }

    suffix = '_' + str(iterator)
    df['PersonId'] = df['PersonId'] + suffix

    # Columns are processed in the order listed above
    for name, do_round in rounding_by_column.items():
        df[name] = rand_col(df[name], iterator, round = do_round)

    return df

# Perturb every copy; its position in the list is both the PersonId suffix
# and the value passed on to rand_col
df_list = [modify_column(frame, idx) for idx, frame in enumerate(df_list)]

# row bind
combined_df = pd.concat(df_list, axis=0)

# One-row summary of the combined data. sys.getsizeof is measured once and
# reused for both size columns.
size_in_bytes = sys.getsizeof(combined_df)
sum_combined_df = pd.DataFrame(
    {
        "SizeInBytes": size_in_bytes,
        "SizeInMB": size_in_bytes / 1000000,
        "Rows": len(combined_df),
        "UniquePersonId": len(combined_df['PersonId'].unique()),
        # two columns are not counted - presumably PersonId and performance,
        # which raw_to_rf does not use as predictors
        "n_predictors": len(combined_df.columns) - 2,
    },
    index=[0],
)

# Get from raw data to random forest model and outputs
def raw_to_rf(df):
    """Fit a random forest on `df` and return test metrics and importances.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'PersonId' and 'performance'. 'PersonId' is dropped,
        'performance' is turned into the binary target (1 when >= 4, else 0),
        and every remaining column is used as a predictor.

    Returns
    -------
    model_stats : pandas.DataFrame
        One row with the columns accuracy, precision, recall and f1, computed
        on the held-out 30% (precision/recall/f1 use the weighted average).
    importance_df : pandas.DataFrame
        Columns 'Feature' and 'Importance', sorted by importance descending.
    """
    clean_data = df.drop(columns=['PersonId']) # drop PersonId - not required for fitting
    # Binary variable where >= 4 indicates High Performance
    clean_data['perform_cat'] = np.where(clean_data['performance'] >= 4, 1, 0)

    # Split outcome and predictors; 'performance' is removed from the
    # predictors because the outcome is derived from it
    outc_var_df = clean_data['perform_cat']
    pred_var_df = clean_data.drop(columns=['perform_cat', 'performance'])

    # 70/30 train/test split; no random_state is passed here
    x_train, x_test, y_train, y_test = train_test_split(pred_var_df, outc_var_df, test_size = 0.30)

    # Classifier constructed with its default arguments
    rf = RandomForestClassifier()
    rf.fit(x_train, y_train)

    # Predict the labels for the test set
    y_pred = rf.predict(x_test)

    # Only the metrics that are returned are computed; the confusion matrix
    # was never used, so it is no longer calculated.
    model_stats = pd.DataFrame(
        {
            "accuracy": accuracy_score(y_test, y_pred),
            "precision": precision_score(y_test, y_pred, average='weighted'),
            "recall": recall_score(y_test, y_pred, average='weighted'),
            "f1": f1_score(y_test, y_pred, average='weighted'),
        },
        index=[0],
    )

    # Pair each predictor name with its importance, most important first
    importance_df = pd.DataFrame(
        {'Feature': x_train.columns, 'Importance': rf.feature_importances_}
    ).sort_values('Importance', ascending=False)

    return model_stats, importance_df

# Fit on the combined data. The one-row summary frame has no 'PersonId' or
# 'performance' column, so passing it to raw_to_rf fails on the first drop.
model_stats, importance_df = raw_to_rf(combined_df)

# start_time was taken before the data was loaded, so this covers the whole
# script up to this point, not only the model fit
elapsed_time = time.time() - start_time

print(f"Elapsed time to compute RF: {elapsed_time:.3f} seconds")

# Add column to the summary data frame
sum_combined_df['elapsed_time'] = elapsed_time

# Copy `sum_combined_df` to clipboard
sum_combined_df.to_clipboard(index=False)
	# data cleaning and utility
	import numpy as np
	import pandas as pd
	import vivainsights as vi
	import os

	# timing code
	import time
	import random
	import sys

	# visualizations
	import matplotlib as mpl
	import matplotlib.pyplot as plt

	# machine learning
	from sklearn.ensemble import RandomForestClassifier # scikit-learn
	from sklearn.model_selection import train_test_split
	from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
	from sklearn.inspection import permutation_importance

	# NOTE(review): from the tab-indented imports onward the file repeats the
	# script above with a leading tab on every line - confirm whether this
	# second copy is meant to stay.
	start_time = time.time()

	# Load the dataset from the "data" folder inside the current working
	# directory. os.path.join picks the right separator for the platform, so
	# the script is not tied to Windows-style "\" paths.
	raw_data = vi.import_query(
	    os.path.join(os.getcwd(), "data", "Top_Performers_Dataset_v2.csv")
	)

	# Examine the data (these bare expressions only show output in an
	# interactive session; they have no effect when the file runs as a script)
	raw_data.head() # first 5 rows
	len(raw_data)

	# Create a list of n independent copies of raw_data
	df_list = [raw_data.copy() for _ in range(100000)]

	# Add randomization
	def rand_col(column, iterator, round = False):
	    """Return `column` shifted by one seeded Gaussian offset, made non-negative.

	    A single value is drawn from N(0, 1), scaled by the column's standard
	    deviation and added to every element, so the whole column moves by the
	    same amount.

	    Parameters
	    ----------
	    column : pandas.Series
	        Numeric column to perturb; it must provide `.std()` and `.round()`.
	    iterator : int
	        Seed for the noise draw. The same `column` / `iterator` pair always
	        produces the same result.
	    round : bool, default False
	        When True the shifted values are rounded. Any value that is neither
	        True nor False prints an error message and the values are left
	        unrounded.

	    Returns
	    -------
	    pandas.Series
	        Absolute value of the shifted (and optionally rounded) column.
	    """
	    # Seed a generator local to this call: random.seed() does not influence
	    # NumPy draws, and a local generator leaves NumPy's global random state
	    # (used elsewhere when no random_state is given) untouched.
	    rng = np.random.default_rng(iterator)
	    out = column + rng.normal(0, 1) * column.std()

	    # Equality (not identity) is kept so 1 / 0 behave like True / False.
	    if round == True:
	        out = out.round()
	    elif not (round == False):
	        print("Error: round must be True or False")

	    # No negatives
	    out = abs(out)

	    return out


	# Define a function to apply to the columns
	def modify_column(df, iterator):
	    """Tag and perturb one copy of the dataset in place, then return it.

	    `iterator` is appended to every `PersonId` (after an underscore) and is
	    handed to `rand_col` for each metric column. The returned object is the
	    same `df` that was passed in.
	    """
	    # Metric column -> whether its perturbed values are rounded
	    rounding_by_column = {
	        'Internal_network_size': True,
	        'Collaboration_hours': False,
	        'weekend_collaboration_hours': False,
	        'After_hours_call_hours': False,
	        'performance': True,
	    }

	    suffix = '_' + str(iterator)
	    df['PersonId'] = df['PersonId'] + suffix

	    # Columns are processed in the order listed above
	    for name, do_round in rounding_by_column.items():
	        df[name] = rand_col(df[name], iterator, round = do_round)

	    return df

	# Perturb every copy; its position in the list is both the PersonId suffix
	# and the value passed on to rand_col
	df_list = [modify_column(frame, idx) for idx, frame in enumerate(df_list)]

	# row bind
	combined_df = pd.concat(df_list, axis=0)

	# One-row summary of the combined data. sys.getsizeof is measured once and
	# reused for both size columns.
	size_in_bytes = sys.getsizeof(combined_df)
	sum_combined_df = pd.DataFrame(
	    {
	        "SizeInBytes": size_in_bytes,
	        "SizeInMB": size_in_bytes / 1000000,
	        "Rows": len(combined_df),
	        "UniquePersonId": len(combined_df['PersonId'].unique()),
	        # two columns are not counted - presumably PersonId and
	        # performance, which raw_to_rf does not use as predictors
	        "n_predictors": len(combined_df.columns) - 2,
	    },
	    index=[0],
	)

	# Get from raw data to random forest model and outputs
	def raw_to_rf(df):
	    """Fit a random forest on `df` and return test metrics and importances.

	    Parameters
	    ----------
	    df : pandas.DataFrame
	        Must contain 'PersonId' and 'performance'. 'PersonId' is dropped,
	        'performance' is turned into the binary target (1 when >= 4, else
	        0), and every remaining column is used as a predictor.

	    Returns
	    -------
	    model_stats : pandas.DataFrame
	        One row with the columns accuracy, precision, recall and f1,
	        computed on the held-out 30% (precision/recall/f1 use the weighted
	        average).
	    importance_df : pandas.DataFrame
	        Columns 'Feature' and 'Importance', sorted by importance descending.
	    """
	    clean_data = df.drop(columns=['PersonId']) # drop PersonId - not required for fitting
	    # Binary variable where >= 4 indicates High Performance
	    clean_data['perform_cat'] = np.where(clean_data['performance'] >= 4, 1, 0)

	    # Split outcome and predictors; 'performance' is removed from the
	    # predictors because the outcome is derived from it
	    outc_var_df = clean_data['perform_cat']
	    pred_var_df = clean_data.drop(columns=['perform_cat', 'performance'])

	    # 70/30 train/test split; no random_state is passed here
	    x_train, x_test, y_train, y_test = train_test_split(pred_var_df, outc_var_df, test_size = 0.30)

	    # Classifier constructed with its default arguments
	    rf = RandomForestClassifier()
	    rf.fit(x_train, y_train)

	    # Predict the labels for the test set
	    y_pred = rf.predict(x_test)

	    # Only the metrics that are returned are computed; the confusion matrix
	    # was never used, so it is no longer calculated.
	    model_stats = pd.DataFrame(
	        {
	            "accuracy": accuracy_score(y_test, y_pred),
	            "precision": precision_score(y_test, y_pred, average='weighted'),
	            "recall": recall_score(y_test, y_pred, average='weighted'),
	            "f1": f1_score(y_test, y_pred, average='weighted'),
	        },
	        index=[0],
	    )

	    # Pair each predictor name with its importance, most important first
	    importance_df = pd.DataFrame(
	        {'Feature': x_train.columns, 'Importance': rf.feature_importances_}
	    ).sort_values('Importance', ascending=False)

	    return model_stats, importance_df

	# Fit on the combined data. The one-row summary frame has no 'PersonId' or
	# 'performance' column, so passing it to raw_to_rf fails on the first drop.
	model_stats, importance_df = raw_to_rf(combined_df)

	# start_time was taken before the data was loaded, so this covers the whole
	# script up to this point, not only the model fit
	elapsed_time = time.time() - start_time

	print(f"Elapsed time to compute RF: {elapsed_time:.3f} seconds")

	# Add column to the summary data frame
	sum_combined_df['elapsed_time'] = elapsed_time

	# Copy `sum_combined_df` to clipboard
	sum_combined_df.to_clipboard(index=False)