snewcomer/cal_time_pred_outlook.py

## cal_time_pred_outlook.py
import pandas as pd
import numpy as np
from datetime import datetime, time
from sklearn.linear_model import Lasso, Ridge, LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb

cal = pd.read_csv('emily_cal.csv', parse_dates=[['Start Date', 'Start Time'], ['Start Date', 'End Time']])

# clean summary
cal = cal.query("Subject not in ('Pick up Momo')")
cal['all_day'] = cal['All day event'].map({'FALSE': False, 'TRUE': True})
cal = cal.query("all_day == False")
cal = cal.drop_duplicates()
cal = cal[['Subject', 'Start Date_Start Time', 'Start Date_End Time']]
cal.rename(columns={'Start Date_Start Time': 'dtstart', 'Start Date_End Time': 'dtend'}, inplace=True)
cal['dtstart'] = pd.to_datetime(cal['dtstart'])
cal['dtend'] = pd.to_datetime(cal['dtend'])

# create unique column we can group and resample over
cal['DtCombined'] = cal['dtstart'].astype(str) + '_' + cal['dtend'].astype(str)
# still might have problems, drop Zulu
cal = cal.dropna()
cal['MeetingLength'] = cal['dtend'] - cal['dtstart']

cal['StartTime'] = cal['dtstart'].dt.time
cal['EndTime'] = cal['dtend'].dt.time
cal = cal.melt(id_vars=['Subject', 'dtstart', 'dtend', 'DtCombined', 'MeetingLength'], var_name='Start/End', value_name='TimeOfDay')
cal['Busy'] = 1
# conditional create column that we will ultimately resample
cal['DateTime'] = np.where(cal['Start/End'] == 'StartTime', cal['dtstart'], cal['dtend'])

cal.sort_values(by=['dtstart', 'TimeOfDay'], inplace=True)
cal.drop(['dtstart', 'dtend', 'Start/End'], axis=1, inplace=True)

cal = cal.groupby('DtCombined').apply(lambda x: x.drop_duplicates('DateTime').set_index('DateTime').resample('30Min').ffill()).reset_index('DtCombined', drop=True).reset_index()
cal['TimeOfDay'] = cal['DateTime'].dt.time
# resample drops non numeric columns
cal = cal.set_index('DateTime').resample('30Min').mean().reset_index()
cal['Busy'] = cal['Busy'].fillna(0)
cal['Weekday'] = cal['DateTime'].dt.weekday

# convert to int b/c models require it
cal = cal.set_index('DateTime')['2021-05-01' :'2023-01-08'].reset_index()
cal['Hour'] = cal['DateTime'].apply(lambda time: time.hour)
cal['Minute'] = cal['DateTime'].apply(lambda time: time.minute)
cal.set_index('DateTime', inplace=True)

# predict for each weekday
DateIndex = {
  0: 'Monday',
  1: 'Tuesday',
  2: 'Wednesday',
  3: 'Thursday',
  4: 'Friday',
}
for weekday in range(0, 5):
	print("{}".format(DateIndex[weekday]))

	group = cal.groupby('Weekday').get_group(weekday)

	features = group.drop(['Busy'], axis=1)
	target = group['Busy']

	X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3)

	xgb_model = xgb.XGBClassifier(n_estimators=1000, learning_rate=0.5, objective='reg:squarederror')
	xgb_model.fit(X_train, y_train)
	xgb_pred = xgb_model.predict(X_test)

	score = r2_score(y_test, xgb_pred)
	r_squared = xgb_model.score(X_test, y_test)
	rmse = mean_squared_error(y_test, xgb_pred, squared=False)
	print("XGB Score: {}".format(score))
	print("XGB R^2: {}".format(r_squared))
	print("XGB RMSE: {} \n".format(rmse))

	# low r2 score.  Best is 1.0
	# Linear Regression
	#lin_reg = LinearRegression()
	#lin_reg.fit(X_train, y_train)
	#y_pred = lin_reg.predict(X_test)
	#score = r2_score(y_test, y_pred)
	#r_squared = lin_reg.score(X_test, y_test)
	#rmse = mean_squared_error(y_test, y_pred, squared=False)
	#print("Linear Regression Score: {}".format(score))
	#print("Linear Regression R^2: {}".format(r_squared))
	#print("Linear Regression RMSE: {} \n".format(rmse))

	forest_model = RandomForestClassifier(n_estimators=100, min_samples_split=200, random_state=1)
	forest_model.fit(X_train, y_train)
	forest_pred = forest_model.predict(X_test)
	score = r2_score(y_test, forest_pred)
	r_squared = forest_model.score(X_test, y_test)
	rmse = mean_squared_error(y_test, forest_pred, squared=False)
	print("Random Forest Score: {}".format(score))
	print("Random Forest R^2: {}".format(r_squared))
	print("Random Forest RMSE: {} \n".format(rmse))

	knn_model = KNeighborsClassifier(n_neighbors=6)
	knn_model.fit(X_train, y_train)
	knn_pred = knn_model.predict(X_test)
	score = r2_score(y_test, knn_pred)
	r_squared = knn_model.score(X_test, y_test)
	rmse = mean_squared_error(y_test, knn_pred, squared=False)
	print("KNN Score: {}".format(score))
	print("KNN ^2: {}".format(r_squared))
	print("KNN RMSE: {} \n".format(rmse))

	# predict probability at each 30 min interval
	xgb_pred_prob = xgb_model.predict_proba(X_test)
	preds = xgb_pred_prob[:,1]
	preds = pd.DataFrame(preds, columns=['Busy'])
	preds['Probability'] = preds['Busy']
	preds['Busy'] = preds['Probability'].apply(lambda prob: 1 if prob > 0.5 else 0)
	preds['DateTime'] = pd.to_datetime(y_test.index)
	preds['Time'] = preds['DateTime'].dt.time#strftime("%H:%M:%S")

	preds = preds.drop_duplicates('Time')
	preds.sort_values(by=['Time'], inplace=True)
	preds.drop('DateTime', axis=1, inplace=True)
	preds = preds[preds['Time'] >= time(9)]
	preds = preds[preds['Time'] <= time(18)]
	preds.rename(columns={'Time': '{} Time'.format(DateIndex[weekday])}, inplace=True)
	preds = preds.set_index('{} Time'.format(DateIndex[weekday]))

	print("{}".format(preds))

	logreg = LogisticRegression()
	logreg.fit(X_train, y_train)
	logreg_pred = logreg.predict(X_test)
	logreg_pred_probs = logreg.predict_proba(X_test)[:, 1] # slice positive class
	# note LogisticRegression is not as good as predicting a chance model like KNN or XGB
	print("ROC AUC: {}".format(roc_auc_score(y_test, logreg_pred_probs)))

	# no indicative coef
	#lasso = Lasso(alpha=0.3)
	#Qlasso.fit(features, target)
	#lasso_coef = lasso.coef_
	#print("Lasso coef: {}".format(lasso_coef))

	print("XGB: Model performance?")
	print(confusion_matrix(y_test, xgb_pred))
	print(classification_report(y_test, xgb_pred))

	print("KNN: Model performance?")
	print(confusion_matrix(y_test, knn_pred))
	print(classification_report(y_test, knn_pred))

	#plt.bar(["Weekday", "Hour", "Minute"], lasso_coef)
	#plt.xticks(rotation=45)
	#plt.show()
	import pandas as pd
	import numpy as np
	from datetime import datetime, time
	from sklearn.linear_model import Lasso, Ridge, LogisticRegression
	from sklearn.metrics import classification_report, confusion_matrix
	import matplotlib.pyplot as plt

	from sklearn.linear_model import LinearRegression
	from sklearn.model_selection import train_test_split
	from sklearn.metrics import r2_score, mean_squared_error, roc_auc_score
	from sklearn.ensemble import RandomForestClassifier
	from sklearn.neighbors import KNeighborsClassifier
	import xgboost as xgb

	cal = pd.read_csv('emily_cal.csv', parse_dates=[['Start Date', 'Start Time'], ['Start Date', 'End Time']])

	# clean summary
	cal = cal.query("Subject not in ('Pick up Momo')")
	cal['all_day'] = cal['All day event'].map({'FALSE': False, 'TRUE': True})
	cal = cal.query("all_day == False")
	cal = cal.drop_duplicates()
	cal = cal[['Subject', 'Start Date_Start Time', 'Start Date_End Time']]
	cal.rename(columns={'Start Date_Start Time': 'dtstart', 'Start Date_End Time': 'dtend'}, inplace=True)
	cal['dtstart'] = pd.to_datetime(cal['dtstart'])
	cal['dtend'] = pd.to_datetime(cal['dtend'])

	# create unique column we can group and resample over
	cal['DtCombined'] = cal['dtstart'].astype(str) + '_' + cal['dtend'].astype(str)
	# still might have problems, drop Zulu
	cal = cal.dropna()
	cal['MeetingLength'] = cal['dtend'] - cal['dtstart']

	cal['StartTime'] = cal['dtstart'].dt.time
	cal['EndTime'] = cal['dtend'].dt.time
	cal = cal.melt(id_vars=['Subject', 'dtstart', 'dtend', 'DtCombined', 'MeetingLength'], var_name='Start/End', value_name='TimeOfDay')
	cal['Busy'] = 1
	# conditional create column that we will ultimately resample
	cal['DateTime'] = np.where(cal['Start/End'] == 'StartTime', cal['dtstart'], cal['dtend'])

	cal.sort_values(by=['dtstart', 'TimeOfDay'], inplace=True)
	cal.drop(['dtstart', 'dtend', 'Start/End'], axis=1, inplace=True)

	cal = cal.groupby('DtCombined').apply(lambda x: x.drop_duplicates('DateTime').set_index('DateTime').resample('30Min').ffill()).reset_index('DtCombined', drop=True).reset_index()
	cal['TimeOfDay'] = cal['DateTime'].dt.time
	# resample drops non numeric columns
	cal = cal.set_index('DateTime').resample('30Min').mean().reset_index()
	cal['Busy'] = cal['Busy'].fillna(0)
	cal['Weekday'] = cal['DateTime'].dt.weekday

	# convert to int b/c models require it
	cal = cal.set_index('DateTime')['2021-05-01' :'2023-01-08'].reset_index()
	cal['Hour'] = cal['DateTime'].apply(lambda time: time.hour)
	cal['Minute'] = cal['DateTime'].apply(lambda time: time.minute)
	cal.set_index('DateTime', inplace=True)

	# predict for each weekday
	DateIndex = {
	0: 'Monday',
	1: 'Tuesday',
	2: 'Wednesday',
	3: 'Thursday',
	4: 'Friday',
	}
	for weekday in range(0, 5):
	print("{}".format(DateIndex[weekday]))

	group = cal.groupby('Weekday').get_group(weekday)

	features = group.drop(['Busy'], axis=1)
	target = group['Busy']

	X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3)

	xgb_model = xgb.XGBClassifier(n_estimators=1000, learning_rate=0.5, objective='reg:squarederror')
	xgb_model.fit(X_train, y_train)
	xgb_pred = xgb_model.predict(X_test)

	score = r2_score(y_test, xgb_pred)
	r_squared = xgb_model.score(X_test, y_test)
	rmse = mean_squared_error(y_test, xgb_pred, squared=False)
	print("XGB Score: {}".format(score))
	print("XGB R^2: {}".format(r_squared))
	print("XGB RMSE: {} \n".format(rmse))

	# low r2 score. Best is 1.0
	# Linear Regression
	#lin_reg = LinearRegression()
	#lin_reg.fit(X_train, y_train)
	#y_pred = lin_reg.predict(X_test)
	#score = r2_score(y_test, y_pred)
	#r_squared = lin_reg.score(X_test, y_test)
	#rmse = mean_squared_error(y_test, y_pred, squared=False)
	#print("Linear Regression Score: {}".format(score))
	#print("Linear Regression R^2: {}".format(r_squared))
	#print("Linear Regression RMSE: {} \n".format(rmse))

	forest_model = RandomForestClassifier(n_estimators=100, min_samples_split=200, random_state=1)
	forest_model.fit(X_train, y_train)
	forest_pred = forest_model.predict(X_test)
	score = r2_score(y_test, forest_pred)
	r_squared = forest_model.score(X_test, y_test)
	rmse = mean_squared_error(y_test, forest_pred, squared=False)
	print("Random Forest Score: {}".format(score))
	print("Random Forest R^2: {}".format(r_squared))
	print("Random Forest RMSE: {} \n".format(rmse))

	knn_model = KNeighborsClassifier(n_neighbors=6)
	knn_model.fit(X_train, y_train)
	knn_pred = knn_model.predict(X_test)
	score = r2_score(y_test, knn_pred)
	r_squared = knn_model.score(X_test, y_test)
	rmse = mean_squared_error(y_test, knn_pred, squared=False)
	print("KNN Score: {}".format(score))
	print("KNN ^2: {}".format(r_squared))
	print("KNN RMSE: {} \n".format(rmse))

	# predict probability at each 30 min interval
	xgb_pred_prob = xgb_model.predict_proba(X_test)
	preds = xgb_pred_prob[:,1]
	preds = pd.DataFrame(preds, columns=['Busy'])
	preds['Probability'] = preds['Busy']
	preds['Busy'] = preds['Probability'].apply(lambda prob: 1 if prob > 0.5 else 0)
	preds['DateTime'] = pd.to_datetime(y_test.index)
	preds['Time'] = preds['DateTime'].dt.time#strftime("%H:%M:%S")

	preds = preds.drop_duplicates('Time')
	preds.sort_values(by=['Time'], inplace=True)
	preds.drop('DateTime', axis=1, inplace=True)
	preds = preds[preds['Time'] >= time(9)]
	preds = preds[preds['Time'] <= time(18)]
	preds.rename(columns={'Time': '{} Time'.format(DateIndex[weekday])}, inplace=True)
	preds = preds.set_index('{} Time'.format(DateIndex[weekday]))

	print("{}".format(preds))

	logreg = LogisticRegression()
	logreg.fit(X_train, y_train)
	logreg_pred = logreg.predict(X_test)
	logreg_pred_probs = logreg.predict_proba(X_test)[:, 1] # slice positive class
	# note LogisticRegression is not as good as predicting a chance model like KNN or XGB
	print("ROC AUC: {}".format(roc_auc_score(y_test, logreg_pred_probs)))

	# no indicative coef
	#lasso = Lasso(alpha=0.3)
	#Qlasso.fit(features, target)
	#lasso_coef = lasso.coef_
	#print("Lasso coef: {}".format(lasso_coef))

	print("XGB: Model performance?")
	print(confusion_matrix(y_test, xgb_pred))
	print(classification_report(y_test, xgb_pred))

	print("KNN: Model performance?")
	print(confusion_matrix(y_test, knn_pred))
	print(classification_report(y_test, knn_pred))

	#plt.bar(["Weekday", "Hour", "Minute"], lasso_coef)
	#plt.xticks(rotation=45)
	#plt.show()