snewcomer/cal_time_pred.py

## cal_time_pred.py
import pandas as pd
import numpy as np
from datetime import datetime, time

#import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

cal = pd.read_csv('cal.csv', index_col=False)

# clean summary
cal = cal.query("summary not in ('Mid-week Meditation', 'Coffee Chat: Meet our New Hires!', 'PED Lunch & Learn Hold', 'AskPeople')")
cal = cal[cal['summary'].str.contains("Company Holiday") == False]
cal = cal[cal['summary'].str.contains("L&L") == False]
cal = cal.drop_duplicates()

# clean dtstart
parsed = pd.to_datetime(cal["dtstart"], errors="coerce").fillna(pd.to_datetime(cal["dtstart"], format="%Y-%d-%m", errors="coerce"))
ordinal = pd.to_numeric(cal["dtstart"], errors="coerce").apply(lambda x: pd.Timestamp("1899-12-30") + pd.Timedelta(x, unit="D"))
parsed_e = pd.to_datetime(cal["dtend"], errors="coerce").fillna(pd.to_datetime(cal["dtend"], format="%Y-%d-%m", errors="coerce"))
ordinal_e = pd.to_numeric(cal["dtend"], errors="coerce").apply(lambda x: pd.Timestamp("1899-12-30")+pd.Timedelta(x, unit="D"))

cal['dtstart'] = parsed.fillna(ordinal)
cal['dtend'] = parsed_e.fillna(ordinal_e)
# create unique column we can group and resample over
cal['DtCombined'] = cal['dtstart'].astype(str) + '_' + cal['dtend'].astype(str)
# still might have problems, drop Zulu
cal = cal.dropna()
cal['dtstart'] = cal['dtstart'].apply(lambda x: datetime.replace(x, tzinfo=None))
cal['dtend'] = cal['dtend'].apply(lambda x: datetime.replace(x, tzinfo=None))
cal['MeetingLength'] = cal['dtend'] - cal['dtstart']

cal['Date'] = cal['dtstart'].dt.date
cal['Date'] = pd.to_datetime(cal['Date'], errors="coerce")
cal['StartTime'] = cal['dtstart'].dt.time
cal['EndTime'] = cal['dtend'].dt.time

cal = cal.melt(id_vars=['summary', 'Date', 'dtstart', 'dtend', 'DtCombined', 'MeetingLength'], var_name='Start/End', value_name='TimeOfDay')
cal['Busy'] = 1
# conditional create column that we will ultimately resample
cal['DateTime'] = np.where(cal['Start/End'] == 'StartTime', cal['dtstart'], cal['dtend'])

cal.sort_values(by=['dtstart', 'TimeOfDay'], inplace=True)
cal.drop(['dtstart', 'dtend', 'Start/End'], axis=1, inplace=True)

cal = cal.groupby('DtCombined').apply(lambda x: x.drop_duplicates('DateTime').set_index('DateTime').resample('30Min').ffill()).reset_index('DtCombined', drop=True).reset_index()
cal['TimeOfDay'] = cal['DateTime'].dt.time
# resample drops non numeric columns
cal = cal.set_index('DateTime').resample('30Min').mean().reset_index()
cal['Busy'] = cal['Busy'].fillna(0)
cal['Weekday'] = cal['DateTime'].dt.weekday

# convert to int b/c models require it
#cal['Day'] = cal['DateTime'].apply(lambda time: time.day)
cal = cal.set_index('DateTime')['2021-05-01' :'2023-01-08'].reset_index()
cal['Hour'] = cal['DateTime'].apply(lambda time: time.hour)
cal['Minute'] = cal['DateTime'].apply(lambda time: time.minute)
#cal['Hour:Minute'] = cal['Hour'].astype(str) + ':' + cal['Minute'].astype(str)
cal.set_index('DateTime', inplace=True)

# predict for each weekday
for weekday in range(0, 6):
	print("Weekday {} \n".format(weekday))

	group = cal.groupby('Weekday').get_group(weekday)

	features = group.drop(['Busy'], axis=1)
	target = group['Busy']

	X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3)

	xgb_model = xgb.XGBClassifier(n_estimators=1000, learning_rate=0.5, objective='reg:squarederror')
	xgb_model.fit(X_train, y_train)
	xgb_pred = xgb_model.predict(X_test)

	score = r2_score(y_test, xgb_pred)
	r_squared = xgb_model.score(X_test, y_test)
	rmse = mean_squared_error(y_test, xgb_pred, squared=False)
	print("XGB Score: {}".format(score))
	print("XGB R^2: {}".format(r_squared))
	print("XGB RMSE: {} \n".format(rmse))

	# low r2 score.  Best is 1.0
	# Linear Regression
	#lin_reg = LinearRegression()
	#lin_reg.fit(X_train, y_train)
	#y_pred = lin_reg.predict(X_test)
	#score = r2_score(y_test, y_pred)
	#r_squared = lin_reg.score(X_test, y_test)
	#rmse = mean_squared_error(y_test, y_pred, squared=False)
	#print("Linear Regression Score: {}".format(score))
	#print("Linear Regression R^2: {}".format(r_squared))
	#print("Linear Regression RMSE: {} \n".format(rmse))

	forest_model = RandomForestClassifier(n_estimators=100, min_samples_split=200, random_state=1)
	forest_model.fit(X_train, y_train)
	forest_pred = forest_model.predict(X_test)
	score = r2_score(y_test, forest_pred)
	r_squared = forest_model.score(X_test, y_test)
	rmse = mean_squared_error(y_test, forest_pred, squared=False)
	print("Random Forest Score: {}".format(score))
	print("Random Forest R^2: {}".format(r_squared))
	print("Random Forest RMSE: {} \n".format(rmse))

	# predict probability
	xgb_pred_prob = xgb_model.predict_proba(X_test)
	preds = xgb_pred_prob[:,1]
	preds = pd.DataFrame(preds, columns=['Busy'])
	preds['Probability'] = preds['Busy']
	preds['Busy'] = preds['Probability'].apply(lambda prob: 1 if prob > 0.4 else 0)
	preds['DateTime'] = pd.to_datetime(y_test.index)
	preds['Time'] = preds['DateTime'].dt.time#strftime("%H:%M:%S")

	preds = preds.drop_duplicates('Time')
	preds.sort_values(by=['Time'], inplace=True)
	preds.drop('DateTime', axis=1, inplace=True)
	preds = preds[preds['Time'] > time(9)]
	preds = preds[preds['Time'] < time(18)]
	print("Weekday \n {}: {}".format(weekday, preds))
	import pandas as pd
	import numpy as np
	from datetime import datetime, time

	#import matplotlib.pyplot as plt

	from sklearn.linear_model import LinearRegression
	from sklearn.model_selection import train_test_split
	from sklearn.metrics import r2_score, mean_squared_error
	from sklearn.ensemble import RandomForestClassifier
	import xgboost as xgb

	cal = pd.read_csv('cal.csv', index_col=False)

	# clean summary
	cal = cal.query("summary not in ('Mid-week Meditation', 'Coffee Chat: Meet our New Hires!', 'PED Lunch & Learn Hold', 'AskPeople')")
	cal = cal[cal['summary'].str.contains("Company Holiday") == False]
	cal = cal[cal['summary'].str.contains("L&L") == False]
	cal = cal.drop_duplicates()

	# clean dtstart
	parsed = pd.to_datetime(cal["dtstart"], errors="coerce").fillna(pd.to_datetime(cal["dtstart"], format="%Y-%d-%m", errors="coerce"))
	ordinal = pd.to_numeric(cal["dtstart"], errors="coerce").apply(lambda x: pd.Timestamp("1899-12-30") + pd.Timedelta(x, unit="D"))
	parsed_e = pd.to_datetime(cal["dtend"], errors="coerce").fillna(pd.to_datetime(cal["dtend"], format="%Y-%d-%m", errors="coerce"))
	ordinal_e = pd.to_numeric(cal["dtend"], errors="coerce").apply(lambda x: pd.Timestamp("1899-12-30")+pd.Timedelta(x, unit="D"))

	cal['dtstart'] = parsed.fillna(ordinal)
	cal['dtend'] = parsed_e.fillna(ordinal_e)
	# create unique column we can group and resample over
	cal['DtCombined'] = cal['dtstart'].astype(str) + '_' + cal['dtend'].astype(str)
	# still might have problems, drop Zulu
	cal = cal.dropna()
	cal['dtstart'] = cal['dtstart'].apply(lambda x: datetime.replace(x, tzinfo=None))
	cal['dtend'] = cal['dtend'].apply(lambda x: datetime.replace(x, tzinfo=None))
	cal['MeetingLength'] = cal['dtend'] - cal['dtstart']

	cal['Date'] = cal['dtstart'].dt.date
	cal['Date'] = pd.to_datetime(cal['Date'], errors="coerce")
	cal['StartTime'] = cal['dtstart'].dt.time
	cal['EndTime'] = cal['dtend'].dt.time

	cal = cal.melt(id_vars=['summary', 'Date', 'dtstart', 'dtend', 'DtCombined', 'MeetingLength'], var_name='Start/End', value_name='TimeOfDay')
	cal['Busy'] = 1
	# conditional create column that we will ultimately resample
	cal['DateTime'] = np.where(cal['Start/End'] == 'StartTime', cal['dtstart'], cal['dtend'])

	cal.sort_values(by=['dtstart', 'TimeOfDay'], inplace=True)
	cal.drop(['dtstart', 'dtend', 'Start/End'], axis=1, inplace=True)

	cal = cal.groupby('DtCombined').apply(lambda x: x.drop_duplicates('DateTime').set_index('DateTime').resample('30Min').ffill()).reset_index('DtCombined', drop=True).reset_index()
	cal['TimeOfDay'] = cal['DateTime'].dt.time
	# resample drops non numeric columns
	cal = cal.set_index('DateTime').resample('30Min').mean().reset_index()
	cal['Busy'] = cal['Busy'].fillna(0)
	cal['Weekday'] = cal['DateTime'].dt.weekday

	# convert to int b/c models require it
	#cal['Day'] = cal['DateTime'].apply(lambda time: time.day)
	cal = cal.set_index('DateTime')['2021-05-01' :'2023-01-08'].reset_index()
	cal['Hour'] = cal['DateTime'].apply(lambda time: time.hour)
	cal['Minute'] = cal['DateTime'].apply(lambda time: time.minute)
	#cal['Hour:Minute'] = cal['Hour'].astype(str) + ':' + cal['Minute'].astype(str)
	cal.set_index('DateTime', inplace=True)

	# predict for each weekday
	for weekday in range(0, 6):
	print("Weekday {} \n".format(weekday))

	group = cal.groupby('Weekday').get_group(weekday)

	features = group.drop(['Busy'], axis=1)
	target = group['Busy']

	X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3)

	xgb_model = xgb.XGBClassifier(n_estimators=1000, learning_rate=0.5, objective='reg:squarederror')
	xgb_model.fit(X_train, y_train)
	xgb_pred = xgb_model.predict(X_test)

	score = r2_score(y_test, xgb_pred)
	r_squared = xgb_model.score(X_test, y_test)
	rmse = mean_squared_error(y_test, xgb_pred, squared=False)
	print("XGB Score: {}".format(score))
	print("XGB R^2: {}".format(r_squared))
	print("XGB RMSE: {} \n".format(rmse))

	# low r2 score. Best is 1.0
	# Linear Regression
	#lin_reg = LinearRegression()
	#lin_reg.fit(X_train, y_train)
	#y_pred = lin_reg.predict(X_test)
	#score = r2_score(y_test, y_pred)
	#r_squared = lin_reg.score(X_test, y_test)
	#rmse = mean_squared_error(y_test, y_pred, squared=False)
	#print("Linear Regression Score: {}".format(score))
	#print("Linear Regression R^2: {}".format(r_squared))
	#print("Linear Regression RMSE: {} \n".format(rmse))

	forest_model = RandomForestClassifier(n_estimators=100, min_samples_split=200, random_state=1)
	forest_model.fit(X_train, y_train)
	forest_pred = forest_model.predict(X_test)
	score = r2_score(y_test, forest_pred)
	r_squared = forest_model.score(X_test, y_test)
	rmse = mean_squared_error(y_test, forest_pred, squared=False)
	print("Random Forest Score: {}".format(score))
	print("Random Forest R^2: {}".format(r_squared))
	print("Random Forest RMSE: {} \n".format(rmse))

	# predict probability
	xgb_pred_prob = xgb_model.predict_proba(X_test)
	preds = xgb_pred_prob[:,1]
	preds = pd.DataFrame(preds, columns=['Busy'])
	preds['Probability'] = preds['Busy']
	preds['Busy'] = preds['Probability'].apply(lambda prob: 1 if prob > 0.4 else 0)
	preds['DateTime'] = pd.to_datetime(y_test.index)
	preds['Time'] = preds['DateTime'].dt.time#strftime("%H:%M:%S")

	preds = preds.drop_duplicates('Time')
	preds.sort_values(by=['Time'], inplace=True)
	preds.drop('DateTime', axis=1, inplace=True)
	preds = preds[preds['Time'] > time(9)]
	preds = preds[preds['Time'] < time(18)]
	print("Weekday \n {}: {}".format(weekday, preds))