preetesh33/training model

## training model
import pandas as pd
from mlforecast import MLForecast
from xgboost import XGBRegressor
from window_ops.rolling import rolling_mean, rolling_max, rolling_min, rolling_std
from mlforecast.forecast import MLForecast
from mlforecast import MLForecast
import os
from joblib import dump
import json


# Silence FutureWarning/DeprecationWarning output so the progress prints below stay readable.
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

organized_file = "/opt/ML/final/data_augmentation/augmentation/merged_main.csv"

df_grouped = pd.read_csv(organized_file)
df_grouped['timestamp'] = pd.to_datetime(df_grouped['timestamp'])

# Convert object columns to category type so each distinct label gets an integer code
df_grouped['location'] = df_grouped['location'].astype('category')
df_grouped['customer'] = df_grouped['customer'].astype('category')
df_grouped['cluster'] = df_grouped['cluster'].astype('category')
df_grouped['project'] = df_grouped['project'].astype('category')

# Build code -> label dictionaries. A label's position in `.cat.categories`
# is the integer that `.cat.codes` assigns to it further down.
location_mapping_dict = dict(enumerate(df_grouped['location'].cat.categories))
customer_mapping_dict = dict(enumerate(df_grouped['customer'].cat.categories))
cluster_mapping_dict = dict(enumerate(df_grouped['cluster'].cat.categories))
project_mapping_dict = dict(enumerate(df_grouped['project'].cat.categories))

################################# Save the mapping dictionaries as json files ##############################
# JSON object keys are always strings, so the integer codes are written as
# "0", "1", ...; convert them back to int when reading these files.
with open('location_mapping.json', 'w', encoding='utf-8') as f:
    json.dump(location_mapping_dict, f, indent=4)
with open('customer_mapping.json', 'w', encoding='utf-8') as f:
    json.dump(customer_mapping_dict, f, indent=4)
with open('cluster_mapping.json', 'w', encoding='utf-8') as f:
    json.dump(cluster_mapping_dict, f, indent=4)
with open('project_mapping.json', 'w', encoding='utf-8') as f:
    json.dump(project_mapping_dict, f, indent=4)
###########################################################################################################

# Convert category columns to integer encoding.
# NOTE(review): missing values, if any, are encoded as -1, which has no entry
# in the mapping files saved above.
df_grouped['location'] = df_grouped['location'].cat.codes
df_grouped['customer'] = df_grouped['customer'].cat.codes
df_grouped['cluster'] = df_grouped['cluster'].cat.codes
df_grouped['project'] = df_grouped['project'].cat.codes

# Keep only the training window (no validation set is built here).
# `.copy()` makes `train` an independent frame, so adding `id_col` below does
# not raise pandas' SettingWithCopyWarning.
# NOTE(review): the cutoff string is compared as 2024-02-15 00:00:00, so rows
# later on that day are excluded - confirm this is intended.
train = df_grouped[df_grouped['timestamp'] <= '2024-02-15'].copy()  # y/m/d

# One series id per (location, customer, cluster, project) combination
train['id_col'] = (
    train['location'].astype(str)
    + '_' + train['customer'].astype(str)
    + '_' + train['cluster'].astype(str)
    + '_' + train['project'].astype(str)
)

print("Data split in train set")
print("train head",train.head(10))

################################################## Training ###################################################

model = XGBRegressor(random_state=990, n_estimators=500, learning_rate=0.01, max_depth=10, reg_lambda=0.2)
print("part1")
# Features: the target 60 steps back, rolling mean/max/min/std over a
# 60-step window of the lag-1 series, and day/hour/minute of the timestamp.
fcst = MLForecast(
    model,
    freq='min',
    lags=[60],
    lag_transforms={
        1: [(rolling_mean, 60), (rolling_max, 60), (rolling_min, 60), (rolling_std, 60)],
    },
    date_features=['day', 'hour', 'minute'],
    num_threads=48,
)
print("part2")
fcst.fit(
    train,
    id_col='id_col',
    time_col='timestamp',
    target_col='totalcalls',
    static_features=['location', 'customer', 'cluster', 'project'],
)
print("part3")

######################## Save model ##################################################
print("Saving model")
dump(fcst, 'test_train_1.joblib')
####################################################################################
	import pandas as pd
	from mlforecast import MLForecast
	from xgboost import XGBRegressor
	from window_ops.rolling import rolling_mean, rolling_max, rolling_min, rolling_std
	from mlforecast.forecast import MLForecast
	from mlforecast import MLForecast
	import os
	from joblib import dump
	import json


	# Silence FutureWarning/DeprecationWarning output so the progress prints below stay readable.
	import warnings
	warnings.filterwarnings("ignore", category=FutureWarning)
	warnings.filterwarnings("ignore", category=DeprecationWarning)

	organized_file = "/opt/ML/final/data_augmentation/augmentation/merged_main.csv"

	df_grouped = pd.read_csv(organized_file)
	df_grouped['timestamp'] = pd.to_datetime(df_grouped['timestamp'])

	# Convert object columns to category type so each distinct label gets an integer code
	df_grouped['location'] = df_grouped['location'].astype('category')
	df_grouped['customer'] = df_grouped['customer'].astype('category')
	df_grouped['cluster'] = df_grouped['cluster'].astype('category')
	df_grouped['project'] = df_grouped['project'].astype('category')

	# Build code -> label dictionaries. A label's position in `.cat.categories`
	# is the integer that `.cat.codes` assigns to it further down.
	location_mapping_dict = dict(enumerate(df_grouped['location'].cat.categories))
	customer_mapping_dict = dict(enumerate(df_grouped['customer'].cat.categories))
	cluster_mapping_dict = dict(enumerate(df_grouped['cluster'].cat.categories))
	project_mapping_dict = dict(enumerate(df_grouped['project'].cat.categories))

	################################# Save the mapping dictionaries as json files ##############################
	# JSON object keys are always strings, so the integer codes are written as
	# "0", "1", ...; convert them back to int when reading these files.
	with open('location_mapping.json', 'w', encoding='utf-8') as f:
		json.dump(location_mapping_dict, f, indent=4)
	with open('customer_mapping.json', 'w', encoding='utf-8') as f:
		json.dump(customer_mapping_dict, f, indent=4)
	with open('cluster_mapping.json', 'w', encoding='utf-8') as f:
		json.dump(cluster_mapping_dict, f, indent=4)
	with open('project_mapping.json', 'w', encoding='utf-8') as f:
		json.dump(project_mapping_dict, f, indent=4)
	###########################################################################################################

	# Convert category columns to integer encoding.
	# NOTE(review): missing values, if any, are encoded as -1, which has no entry
	# in the mapping files saved above.
	df_grouped['location'] = df_grouped['location'].cat.codes
	df_grouped['customer'] = df_grouped['customer'].cat.codes
	df_grouped['cluster'] = df_grouped['cluster'].cat.codes
	df_grouped['project'] = df_grouped['project'].cat.codes

	# Keep only the training window (no validation set is built here).
	# `.copy()` makes `train` an independent frame, so adding `id_col` below does
	# not raise pandas' SettingWithCopyWarning.
	# NOTE(review): the cutoff string is compared as 2024-02-15 00:00:00, so rows
	# later on that day are excluded - confirm this is intended.
	train = df_grouped[df_grouped['timestamp'] <= '2024-02-15'].copy()  # y/m/d

	# One series id per (location, customer, cluster, project) combination
	train['id_col'] = (
		train['location'].astype(str)
		+ '_' + train['customer'].astype(str)
		+ '_' + train['cluster'].astype(str)
		+ '_' + train['project'].astype(str)
	)

	print("Data split in train set")
	print("train head",train.head(10))

	################################################## Training ###################################################

	model = XGBRegressor(random_state=990, n_estimators=500, learning_rate=0.01, max_depth=10, reg_lambda=0.2)
	print("part1")
	# Features: the target 60 steps back, rolling mean/max/min/std over a
	# 60-step window of the lag-1 series, and day/hour/minute of the timestamp.
	fcst = MLForecast(
		model,
		freq='min',
		lags=[60],
		lag_transforms={
			1: [(rolling_mean, 60), (rolling_max, 60), (rolling_min, 60), (rolling_std, 60)],
		},
		date_features=['day', 'hour', 'minute'],
		num_threads=48,
	)
	print("part2")
	fcst.fit(
		train,
		id_col='id_col',
		time_col='timestamp',
		target_col='totalcalls',
		static_features=['location', 'customer', 'cluster', 'project'],
	)
	print("part3")

	######################## Save model ##################################################
	print("Saving model")
	dump(fcst, 'test_train_1.joblib')
	####################################################################################