Skip to content

Instantly share code, notes, and snippets.

@preetesh33
Created June 4, 2024 16:14
Show Gist options
  • Save preetesh33/b2447be10dfe4c3e5db53d369961b48f to your computer and use it in GitHub Desktop.
Save preetesh33/b2447be10dfe4c3e5db53d369961b48f to your computer and use it in GitHub Desktop.
import pandas as pd
from mlforecast import MLForecast
from xgboost import XGBRegressor
from window_ops.rolling import rolling_mean, rolling_max, rolling_min, rolling_std
from mlforecast.forecast import MLForecast
from mlforecast import MLForecast
import os
from joblib import dump
import json
# Silence FutureWarning and DeprecationWarning messages for the rest of the
# script so library deprecation notices do not clutter the training output.
import warnings
for _category in (FutureWarning, DeprecationWarning):
    warnings.filterwarnings("ignore", category=_category)
# Load the merged CSV and parse its 'timestamp' column into datetimes.
organized_file = "/opt/ML/final/data_augmentation/augmentation/merged_main.csv"
df_grouped = pd.read_csv(organized_file)
df_grouped['timestamp'] = pd.to_datetime(df_grouped['timestamp'])

# Re-type the four identifier columns as pandas categoricals.
for _column in ('location', 'customer', 'cluster', 'project'):
    df_grouped[_column] = df_grouped[_column].astype('category')

# For each categorical column, build a {integer code: original label} lookup.
# The position of a label in `.cat.categories` is the code pandas assigns it.
location_mapping_dict = {
    code: label for code, label in enumerate(df_grouped['location'].cat.categories)
}
customer_mapping_dict = {
    code: label for code, label in enumerate(df_grouped['customer'].cat.categories)
}
cluster_mapping_dict = {
    code: label for code, label in enumerate(df_grouped['cluster'].cat.categories)
}
project_mapping_dict = {
    code: label for code, label in enumerate(df_grouped['project'].cat.categories)
}
# ---------------- Save the mapping dictionaries as JSON files ----------------
# Each code -> label dictionary is written to '<column>_mapping.json' in the
# current working directory. JSON object keys are always strings, so the
# integer codes are stored as "0", "1", ... in the files.
for _name, _mapping in (
    ('location', location_mapping_dict),
    ('customer', customer_mapping_dict),
    ('cluster', cluster_mapping_dict),
    ('project', project_mapping_dict),
):
    # The dump call must sit inside the `with` body so the file is open
    # while writing and closed right after.
    with open(f'{_name}_mapping.json', 'w') as f:
        json.dump(_mapping, f, indent=4)
# ------------------------------------------------------------------------------
# Replace each categorical column with its integer category codes.
for _column in ('location', 'customer', 'cluster', 'project'):
    df_grouped[_column] = df_grouped[_column].cat.codes
# Build the training set: rows whose timestamp is on or before the cutoff.
# `.copy()` makes `train` an independent DataFrame, so adding a column below
# does not act on a slice of `df_grouped` (avoids SettingWithCopyWarning).
# NOTE(review): the cutoff string is presumably parsed as 2024-02-15 00:00:00,
# which would exclude rows later on that day — confirm this is intended.
# NOTE(review): no validation split is created in this part of the script.
train = df_grouped[df_grouped['timestamp'] <= '2024-02-15'].copy()  # y/m/d

# Series identifier: the four encoded columns joined with underscores,
# in the form "<location>_<customer>_<cluster>_<project>".
train['id_col'] = (
    train['location'].astype(str)
    + '_' + train['customer'].astype(str)
    + '_' + train['cluster'].astype(str)
    + '_' + train['project'].astype(str)
)
print("Data split in train set")
print("train head", train.head(10))
# --------------------------------- Training ---------------------------------
# XGBoost regressor that MLForecast fits on the generated features.
model = XGBRegressor(
    n_estimators=500,
    learning_rate=0.01,
    max_depth=10,
    reg_lambda=0.2,
    random_state=990,
)
print("part1")

# Feature configuration:
#   - the target at lag 60,
#   - rolling mean / max / min / std with window size 60, applied at lag 1,
#   - 'day', 'hour' and 'minute' date features taken from the time column.
fcst = MLForecast(
    model,
    freq='min',
    lags=[60],
    lag_transforms={
        1: [
            (transform, 60)
            for transform in (rolling_mean, rolling_max, rolling_min, rolling_std)
        ],
    },
    date_features=['day', 'hour', 'minute'],
    num_threads=48,
)
print("part2")

# Fit on the training frame; the four encoded columns are declared static.
fcst.fit(
    train,
    id_col='id_col',
    time_col='timestamp',
    target_col='totalcalls',
    static_features=['location', 'customer', 'cluster', 'project'],
)
print("part3")

# -------------------------------- Save model --------------------------------
# Persist the fitted forecaster with joblib (relative path, so it is written
# to the current working directory).
print("Saving model")
dump(fcst, 'test_train_1.joblib')
# -----------------------------------------------------------------------------
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment