Skip to content

Instantly share code, notes, and snippets.

@bastosmichael
Last active November 21, 2023 05:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bastosmichael/2042249dc593b8ebdbd5ea9aca0cf026 to your computer and use it in GitHub Desktop.
Save bastosmichael/2042249dc593b8ebdbd5ea9aca0cf026 to your computer and use it in GitHub Desktop.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from datetime import datetime, timedelta
import random
# Function to generate random dates
def generate_random_dates(start_date, end_date, num_dates):
    """Sample ``num_dates`` dates, with replacement, between two dates.

    Args:
        start_date: First date of the range; anything ``pd.date_range``
            accepts as ``start`` (e.g. ``"2020-01-01"``).
        end_date: Last date of the range; anything ``pd.date_range``
            accepts as ``end``.
        num_dates: Number of dates to draw.

    Returns:
        A list of ``num_dates`` ``datetime.datetime`` objects picked with
        the stdlib ``random`` module (so seed ``random``, not NumPy, for
        reproducible output).
    """
    # Materialise every candidate day once, then sample from that pool.
    candidate_days = list(pd.date_range(start_date, end_date).to_pydatetime())
    return random.choices(candidate_days, k=num_dates)
# Generating made-up data
# Seed BOTH random sources: the categorical columns are drawn with NumPy,
# but generate_random_dates samples through the stdlib `random` module,
# which np.random.seed() does not touch. Without the second seed the date
# columns (and therefore every result below) change on each run.
np.random.seed(0)
random.seed(0)
num_orders = 100
start_date = "2020-01-01"
end_date = "2023-12-31"
data = {
    # randint's upper bound is exclusive, so vendor ids are 1..9.
    "vendor_id": np.random.randint(1, 10, size=num_orders),
    "order_size": np.random.choice(["small", "medium", "large"], size=num_orders),
    "season": np.random.choice(
        ["winter", "spring", "summer", "autumn"], size=num_orders
    ),
    # The three date columns are sampled independently of one another.
    "original_estimated_date": generate_random_dates(start_date, end_date, num_orders),
    "updated_delivery_date": generate_random_dates(start_date, end_date, num_orders),
    "final_receipt_date": generate_random_dates(start_date, end_date, num_orders),
}
df = pd.DataFrame(data)
# Preprocess the data
# Delays are whole days between the actual receipt date and each promised
# date; a negative value means the order arrived before that date.
df["original_delay"] = (
    pd.to_datetime(df["final_receipt_date"])
    - pd.to_datetime(df["original_estimated_date"])
).dt.days
df["updated_delay"] = (
    pd.to_datetime(df["final_receipt_date"])
    - pd.to_datetime(df["updated_delivery_date"])
).dt.days
# Bucket the original delay into three classes:
#   0 -> 60 days or less, 1 -> 61..90 days, 2 -> more than 90 days.
# The "> 90" test has to come first: testing "> 60" first captures every
# value above 90 as well and leaves class 2 unreachable.
df["delay_category"] = df["original_delay"].apply(
    lambda x: 2 if x > 90 else (1 if x > 60 else 0)
)
# Additional feature engineering
# Order size becomes an ordinal integer; season and vendor are one-hot encoded.
df["order_size"] = df["order_size"].map(dict(small=1, medium=2, large=3))
df = pd.get_dummies(data=df, columns=["season", "vendor_id"])
# Splitting the dataset
# Target first, then the feature matrix: everything except the label and
# the raw date columns.
# NOTE(review): "original_delay" stays in X even though "delay_category"
# is computed directly from it -- looks like target leakage; confirm this
# is intended before trusting the accuracy figure.
y = df["delay_category"]
X = df.drop(
    columns=[
        "delay_category",
        "original_estimated_date",
        "updated_delivery_date",
        "final_receipt_date",
    ]
)
# Hold out 30% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)
# Training the model
# fit() returns the estimator itself, so construction and training chain.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
# Making predictions
predictions = model.predict(X_test)
# Evaluate the model
# Accuracy is computed on the held-out test rows only.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(f"Model Accuracy: {accuracy}")
# Function to prepare and align future order data with training data
def prepare_future_order_data(data, feature_columns):
    """Build a model-ready DataFrame aligned to ``feature_columns``.

    Args:
        data: Mapping of feature name to a list-like of values (one entry
            per order) or a scalar. Keys that do not appear in
            ``feature_columns`` are ignored.
        feature_columns: Iterable of column names in the order the model
            was trained on (e.g. ``X_train.columns``).

    Returns:
        A ``pandas.DataFrame`` whose columns are exactly
        ``feature_columns``, in that order. Features missing from ``data``
        are filled with 0 for every row.
    """
    feature_columns = list(feature_columns)
    # Missing features must be padded to the same number of rows as the
    # supplied ones; a fixed one-element [0] only works for a single order
    # and makes the DataFrame constructor fail for larger batches.
    num_rows = max(
        (
            len(data[col])
            for col in feature_columns
            if col in data and pd.api.types.is_list_like(data[col])
        ),
        default=1,  # nothing list-like supplied: keep the one-row behaviour
    )
    # Ensure all required features are present and in the correct order
    prepared_data = {
        col: data[col] if col in data else [0] * num_rows
        for col in feature_columns
    }
    return pd.DataFrame(prepared_data)
# Example of predicting future orders
# Assuming we have data for a future order
# One hypothetical order, one value per feature; any feature the model was
# trained on but that is not listed here is filled in by
# prepare_future_order_data.
future_order_data = dict(
    order_size=[2],  # medium
    season_autumn=[0],
    season_spring=[1],
    season_summer=[0],
    season_winter=[0],
    original_delay=[45],  # Assuming 45 days delay based on historical trends
    updated_delay=[30],  # Assuming 30 days delay based on updated info
    vendor_id_2=[1],  # Example vendor_id
)
# Prepare the future order data
# Align the order to the exact columns (and order) used for training.
future_order_df = prepare_future_order_data(
    data=future_order_data, feature_columns=X_train.columns
)
# Predicting the delay category for the future order
future_prediction = model.predict(future_order_df)
print(f"Future Order Delay Prediction: {future_prediction}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment