Skip to content

Instantly share code, notes, and snippets.

@FrankRuns
Created April 28, 2024 15:19
Show Gist options
  • Save FrankRuns/69839da428ac11f5c736bd41c9ab0102 to your computer and use it in GitHub Desktop.
Save FrankRuns/69839da428ac11f5c736bd41c9ab0102 to your computer and use it in GitHub Desktop.
Supporting analysis for How supply chain leaders improve on-time delivery with multiple models
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import dowhy
from dowhy import CausalModel
import networkx as nx
import math
import sklearn
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
def create_simulated_data(n=2000, seed=42):
    """Simulate delivery data including carrier, weather, and time of day.

    Args:
        n (int): Number of simulated deliveries; one row per consecutive
            calendar day starting at 2022-01-01. Must be at least 1.
        seed (int | None): Value passed to ``np.random.seed`` so the draws
            (here and in ``simulate_ontime``) are reproducible. ``None``
            reseeds NumPy's global generator non-deterministically.

    Returns:
        DataFrame: Columns 'Date', 'Carrier', 'Weather Condition',
        'Part of Day' and the simulated 0/1 'On-time Delivery' outcome.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")

    # NOTE: this seeds NumPy's *global* generator on purpose; simulate_ontime
    # draws from the same global stream.
    np.random.seed(seed)
    dates = pd.date_range(start='2022-01-01', periods=n, freq='D')
    carriers = ['Carrier A', 'Carrier B', 'Carrier C', 'Carrier D']
    weather_conditions = ['Clear', 'Rainy', 'Snowy']
    part_of_day_options = ['Morning', 'Afternoon', 'Night']

    # The order of the random draws below determines the simulated data for a
    # given seed, so keep Carrier -> Weather Condition -> Part of Day.
    df = pd.DataFrame({
        'Date': dates,
        'Carrier': np.random.choice(carriers, size=n, p=[0.4, 0.2, 0.2, 0.2]),
        'Weather Condition': np.random.choice(weather_conditions, size=n, p=[0.7, 0.2, 0.1]),
        'Part of Day': np.random.choice(part_of_day_options, size=n, p=[0.5, 0.3, 0.2]),
    })

    # Tie carrier and time of day together: each Carrier A row is re-assigned
    # to 'Morning' with probability 0.7, otherwise it keeps its drawn value.
    is_carrier_a = df['Carrier'] == 'Carrier A'
    df.loc[is_carrier_a, 'Part of Day'] = df.loc[is_carrier_a, 'Part of Day'].apply(
        lambda x: 'Morning' if np.random.rand() < 0.7 else x
    )

    # Row-wise draw of the outcome from carrier, time of day and weather.
    df['On-time Delivery'] = df.apply(simulate_ontime, axis=1)
    return df
def simulate_ontime(row):
    """Draw a 0/1 on-time outcome for a single delivery record.

    The on-time probability starts at 1 for 'Carrier A' (0.92 for any other
    carrier), loses 0.5 for a 'Morning' delivery, and loses 0.05 / 0.1 for
    'Rainy' / 'Snowy' weather. The result is one Bernoulli draw from NumPy's
    global random generator.

    Args:
        row: Mapping-like record with 'Carrier', 'Part of Day' and
            'Weather Condition' entries (e.g. a DataFrame row).

    Returns:
        int: 1 if the simulated delivery is on time, otherwise 0.
    """
    weather_adjustment = {'Clear': 0, 'Rainy': -0.05, 'Snowy': -0.1}

    if row['Carrier'] == 'Carrier A':
        base_rate = 1
    else:
        base_rate = 0.92

    time_adjustment = 0
    if row['Part of Day'] == 'Morning':
        time_adjustment = -0.5

    on_time_probability = base_rate + time_adjustment + weather_adjustment[row['Weather Condition']]
    # Clamp at zero so the value is always a valid probability for the draw.
    return np.random.binomial(1, p=max(0, on_time_probability))
def binarize_and_encode(df, num_bins=5):
    """One-hot encode every column of *df* for correlation analysis.

    Numeric columns are first cut into ``num_bins`` bins labelled
    ``0 .. num_bins - 1`` and the bin labels are one-hot encoded; all other
    columns are one-hot encoded directly. Every indicator column is named
    ``<original column>_<value>``.

    Args:
        df (DataFrame): Data to encode.
        num_bins (int): Number of bins used for numeric columns.

    Returns:
        DataFrame: The indicator columns, in the order of the source columns.
    """
    result = pd.DataFrame()
    for name in df.columns:
        values = df[name]
        if pd.api.types.is_numeric_dtype(values):
            # Replace raw numbers by their bin label before encoding.
            values = pd.cut(values, bins=num_bins, labels=range(num_bins))
        indicators = pd.get_dummies(values, prefix=name)
        result = pd.concat([result, indicators], axis=1)
    return result
def perform_correlation_analysis(df):
    """Print and plot how each encoded feature correlates with on-time delivery.

    The 'Date' column is discarded, the 0/1 outcome is relabelled
    'ontime' / 'late', every column is one-hot encoded, and the correlation
    of each indicator with the 'On-time Delivery_ontime' indicator is
    printed (sorted descending) and passed to ``visualize_correlations``.

    Args:
        df (DataFrame): Delivery data containing 'Date' and 'On-time Delivery'
            columns. It is not modified.
    """
    on_time_column = 'On-time Delivery_ontime'
    late_column = 'On-time Delivery_late'

    features = df.drop(columns=['Date'])
    # Use text labels so the outcome is one-hot encoded like the other columns.
    features['On-time Delivery'] = [
        'ontime' if flag == 1 else 'late' for flag in features['On-time Delivery']
    ]

    encoded = binarize_and_encode(features)
    binarizer = preprocessing.Binarizer(threshold=0.5)
    binary_frame = pd.DataFrame(binarizer.transform(encoded), columns=encoded.columns)

    ranked = binary_frame.corr().loc[on_time_column].sort_values(ascending=False)
    print(ranked.drop(late_column)) # Exclude redundant correlation
    # The plot additionally leaves out the outcome's correlation with itself.
    visualize_correlations(ranked.drop([on_time_column, late_column]))
def visualize_correlations(correlations):
    """Show a bar chart of correlation coefficients, one bar per feature.

    Args:
        correlations (Series): Correlation values indexed by feature name.
    """
    axes = correlations.plot(kind='bar', figsize=(12, 4))
    axes.set_title('Correlation with Delivery Status On-time')
    axes.set_ylabel('Correlation coefficient')
    axes.set_xlabel('Features')
    # Slant the feature names so long labels do not overlap.
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()
def perform_logistic_regression(df):
    """Fit a logistic regression that predicts on-time delivery and report it.

    Categorical columns are one-hot encoded, one level per category is
    removed, the model is trained on 70% of the rows, and the accuracy on the
    remaining 30% is printed. The fitted coefficients are handed to
    ``output_model_coefficients``.

    Args:
        df (DataFrame): Delivery data with 'Date', 'Part of Day', 'Carrier',
            'Weather Condition' and 'On-time Delivery' columns.
    """
    target = 'On-time Delivery'
    categorical_columns = ['Part of Day', 'Carrier', 'Weather Condition']
    # One indicator per category is left out; the remaining coefficients are
    # then relative to these levels.
    omitted_levels = ['Part of Day_Night', 'Carrier_Carrier D', 'Weather Condition_Clear']

    encoded = pd.get_dummies(df.drop(columns=['Date']), columns=categorical_columns)
    encoded = encoded.drop(columns=omitted_levels)
    encoded[target] = LabelEncoder().fit_transform(encoded[target])

    X = encoded.drop(target, axis=1)
    y = encoded[target]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    classifier = LogisticRegression(max_iter=1000, solver='liblinear')
    classifier.fit(X_train, y_train)

    accuracy = accuracy_score(y_test, classifier.predict(X_test))
    print(f"Model Accuracy: {accuracy}")
    output_model_coefficients(classifier, X)
def output_model_coefficients(model, X):
    """Print the model's coefficients and save them to 'coefficients.csv'.

    Args:
        model: Fitted estimator exposing ``coef_``; only its first row is used.
        X (DataFrame): Feature frame whose column names label the coefficients.
    """
    coefficient_table = pd.DataFrame(
        data=model.coef_[0],
        index=X.columns,
        columns=['Coefficient'],
    )
    # Only the printed view is sorted; the CSV keeps the feature order of X.
    ranked = coefficient_table.sort_values(by='Coefficient', ascending=False)
    print(ranked)
    coefficient_table.to_csv('coefficients.csv')
def perform_causal_inference(df, treatment_var, treatment_condition, common_causes):
    """
    Sets up and estimates the causal effect using a specified treatment variable.

    The identified estimand, the estimate and the refutation result are
    printed; nothing is returned. The caller's data frame is left unchanged.

    Args:
        df (DataFrame): The data frame containing the data. Must include the
            'On-time Delivery' outcome column.
        treatment_var (str): The column name to be used as the treatment variable.
        treatment_condition (Any): The condition that defines the treatment group.
        common_causes (list of str): The list of common causes (covariates).
    """
    # Create a binary treatment variable where the treatment_condition defines
    # the treatment group. `assign` returns a new frame, so repeated calls with
    # different treatments never leak a stale 'treatment' column to the caller.
    data = df.assign(treatment=(df[treatment_var] == treatment_condition).astype(int))

    # The model is specified through `common_causes` alone, so no separate
    # graph object is constructed here.
    model = CausalModel(
        data=data,
        treatment='treatment',
        outcome='On-time Delivery',
        common_causes=common_causes
    )
    model.view_model()

    identified_estimand = model.identify_effect(proceed_when_unidentifiable=True)
    print(identified_estimand)

    estimate = model.estimate_effect(
        identified_estimand,
        method_name="backdoor.propensity_score_stratification",
        target_units="ate",
        test_significance=True
    )
    print("Causal Estimate (ATE):", estimate.value)
    print(estimate)

    # Sensitivity check: re-estimate with a simulated unobserved common cause
    # of the given (small) strengths on treatment and outcome.
    refutation = model.refute_estimate(
        identified_estimand, estimate,
        method_name="add_unobserved_common_cause",
        confounders_effect_on_treatment="binary_flip",
        confounders_effect_on_outcome="linear",
        effect_strength_on_treatment=0.01,
        effect_strength_on_outcome=0.02
    )
    print(refutation)
# Main execution block to organize script functionality
if __name__ == '__main__':
    df = create_simulated_data()
    perform_correlation_analysis(df)
    perform_logistic_regression(df)
    # Treatment: shipped with Carrier A; covariates: part of day and weather.
    perform_causal_inference(df, 'Carrier', 'Carrier A', ['Part of Day', 'Weather Condition'])
    # Treatment: delivered in the morning; covariates: carrier and weather.
    # (The comma after 'Morning' is required: without it Python tries to index
    # the string with the list's contents and raises TypeError.)
    perform_causal_inference(df, 'Part of Day', 'Morning', ['Carrier', 'Weather Condition'])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment