# FrankRuns/analyze-ontime-three-models.py

## analyze-ontime-three-models.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import dowhy
from dowhy import CausalModel
import networkx as nx
import math
import sklearn
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

def create_simulated_data():
    """Build the synthetic delivery dataset used by the analyses in this script.

    The global NumPy random state is seeded with 42 before any draw. The
    categorical columns are drawn in the order 'Carrier', 'Weather Condition',
    'Part of Day'. Every 'Carrier A' row then has its 'Part of Day' replaced by
    'Morning' with probability 0.7, and finally 'On-time Delivery' is drawn row
    by row with ``simulate_ontime``.

    Returns:
        DataFrame: 2000 rows with the columns 'Date', 'Carrier',
        'Weather Condition', 'Part of Day' and 'On-time Delivery'.
    """
    np.random.seed(42)
    sample_size = 2000

    carrier_names = ['Carrier A', 'Carrier B', 'Carrier C', 'Carrier D']
    weather_names = ['Clear', 'Rainy', 'Snowy']
    day_parts = ['Morning', 'Afternoon', 'Night']

    # The draw order (carrier, weather, part of day) determines the random stream,
    # so the columns are filled in exactly this sequence.
    columns = {}
    columns['Date'] = pd.date_range(start='2022-01-01', periods=sample_size, freq='D')
    columns['Carrier'] = np.random.choice(carrier_names, size=sample_size, p=[0.4, 0.2, 0.2, 0.2])
    columns['Weather Condition'] = np.random.choice(weather_names, size=sample_size, p=[0.7, 0.2, 0.1])
    columns['Part of Day'] = np.random.choice(day_parts, size=sample_size, p=[0.5, 0.3, 0.2])
    df = pd.DataFrame(columns)

    def shift_to_morning(current_part):
        # One uniform draw per Carrier A row: 70% of them are moved to the morning.
        return 'Morning' if np.random.rand() < 0.7 else current_part

    is_carrier_a = df['Carrier'] == 'Carrier A'
    df.loc[is_carrier_a, 'Part of Day'] = df.loc[is_carrier_a, 'Part of Day'].apply(shift_to_morning)

    df['On-time Delivery'] = df.apply(simulate_ontime, axis=1)
    return df

def simulate_ontime(row):
    """Draw a single on-time (1) or late (0) outcome for one delivery.

    The success probability starts at 1 for 'Carrier A' and 0.92 for any other
    carrier, is reduced by 0.5 for 'Morning' deliveries, and by 0.05 / 0.1 for
    'Rainy' / 'Snowy' weather. Negative probabilities are floored at 0.

    Args:
        row: Mapping-like record with the keys 'Carrier', 'Part of Day' and
            'Weather Condition'.

    Returns:
        The result of one Bernoulli draw from NumPy's global random state.

    Raises:
        KeyError: If 'Weather Condition' is not 'Clear', 'Rainy' or 'Snowy'.
    """
    weather_penalties = {'Clear': 0, 'Rainy': -0.05, 'Snowy': -0.1}

    if row['Carrier'] == 'Carrier A':
        base_rate = 1
    else:
        base_rate = 0.92

    time_penalty = 0
    if row['Part of Day'] == 'Morning':
        time_penalty = -0.5

    on_time_probability = base_rate + time_penalty + weather_penalties[row['Weather Condition']]
    return np.random.binomial(1, p=max(0, on_time_probability))

def binarize_and_encode(df, num_bins=5):
    """One-hot encode every column of ``df``, binning numeric columns first.

    Numeric columns are cut into ``num_bins`` equal-width bins labelled
    0 .. num_bins - 1 and the bins are one-hot encoded; all other columns are
    one-hot encoded directly. Output columns are named '<column>_<value>'.

    Args:
        df (DataFrame): Input data.
        num_bins (int): Number of bins used for numeric columns.

    Returns:
        DataFrame: The indicator columns of all input columns, side by side in
        the original column order. Empty if ``df`` has no columns.
    """
    result = pd.DataFrame()
    bin_labels = range(num_bins)
    for name in df.columns:
        values = df[name]
        if pd.api.types.is_numeric_dtype(values):
            # Discretise first so that a numeric column also yields indicators.
            values = pd.cut(values, bins=num_bins, labels=bin_labels)
        dummies = pd.get_dummies(values, prefix=name)
        result = pd.concat([result, dummies], axis=1)
    return result

def perform_correlation_analysis(df):
    """Print and plot the correlation of each indicator feature with on-time delivery.

    The 'Date' column is dropped, the outcome is relabelled as 'ontime' / 'late',
    all columns are one-hot encoded via ``binarize_and_encode``, and the row of
    the correlation matrix belonging to 'On-time Delivery_ontime' is printed
    (without the 'late' indicator) and plotted (without both outcome
    indicators), sorted in descending order.

    Args:
        df (DataFrame): Data containing 'Date' and 'On-time Delivery' columns.
            The input frame itself is not modified.
    """
    def as_label(flag):
        return 'ontime' if flag == 1 else 'late'

    features = df.drop(columns=['Date'])
    features['On-time Delivery'] = features['On-time Delivery'].apply(as_label)

    indicator_df = binarize_and_encode(features)
    binarizer = preprocessing.Binarizer(threshold=0.5)
    indicator_matrix = binarizer.transform(indicator_df)
    indicator_frame = pd.DataFrame(indicator_matrix, columns=indicator_df.columns)

    corr_matrix = indicator_frame.corr()
    ontime_corr = corr_matrix.loc['On-time Delivery_ontime'].sort_values(ascending=False)

    # The 'late' indicator is the complement of 'ontime', so it is left out.
    print(ontime_corr.drop('On-time Delivery_late'))
    visualize_correlations(ontime_corr.drop(['On-time Delivery_ontime', 'On-time Delivery_late']))

def visualize_correlations(correlations):
    """Show a bar chart of the given correlation values.

    Args:
        correlations: Object whose ``plot(kind='bar', ...)`` draws the bars;
            presumably a pandas Series indexed by feature name.
    """
    axes = correlations.plot(kind='bar', figsize=(12, 4))
    axes.set_title('Correlation with Delivery Status On-time')
    axes.set_ylabel('Correlation coefficient')
    axes.set_xlabel('Features')
    # Slanted, right-aligned tick labels keep long feature names readable.
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

def perform_logistic_regression(df):
    """Fit a logistic regression for on-time delivery and report its quality.

    The categorical columns are one-hot encoded, with 'Night', 'Carrier D' and
    'Clear' dropped as reference levels. The data is split 70/30
    (``random_state=42``), the model is fitted on the training part, the test
    accuracy is printed and the coefficients are passed to
    ``output_model_coefficients``.

    Args:
        df (DataFrame): Data with the columns 'Date', 'Part of Day', 'Carrier',
            'Weather Condition' and 'On-time Delivery'.
    """
    categorical_columns = ['Part of Day', 'Carrier', 'Weather Condition']
    reference_levels = ['Part of Day_Night', 'Carrier_Carrier D', 'Weather Condition_Clear']

    encoded = pd.get_dummies(df.drop(columns=['Date']), columns=categorical_columns)
    encoded = encoded.drop(columns=reference_levels)
    encoded['On-time Delivery'] = LabelEncoder().fit_transform(encoded['On-time Delivery'])

    y = encoded['On-time Delivery']
    X = encoded.drop('On-time Delivery', axis=1)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    classifier = LogisticRegression(max_iter=1000, solver='liblinear')
    classifier.fit(X_train, y_train)

    accuracy = accuracy_score(y_test, classifier.predict(X_test))
    print(f"Model Accuracy: {accuracy}")
    output_model_coefficients(classifier, X)

def output_model_coefficients(model, X, output_path='coefficients.csv'):
    """Print the model coefficients (largest first) and save them as CSV.

    Args:
        model: Fitted estimator exposing ``coef_``; only its first row
            (``coef_[0]``) is reported.
        X (DataFrame): Feature matrix; only its column labels are used, to
            name the coefficients.
        output_path (str): Destination of the CSV file. Defaults to
            'coefficients.csv', the location that was previously hard-coded.
    """
    coefficients = pd.DataFrame(model.coef_[0], X.columns, columns=['Coefficient'])
    print(coefficients.sort_values(by='Coefficient', ascending=False))
    # The file keeps the feature order of X; only the printed view is sorted.
    coefficients.to_csv(output_path)

def perform_causal_inference(df, treatment_var, treatment_condition, common_causes):
    """
    Sets up and estimates the causal effect using a specified treatment variable.

    A binary 'treatment' column is derived (1 where ``df[treatment_var]`` equals
    ``treatment_condition``, else 0), a DoWhy ``CausalModel`` is built with
    'On-time Delivery' as the outcome, the average treatment effect is estimated
    with propensity score stratification, and the estimate is then refuted by
    adding a simulated unobserved common cause. The model view, the estimand,
    the estimate and the refutation are shown/printed; nothing is returned.

    Args:
        df (DataFrame): The data frame containing the data. It is not modified.
        treatment_var (str): The column name to be used as the treatment variable.
        treatment_condition (Any): The condition that defines the treatment group.
        common_causes (list of str): The list of common causes (covariates).
    """
    # Derive the treatment flag on a new frame so the caller's DataFrame does not
    # silently gain (or have overwritten) a 'treatment' column between calls.
    data = df.assign(treatment=(df[treatment_var] == treatment_condition).astype(int))

    # No explicit graph is passed: the model is specified through common_causes.
    model = CausalModel(
        data=data,
        treatment='treatment',
        outcome='On-time Delivery',
        common_causes=common_causes
    )
    model.view_model()
    identified_estimand = model.identify_effect(proceed_when_unidentifiable=True)
    print(identified_estimand)
    estimate = model.estimate_effect(
        identified_estimand,
        method_name="backdoor.propensity_score_stratification",
        target_units="ate",
        test_significance=True
    )
    print("Causal Estimate (ATE):", estimate.value)
    print(estimate)
    # Sensitivity check: how much does the estimate move under a weak hidden confounder?
    refutation = model.refute_estimate(
        identified_estimand, estimate,
        method_name="add_unobserved_common_cause",
        confounders_effect_on_treatment="binary_flip",
        confounders_effect_on_outcome="linear",
        effect_strength_on_treatment=0.01,
        effect_strength_on_outcome=0.02
    )
    print(refutation)

# Main execution block to organize script functionality
if __name__ == '__main__':
    df = create_simulated_data()
    perform_correlation_analysis(df)
    perform_logistic_regression(df)
    # Effect of shipping with Carrier A, adjusting for time of day and weather.
    perform_causal_inference(df, 'Carrier', 'Carrier A', ['Part of Day', 'Weather Condition'])
    # Effect of a morning delivery, adjusting for carrier and weather.
    # (A comma was missing after 'Morning', which made Python index the string
    # with the list and raise a TypeError.)
    perform_causal_inference(df, 'Part of Day', 'Morning', ['Carrier', 'Weather Condition'])
	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	import dowhy
	from dowhy import CausalModel
	import networkx as nx
	import math
	import sklearn
	from sklearn import preprocessing
	from sklearn.model_selection import train_test_split
	from sklearn.linear_model import LogisticRegression
	from sklearn.metrics import accuracy_score
	from sklearn.preprocessing import LabelEncoder

	def create_simulated_data():
	    """Build the synthetic delivery dataset used by the analyses in this script.

	    The global NumPy random state is seeded with 42 before any draw. The
	    categorical columns are drawn in the order 'Carrier', 'Weather Condition',
	    'Part of Day'. Every 'Carrier A' row then has its 'Part of Day' replaced by
	    'Morning' with probability 0.7, and finally 'On-time Delivery' is drawn row
	    by row with ``simulate_ontime``.

	    Returns:
	        DataFrame: 2000 rows with the columns 'Date', 'Carrier',
	        'Weather Condition', 'Part of Day' and 'On-time Delivery'.
	    """
	    np.random.seed(42)
	    n = 2000  # Sample size
	    dates = pd.date_range(start='2022-01-01', periods=n, freq='D')
	    carriers = ['Carrier A', 'Carrier B', 'Carrier C', 'Carrier D']
	    weather_conditions = ['Clear', 'Rainy', 'Snowy']
	    part_of_day_options = ['Morning', 'Afternoon', 'Night']

	    df = pd.DataFrame({
	        'Date': dates,
	        'Carrier': np.random.choice(carriers, size=n, p=[0.4, 0.2, 0.2, 0.2]),
	        'Weather Condition': np.random.choice(weather_conditions, size=n, p=[0.7, 0.2, 0.1]),
	        'Part of Day': np.random.choice(part_of_day_options, size=n, p=[0.5, 0.3, 0.2]),
	    })

	    # Move 70% of the Carrier A rows to the morning (one uniform draw per row).
	    is_carrier_a = df['Carrier'] == 'Carrier A'
	    df.loc[is_carrier_a, 'Part of Day'] = df.loc[is_carrier_a, 'Part of Day'].apply(
	        lambda x: 'Morning' if np.random.rand() < 0.7 else x
	    )
	    df['On-time Delivery'] = df.apply(simulate_ontime, axis=1)
	    return df

	def simulate_ontime(row):
	    """Draw a single on-time (1) or late (0) outcome for one delivery.

	    The success probability starts at 1 for 'Carrier A' and 0.92 for any other
	    carrier, is reduced by 0.5 for 'Morning' deliveries, and by 0.05 / 0.1 for
	    'Rainy' / 'Snowy' weather. Negative probabilities are floored at 0.

	    Args:
	        row: Mapping-like record with the keys 'Carrier', 'Part of Day' and
	            'Weather Condition'.

	    Returns:
	        The result of one Bernoulli draw from NumPy's global random state.

	    Raises:
	        KeyError: If 'Weather Condition' is not 'Clear', 'Rainy' or 'Snowy'.
	    """
	    performance_base = 1 if row['Carrier'] == 'Carrier A' else 0.92
	    morning_penalty = -0.5 if row['Part of Day'] == 'Morning' else 0
	    weather_impact = {'Clear': 0, 'Rainy': -0.05, 'Snowy': -0.1}

	    performance = performance_base + morning_penalty + weather_impact[row['Weather Condition']]
	    return np.random.binomial(1, p=max(0, performance))

	def binarize_and_encode(df, num_bins=5):
	    """One-hot encode every column of ``df``, binning numeric columns first.

	    Numeric columns are cut into ``num_bins`` equal-width bins labelled
	    0 .. num_bins - 1 and the bins are one-hot encoded; all other columns are
	    one-hot encoded directly. Output columns are named '<column>_<value>'.

	    Args:
	        df (DataFrame): Input data.
	        num_bins (int): Number of bins used for numeric columns.

	    Returns:
	        DataFrame: The indicator columns of all input columns, side by side in
	        the original column order. Empty if ``df`` has no columns.
	    """
	    transformed_df = pd.DataFrame()
	    for column in df.columns:
	        values = df[column]
	        if pd.api.types.is_numeric_dtype(values):
	            # Discretise first so that a numeric column also yields indicators.
	            values = pd.cut(values, bins=num_bins, labels=range(num_bins))
	        encoded = pd.get_dummies(values, prefix=column)
	        transformed_df = pd.concat([transformed_df, encoded], axis=1)
	    return transformed_df

	def perform_correlation_analysis(df):
	    """Print and plot the correlation of each indicator feature with on-time delivery.

	    The 'Date' column is dropped, the outcome is relabelled as 'ontime' / 'late',
	    all columns are one-hot encoded via ``binarize_and_encode``, and the row of
	    the correlation matrix belonging to 'On-time Delivery_ontime' is printed
	    (without the 'late' indicator) and plotted (without both outcome
	    indicators), sorted in descending order.

	    Args:
	        df (DataFrame): Data containing 'Date' and 'On-time Delivery' columns.
	            The input frame itself is not modified.
	    """
	    df_correlations = df.drop(columns=['Date'])
	    df_correlations['On-time Delivery'] = df_correlations['On-time Delivery'].apply(
	        lambda x: 'ontime' if x == 1 else 'late'
	    )
	    transformed_df = binarize_and_encode(df_correlations)
	    binarized_data = preprocessing.Binarizer(threshold=0.5).transform(transformed_df)
	    df_binarized = pd.DataFrame(binarized_data, columns=transformed_df.columns)
	    correlations = df_binarized.corr().loc['On-time Delivery_ontime'].sort_values(ascending=False)
	    # The 'late' indicator is the complement of 'ontime', so it is left out.
	    print(correlations.drop('On-time Delivery_late'))
	    visualize_correlations(correlations.drop(['On-time Delivery_ontime', 'On-time Delivery_late']))

	def visualize_correlations(correlations):
	    """Show a bar chart of the given correlation values.

	    Args:
	        correlations: Object whose ``plot(kind='bar', ...)`` draws the bars;
	            presumably a pandas Series indexed by feature name.
	    """
	    correlations.plot(kind='bar', figsize=(12, 4))
	    plt.title('Correlation with Delivery Status On-time')
	    plt.ylabel('Correlation coefficient')
	    plt.xlabel('Features')
	    # Slanted, right-aligned tick labels keep long feature names readable.
	    plt.xticks(rotation=45, ha='right')
	    plt.tight_layout()
	    plt.show()

	def perform_logistic_regression(df):
	    """Fit a logistic regression for on-time delivery and report its quality.

	    The categorical columns are one-hot encoded, with 'Night', 'Carrier D' and
	    'Clear' dropped as reference levels. The data is split 70/30
	    (``random_state=42``), the model is fitted on the training part, the test
	    accuracy is printed and the coefficients are passed to
	    ``output_model_coefficients``.

	    Args:
	        df (DataFrame): Data with the columns 'Date', 'Part of Day', 'Carrier',
	            'Weather Condition' and 'On-time Delivery'.
	    """
	    df_encoded = df.drop(columns=['Date'])
	    df_encoded = pd.get_dummies(df_encoded, columns=['Part of Day', 'Carrier', 'Weather Condition'])
	    # Drop one level per category to serve as the reference level.
	    df_encoded = df_encoded.drop(columns=['Part of Day_Night', 'Carrier_Carrier D', 'Weather Condition_Clear'])
	    label_encoder = LabelEncoder()
	    df_encoded['On-time Delivery'] = label_encoder.fit_transform(df_encoded['On-time Delivery'])
	    X = df_encoded.drop('On-time Delivery', axis=1)
	    y = df_encoded['On-time Delivery']
	    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
	    model = LogisticRegression(max_iter=1000, solver='liblinear')
	    model.fit(X_train, y_train)
	    predictions = model.predict(X_test)
	    accuracy = accuracy_score(y_test, predictions)
	    print(f"Model Accuracy: {accuracy}")
	    output_model_coefficients(model, X)

	def output_model_coefficients(model, X, output_path='coefficients.csv'):
	    """Print the model coefficients (largest first) and save them as CSV.

	    Args:
	        model: Fitted estimator exposing ``coef_``; only its first row
	            (``coef_[0]``) is reported.
	        X (DataFrame): Feature matrix; only its column labels are used, to
	            name the coefficients.
	        output_path (str): Destination of the CSV file. Defaults to
	            'coefficients.csv', the location that was previously hard-coded.
	    """
	    coefficients = pd.DataFrame(model.coef_[0], X.columns, columns=['Coefficient'])
	    print(coefficients.sort_values(by='Coefficient', ascending=False))
	    # The file keeps the feature order of X; only the printed view is sorted.
	    coefficients.to_csv(output_path)

	def perform_causal_inference(df, treatment_var, treatment_condition, common_causes):
	    """
	    Sets up and estimates the causal effect using a specified treatment variable.

	    A binary 'treatment' column is derived (1 where ``df[treatment_var]`` equals
	    ``treatment_condition``, else 0), a DoWhy ``CausalModel`` is built with
	    'On-time Delivery' as the outcome, the average treatment effect is estimated
	    with propensity score stratification, and the estimate is then refuted by
	    adding a simulated unobserved common cause. The model view, the estimand,
	    the estimate and the refutation are shown/printed; nothing is returned.

	    Args:
	        df (DataFrame): The data frame containing the data. It is not modified.
	        treatment_var (str): The column name to be used as the treatment variable.
	        treatment_condition (Any): The condition that defines the treatment group.
	        common_causes (list of str): The list of common causes (covariates).
	    """
	    # Derive the treatment flag on a new frame so the caller's DataFrame does not
	    # silently gain (or have overwritten) a 'treatment' column between calls.
	    data = df.assign(treatment=(df[treatment_var] == treatment_condition).astype(int))

	    # No explicit graph is passed: the model is specified through common_causes.
	    model = CausalModel(
	        data=data,
	        treatment='treatment',
	        outcome='On-time Delivery',
	        common_causes=common_causes
	    )
	    model.view_model()
	    identified_estimand = model.identify_effect(proceed_when_unidentifiable=True)
	    print(identified_estimand)
	    estimate = model.estimate_effect(
	        identified_estimand,
	        method_name="backdoor.propensity_score_stratification",
	        target_units="ate",
	        test_significance=True
	    )
	    print("Causal Estimate (ATE):", estimate.value)
	    print(estimate)
	    # Sensitivity check: how much does the estimate move under a weak hidden confounder?
	    refutation = model.refute_estimate(
	        identified_estimand, estimate,
	        method_name="add_unobserved_common_cause",
	        confounders_effect_on_treatment="binary_flip",
	        confounders_effect_on_outcome="linear",
	        effect_strength_on_treatment=0.01,
	        effect_strength_on_outcome=0.02
	    )
	    print(refutation)

	# Main execution block to organize script functionality
	if __name__ == '__main__':
	    df = create_simulated_data()
	    perform_correlation_analysis(df)
	    perform_logistic_regression(df)
	    # Effect of shipping with Carrier A, adjusting for time of day and weather.
	    perform_causal_inference(df, 'Carrier', 'Carrier A', ['Part of Day', 'Weather Condition'])
	    # Effect of a morning delivery, adjusting for carrier and weather.
	    # (A comma was missing after 'Morning', which made Python index the string
	    # with the list and raise a TypeError.)
	    perform_causal_inference(df, 'Part of Day', 'Morning', ['Carrier', 'Weather Condition'])