Skip to content

Instantly share code, notes, and snippets.

View michael021997's full-sized avatar

michael021997

View GitHub Profile
@michael021997
michael021997 / gist:5ead0bfdf2ce2be78d24d2b07c6bdcaf
Last active March 21, 2025 21:46
Classification Framework using Pandas
"""
General Classification Framework
A flexible framework for classification tasks with feature importance analysis,
hyperparameter tuning, threshold optimization, and visualization.
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
@michael021997
michael021997 / gist:c5d50b011e7402236952a920a7fd306e
Last active April 1, 2025 14:10
Classification Framework using PySpark
"""
General Classification Framework for PySpark
A flexible framework for classification tasks with feature importance analysis,
hyperparameter tuning using Optuna, threshold optimization, and visualization.
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Enhanced PySpark Classification Framework
# This framework combines PySpark's distributed processing capabilities with
# scikit-learn's visualization and evaluation metrics from the General Classification Framework
import optuna
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, count, lit, variance, corr, row_number, udf
from pyspark.sql.types import FloatType
from pyspark.sql import Window
from pyspark.sql import functions as F
@michael021997
michael021997 / gist:fd9590e327d80e1f7e4723fef02ee01b
Last active April 1, 2025 17:37
enhanced sklearn classification
"""
Enhanced Classification Framework
A flexible framework for classification tasks with feature selection,
class imbalance handling, hyperparameter tuning, threshold optimization, and visualization.
Combines elements from both scikit-learn and PySpark frameworks.
"""
import pandas as pd
import numpy as np
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None) # Show all columns
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
import numpy as np
from datetime import datetime
from dateutil.relativedelta import relativedelta
import matplotlib.pyplot as plt
from prophet_model import *
# Feature tracking dictionary to store information about each feature
feature_tracking = {}
import pandas as pd
import os
import glob
from datetime import datetime, timedelta
import concurrent.futures
import numpy as np
from calendar import monthrange
def convert_to_monthly(df):
"""
import pandas as pd
import numpy as np
from datetime import datetime
from dateutil.relativedelta import relativedelta
import matplotlib.pyplot as plt
from prophet_model import *
# Feature tracking dictionary to store information about each feature
feature_tracking = {}
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
def calculate_monthly_yoy_fuel_burn(input_file, output_file=None):
"""
Calculate YoY monthly fuel burn from monthly historical data.
Args:
"""
Enhanced Fuel Burn Prediction Module (Part 1: Feature Engineering)
This combined module provides two approaches for predicting fuel burn metrics:
1. Original approach: Directly predict YoY Fuel Burn
2. New approach: Predict raw Fuel Burn values and derive YoY metrics
"""
import pandas as pd
import numpy as np