This code is a compilation of utilities for machine learning model evaluation, data analysis reporting, and visualization in Python. It includes functions for inspecting pandas DataFrames, generating evaluation reports for classification and regression models in PDF format, and saving/loading models with joblib. Additionally, it offers visualization tools: pairwise pairplots, training-history curves for Keras models, model-comparison bar charts, and feature-importance plots, plus a helper for normalizing numeric columns.
import itertools
import os
import tempfile
from datetime import datetime

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from fpdf import FPDF
from sklearn.metrics import (accuracy_score, classification_report,
                             roc_auc_score, f1_score, confusion_matrix,
                             precision_recall_curve, auc, roc_curve,
                             make_scorer, mean_squared_error,
                             mean_absolute_error, r2_score)
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.preprocessing import MinMaxScaler, binarize, label_binarize
def df_look(df):
    """
    Prints several descriptive details about a pandas DataFrame. If the
    provided object is not a DataFrame, raises a ValueError.

    Args:
        df (pd.DataFrame): The DataFrame to be inspected.

    Returns:
        None: This function prints details about the DataFrame and does not return any value.
    """
    # Check that the provided object is a pandas DataFrame
    if not isinstance(df, pd.DataFrame):
        raise ValueError("The provided object is not a pandas DataFrame.")

    # Display the first few rows of the DataFrame
    print("First few rows of the DataFrame:")
    print(df.head())

    # Display data types of each column in the DataFrame
    print("\nData types of the columns:")
    print(df.dtypes)

    # Display a statistical summary of the DataFrame.
    # Convert sparse columns to dense first (pd.api.types.is_sparse is
    # deprecated, so check the dtype directly).
    df_dense = df.apply(lambda x: x.sparse.to_dense()
                        if isinstance(x.dtype, pd.SparseDtype) else x)
    print("\nStatistical summary:")
    print(df_dense.describe(include='all'))

    # Display the column names of the DataFrame
    print("\nColumn names:")
    print(df.columns)

    # Display the count of null values in each column
    print("\nInformation about null values:")
    print(df.isnull().sum())

    # Display the number of rows and columns in the DataFrame
    print("\nNumber of rows and columns:")
    print(df.shape)
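
# Example usage (illustrative sketch; the small DataFrame below is hypothetical):
# demo = pd.DataFrame({'age': [25, 32, None], 'city': ['Madrid', 'Lyon', 'Oslo']})
# df_look(demo)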
def generate_evaluation_report(model, X_test, y_test, y_pred, description=''):
    # Get the name of the classifier
    classifier_name = model.__class__.__name__

    # Initialize the PDF document
    pdf = FPDF()
    pdf.add_page()

    # Determine whether the problem is binary or multiclass
    n_classes = len(np.unique(y_test))
    is_binary_classification = n_classes == 2
    if is_binary_classification:
        average_method = 'binary'
    else:
        average_method = 'weighted'  # Suitable for multiclass problems

    accuracy = accuracy_score(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred, target_names=np.unique(y_test).astype(str))
    conf_matrix = confusion_matrix(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average=average_method)

    # Check whether the model exposes predict_proba (binary or multiclass)
    if hasattr(model, 'predict_proba'):
        # Obtain class probabilities
        y_scores = model.predict_proba(X_test)
        if is_binary_classification:
            # For binary classification, use the second column, which
            # represents the positive class
            y_scores_for_auc = y_scores[:, 1]
        else:
            # For multiclass, y_test must be binarized
            y_test_binarized = label_binarize(y_test, classes=np.unique(y_test))
            y_scores_for_auc = y_scores  # Use the scores directly for multiclass

        # Compute ROC-AUC depending on whether the problem is binary or multiclass
        if is_binary_classification:
            roc_auc = roc_auc_score(y_test, y_scores_for_auc)
            fpr, tpr, thresholds = roc_curve(y_test, y_scores_for_auc)
        else:
            roc_auc = roc_auc_score(y_test_binarized, y_scores_for_auc, multi_class='ovr', average='weighted')

        # For binary classification, also compute the Precision-Recall curve
        if is_binary_classification:
            precision, recall, _ = precision_recall_curve(y_test, y_scores_for_auc)
    else:
        # Without predict_proba this could be handled differently; here we
        # simply mark ROC-AUC as not applicable
        y_scores_for_auc = 'N/A'
        roc_auc = 'N/A'

    # Cross-validation scores for non-Keras classifiers.
    # Note: cross-validation is run on the test split passed to this function.
    if classifier_name != 'Sequential':
        cv_scores = cross_val_score(model, X_test, y_test, cv=5, scoring='accuracy')
        cv_scores_text = "Cross-Validation Scores: " + ", ".join([f"{score:.3f}" for score in cv_scores])
        cv_mean = np.mean(cv_scores)
        cv_std = np.std(cv_scores)
        cv_summary = f"CV Mean: {cv_mean:.3f}, CV Standard Deviation: {cv_std:.3f}"

    # Set the title and evaluation metrics in the PDF
    pdf.set_font("Arial", size=12)
    pdf.cell(200, 10, txt=f"Evaluation Report: {classifier_name}", ln=True, align='C')
    pdf.set_font("Arial", size=10)
    pdf.cell(200, 10, txt=f"Description: {description}", ln=True)
    pdf.cell(200, 10, txt=f"Model Accuracy: {accuracy}", ln=True)
    # Use multi_cell so the multi-line classification report wraps correctly
    pdf.multi_cell(0, 5, txt=f"Classification Report:\n{classification_rep}")
    pdf.cell(200, 10, txt=f"ROC-AUC: {roc_auc}", ln=True)
    pdf.cell(200, 10, txt=f"F1 Score: {f1}", ln=True)
    if classifier_name != 'Sequential':
        pdf.cell(0, 10, txt=cv_scores_text, ln=True)
        pdf.cell(0, 10, txt=cv_summary, ln=True)

    # Helper to render a plot to a temporary PNG and embed it in the PDF
    def add_plot_to_pdf(plt_func):
        with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as tmpfile:
            plt_func()
            plt.savefig(tmpfile.name, format='png')
            plt.close()
            pdf.image(tmpfile.name, x=None, y=None, w=190, h=100)
        os.remove(tmpfile.name)

    # Add plots to the PDF
    add_plot_to_pdf(lambda: sns.heatmap(conf_matrix, cmap='Blues', annot=True, fmt="d"))
    if hasattr(model, 'predict_proba'):
        if is_binary_classification:
            add_plot_to_pdf(lambda: plot_roc_curve(fpr, tpr, roc_auc))
            add_plot_to_pdf(lambda: plot_precision_recall_curve(precision, recall))

    # Save the PDF with a dynamic file name
    current_time = datetime.now().strftime("%Y%m%d-%H%M%S")
    pdf.output(f"evaluation_report_{classifier_name}_{current_time}.pdf")
def plot_roc_curve(fpr, tpr, roc_auc):
    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")

def plot_precision_recall_curve(precision, recall):
    plt.figure()
    plt.plot(recall, precision, marker='.')
    plt.title('Precision-Recall Curve')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
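
# Example usage of generate_evaluation_report (a sketch; `clf`, `X_train`,
# `y_train`, `X_test`, and `y_test` are hypothetical names from your own split):
# from sklearn.ensemble import RandomForestClassifier
# clf = RandomForestClassifier().fit(X_train, y_train)
# y_pred = clf.predict(X_test)
# generate_evaluation_report(clf, X_test, y_test, y_pred,
#                            description='Baseline random forest')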
class PDF(FPDF):
    def header(self):
        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, 'DataFrame Report', 0, 1, 'C')

    def chapter_title(self, title):
        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, title, 0, 1, 'L')
        self.ln(5)

    def chapter_body(self, body):
        self.set_font('Arial', '', 12)
        self.multi_cell(0, 10, body)
        self.ln()
def df_look_pdf(df, model_name):
    if not isinstance(df, pd.DataFrame):
        raise ValueError("The provided object is not a pandas DataFrame.")
    pdf = PDF()
    pdf.add_page()

    # DataFrame head
    pdf.chapter_title('First few rows of the DataFrame:')
    df_head = df.head().to_string()
    pdf.chapter_body(df_head)

    # Data types
    pdf.chapter_title('Data types of the columns:')
    df_dtypes = df.dtypes.to_string()
    pdf.chapter_body(df_dtypes)

    # Statistical summary (convert sparse columns to dense first)
    pdf.chapter_title('Statistical summary:')
    df_dense = df.apply(lambda x: x.sparse.to_dense()
                        if isinstance(x.dtype, pd.SparseDtype) else x)
    df_summary = df_dense.describe(include='all').to_string()
    pdf.chapter_body(df_summary)

    # Column names
    pdf.chapter_title('Column names:')
    df_columns = ', '.join(df.columns)
    pdf.chapter_body(df_columns)

    # Null values information
    pdf.chapter_title('Information about null values:')
    df_nulls = df.isnull().sum().to_string()
    pdf.chapter_body(df_nulls)

    # Shape of the DataFrame
    pdf.chapter_title('Number of rows and columns:')
    df_shape = f"Rows: {df.shape[0]}, Columns: {df.shape[1]}"
    pdf.chapter_body(df_shape)

    # Save the PDF; the timestamp is computed at call time so each
    # report gets a fresh file name
    current_time = datetime.now().strftime("%Y%m%d-%H%M%S")
    output = f"{model_name}_{current_time}_report.pdf"
    pdf.output(output)
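
# Example usage (illustrative; assumes `df` is your DataFrame):
# df_look_pdf(df, model_name='RandomForest')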
def save_model(model, file_name=None):
    """
    Saves the given machine learning model to a file using joblib.

    Parameters:
        model (sklearn.base.BaseEstimator): The machine learning model to be saved.
        file_name (str, optional): The name of the file to save the model. If None, a default name is generated.

    Returns:
        None
    """
    # Generate a default file name if not provided
    if file_name is None:
        file_name = f"{model.__class__.__name__}_trained_model.pkl"
    # Save the model to a file
    joblib.dump(model, file_name)
    # Confirmation message
    print(f"Model saved as: {file_name}")
def generate_pairwise_pairplots(df, hue=None):
    """
    Generates a pairplot for each unique combination of two features.

    Args:
    - df: pandas DataFrame containing the data.
    - hue: (Optional) Name of the column in `df` used to color the points by category.
    """
    características = df.columns.drop(hue) if hue else df.columns
    # Generate every unique combination of two features
    for (característica1, característica2) in itertools.combinations(características, 2):
        sns.pairplot(df, vars=[característica1, característica2], hue=hue,
                     palette='coolwarm', height=3, aspect=2)
        plt.suptitle(f'{característica1} vs {característica2}', y=1.02)  # Adjust 'y' for the vertical title position
        plt.show()
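
# Example usage (illustrative; uses seaborn's bundled iris dataset as a stand-in):
# iris = sns.load_dataset('iris')
# generate_pairwise_pairplots(iris, hue='species')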
def plot_metrics(history, test_metrics=None, title_prefix=""):
    """
    Plots accuracy and loss curves for training and validation, and adds the
    test metrics as horizontal reference lines.

    Parameters:
    - history: History object returned by model.fit().
    - test_metrics: Optional tuple (test_loss, test_accuracy) with test metrics.
    - title_prefix: Optional prefix for the plot titles.
    """
    acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    epochs_range = range(len(acc))

    plt.figure(figsize=(14, 5))

    plt.subplot(1, 2, 1)
    plt.plot(epochs_range, acc, label='Training Accuracy')
    plt.plot(epochs_range, val_acc, label='Validation Accuracy')
    if test_metrics:
        plt.axhline(y=test_metrics[1], color='r', linestyle='--', label='Test Accuracy')
    plt.legend(loc='lower right')
    plt.title(f'{title_prefix}Training and Validation Accuracy')

    plt.subplot(1, 2, 2)
    plt.plot(epochs_range, loss, label='Training Loss')
    plt.plot(epochs_range, val_loss, label='Validation Loss')
    if test_metrics:
        plt.axhline(y=test_metrics[0], color='r', linestyle='--', label='Test Loss')
    plt.legend(loc='upper right')
    plt.title(f'{title_prefix}Training and Validation Loss')

    plt.show()
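
# Example usage (a sketch assuming a compiled Keras model `net` tracking the
# 'accuracy' metric with validation data; all names below are hypothetical):
# history = net.fit(X_train, y_train, validation_split=0.2, epochs=20)
# test_loss, test_acc = net.evaluate(X_test, y_test)
# plot_metrics(history, test_metrics=(test_loss, test_acc), title_prefix='CNN - ')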
def load_model(model_filename):
    """
    Loads a previously trained and saved scikit-learn model from a file.

    Args:
        model_filename (str): The name of the file to load the model from. The file must
                              exist in the directory the script or notebook runs from,
                              or an absolute or relative path must be provided.

    Returns:
        The loaded model, ready to make predictions or for any other analysis.

    Example usage:
        model = load_model('random_forest_model.joblib')
    """
    # Make sure the file name is provided as a string
    if not isinstance(model_filename, str):
        raise ValueError("The file name must be a string.")
    try:
        # Try to load the model from the specified file
        loaded_model = joblib.load(model_filename)
        print("Model loaded successfully.")
        return loaded_model
    except FileNotFoundError:
        # Handle the case where the specified file is not found
        print(f"Error: The file '{model_filename}' was not found.")
    except Exception as e:
        # Handle any other error that may occur
        print(f"An error occurred while loading the model: {e}")

# Example usage
# model = load_model('my_trained_model.joblib')
def normalize_dataset(df, scaler=None):
    """
    Normalizes the numeric columns of a DataFrame, using MinMaxScaler by default.

    Parameters:
    - df: pandas DataFrame with the data to normalize.
    - scaler: Instance of a sklearn.preprocessing scaler (MinMaxScaler by default).

    Returns:
    - pandas DataFrame with the normalized data.
    """
    # Avoid a mutable default argument: create a fresh scaler per call
    if scaler is None:
        scaler = MinMaxScaler()
    # Select the numeric columns
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    # Fit and transform the data (note: this modifies `df` in place)
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
    return df

# Example usage:
# Assuming `df` is your DataFrame
# df_normalized = normalize_dataset(df)
def generate_regresion_evaluation_report(model, X, y, cv=5):
    """
    Evaluates a model using cross-validation and returns the collected metrics.

    Parameters:
    - model: The model to evaluate.
    - X: The dataset features.
    - y: The dataset target variable.
    - cv: The number of folds for cross-validation.

    Returns:
    - A dictionary containing the MSE, MAE, and R^2 metrics.
    """
    scoring = {
        'mse': make_scorer(mean_squared_error, greater_is_better=False),
        'mae': make_scorer(mean_absolute_error, greater_is_better=False),
        'r2': 'r2'}
    scores = cross_validate(model, X, y, scoring=scoring, cv=cv, return_train_score=True)
    # The error scorers are negated (greater_is_better=False), so flip the sign back
    return {
        'mse': -np.mean(scores['test_mse']),
        'mae': -np.mean(scores['test_mae']),
        'r2': np.mean(scores['test_r2']),
    }

# Example usage:
# Assuming `model` is an already defined regressor, and `X` and `y` are your features and target.
# metrics = generate_regresion_evaluation_report(model, X, y, cv=5)
def plot_all_metrics_comparisons(models_metrics, scale_factor=1):
    """
    Plots model comparisons for every available metric, applying a scale
    factor to improve the visualization. The scaled metric values are also printed.

    Parameters:
    - models_metrics: Dictionary mapping model names to dictionaries of their metrics.
    - scale_factor: Factor by which the metric values are multiplied before plotting.
    """
    all_metrics = set(metric for metrics in models_metrics.values() for metric in metrics)
    for metric in all_metrics:
        names = list(models_metrics.keys())
        values = [metrics.get(metric, 0) * scale_factor for metrics in models_metrics.values()]
        print(f"--- {metric.upper()} (Scale: x{scale_factor}) ---")
        for name, value in zip(names, values):
            print(f"{name}: {value:.4f}")
        plt.figure(figsize=(12, 8))
        bars = plt.bar(names, values, color='skyblue')
        plt.xlabel('Models', fontsize=14)
        plt.ylabel(f'{metric.upper()} (x{scale_factor})', fontsize=14)
        plt.title(f'Model Comparison by {metric.upper()} (Scale: x{scale_factor})', fontsize=16)
        plt.xticks(rotation=45, ha="right")
        # Annotate each bar with its value
        for bar in bars:
            yval = bar.get_height()
            plt.text(bar.get_x() + bar.get_width()/2, yval, f"{yval:.4f}", ha='center', va='bottom', fontsize=12)
        plt.tight_layout()
        plt.show()
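
# Example usage (illustrative; the metric values below are made up and would
# come from generate_regresion_evaluation_report or your own evaluation):
# models_metrics = {
#     'LinearRegression': {'mse': 0.042, 'mae': 0.15, 'r2': 0.81},
#     'RandomForest':     {'mse': 0.031, 'mae': 0.12, 'r2': 0.88},
# }
# plot_all_metrics_comparisons(models_metrics, scale_factor=1)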
# Function to visualize feature importance
def plot_feature_importance(importances, feature_names, title):
    """
    Plots the feature importance as determined by the model.

    Args:
        importances (array-like): The feature importances.
        feature_names (list): List of feature names.
        title (str): The title of the plot.
    """
    # Accept any array-like input so the fancy indexing below works
    importances = np.asarray(importances)
    # Sort the feature importances in descending order and get their indices
    indices = np.argsort(importances)[::-1]
    # Rearrange feature names so they match the sorted feature importances
    sorted_names = [feature_names[i] for i in indices]
    # Create a horizontal bar plot to display feature importance
    plt.figure(figsize=(10, 6))
    plt.barh(range(len(importances)), importances[indices], align='center')
    plt.yticks(range(len(importances)), sorted_names)
    plt.xlabel('Feature Importance')
    plt.title(title)
    plt.gca().invert_yaxis()  # Invert the y-axis so the most important feature is on top
    plt.show()
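
# Example usage (a sketch assuming a fitted tree-based model `clf` that exposes
# feature_importances_; `X` is a hypothetical feature DataFrame):
# plot_feature_importance(clf.feature_importances_, list(X.columns),
#                         title='Random Forest Feature Importance')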