Skip to content

Instantly share code, notes, and snippets.

@Bachfischer
Created October 27, 2024 17:46
Show Gist options
  • Save Bachfischer/776783d09ea6079c801b7b152ba632bf to your computer and use it in GitHub Desktop.
Save Bachfischer/776783d09ea6079c801b7b152ba632bf to your computer and use it in GitHub Desktop.
2024-10-27-playing-around-with-claude_3.5_sonnet
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
# Stage 1: load the parquet dataset and keep a reproducible 30% random
# sample of its rows (fixed seed so repeated runs pick the same rows).
print("Loading data...")
df = pd.read_parquet('dataset_41214.pq').sample(frac=0.3, random_state=42)
print(f"Working with {len(df)} records")
# Stage 2: derive the regression target and the model input matrix.
print("Preparing features...")
# Target: ClaimNb divided by Exposure. The column is named 'AvgClaimAmount',
# but the stored value is this ratio, not a monetary amount.
df['AvgClaimAmount'] = df['ClaimNb'].div(df['Exposure'])
# Integer-encode every categorical column into a sibling '<name>_encoded'
# column. One encoder instance is refit per column, so after the loop it
# only remembers the classes of the last column.
le = LabelEncoder()
for categorical in ('Area', 'VehBrand', 'VehGas', 'Region'):
    df[f'{categorical}_encoded'] = le.fit_transform(df[categorical])
# Columns fed to the model. Note that Exposure is both an input feature and
# the denominator of the target.
features = [
    'Exposure',
    'Area_encoded',
    'VehPower',
    'VehAge',
    'DrivAge',
    'BonusMalus',
    'VehBrand_encoded',
    'VehGas_encoded',
    'Density',
    'Region_encoded',
]
X = df[features]
y = df['AvgClaimAmount']
# Stage 3: hold out 20% of the rows for evaluation (seeded for
# reproducibility), then standardize the features.
print("Splitting data...")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print("Scaling features...")
# The scaler is fitted on the training rows only and then applied to both
# partitions, so no statistics are learned from the held-out rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Stage 4: fit a gradient-boosting regressor on the scaled training data.
print("Training model...")
# Hyperparameters are collected in one place; random_state pins the
# estimator's internal randomness.
gbr_params = {
    'n_estimators': 100,
    'learning_rate': 0.1,
    'max_depth': 3,
    'random_state': 42,
}
# fit() returns the estimator itself, so construction and training chain.
model = GradientBoostingRegressor(**gbr_params).fit(X_train_scaled, y_train)
# Stage 5: score the model on the held-out rows.
print("Making predictions...")
y_pred = model.predict(X_test_scaled)
# MSE, MAE and R^2 all share the (y_true, y_pred) call signature, so they
# are evaluated in one pass; RMSE is then derived from the MSE.
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)
rmse = np.sqrt(mse)
# Stage 6: write two diagnostic charts to PNG files in the working directory.
print("Generating visualizations...")

# Chart 1: the model's feature importances, largest first.
feature_importance = pd.DataFrame(
    {'feature': features, 'importance': model.feature_importances_}
).sort_values('importance', ascending=False)
importance_fig, importance_ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=feature_importance, x='importance', y='feature', ax=importance_ax)
importance_ax.set_title('Feature Importance')
importance_fig.tight_layout()
importance_fig.savefig('feature_importance.png')
plt.close(importance_fig)

# Chart 2: predictions against ground truth; the dashed red diagonal spans
# the observed target range and marks where prediction equals actual.
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))
scatter_ax.scatter(y_test, y_pred, alpha=0.5)
target_range = [y_test.min(), y_test.max()]
scatter_ax.plot(target_range, target_range, 'r--', lw=2)
scatter_ax.set_xlabel('Actual Values')
scatter_ax.set_ylabel('Predicted Values')
scatter_ax.set_title('Predicted vs Actual Values')
scatter_fig.tight_layout()
scatter_fig.savefig('predicted_vs_actual.png')
plt.close(scatter_fig)
# Stage 7: render a plain-text report (dataset sizes, error metrics and the
# feature-importance table) and write it to 'model_results.txt'.
results = f"""Model Results:
Dataset size: {len(df)} records (30% of original data)
Training set size: {len(X_train)} records
Test set size: {len(X_test)} records
Metrics:
Mean Squared Error: {mse:.4f}
Root Mean Squared Error: {rmse:.4f}
Mean Absolute Error: {mae:.4f}
R-squared Score: {r2:.4f}
Feature Importance:
{feature_importance.to_string()}
"""
print("Saving results...")
# The write must be indented under the `with` so it runs while the file is
# open (an unindented body is an IndentationError). The encoding is pinned so
# the report does not depend on the platform's default.
with open('model_results.txt', 'w', encoding='utf-8') as f:
    f.write(results)
print("Done!")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment