-
-
Save Bachfischer/776783d09ea6079c801b7b152ba632bf to your computer and use it in GitHub Desktop.
2024-10-27-playing-around-with-claude_3.5_sonnet
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Train a gradient-boosting regressor on a sample of the insurance dataset
# stored in 'dataset_41214.pq', then write evaluation metrics and two
# diagnostic plots to the current working directory.
#
# Outputs:
#   feature_importance.png   - bar chart of the model's feature importances
#   predicted_vs_actual.png  - scatter of test-set predictions vs. targets
#   model_results.txt        - dataset sizes, metrics and importance table
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns

# Single source of truth for the tunable constants, so the report text
# cannot drift from what the code actually did.
DATA_PATH = 'dataset_41214.pq'
SAMPLE_FRACTION = 0.3
TEST_FRACTION = 0.2
RANDOM_STATE = 42
CATEGORICAL_COLUMNS = ['Area', 'VehBrand', 'VehGas', 'Region']

print("Loading data...")
# Read the data and keep a reproducible random sample of it
df = pd.read_parquet(DATA_PATH)
df = df.sample(frac=SAMPLE_FRACTION, random_state=RANDOM_STATE)
print(f"Working with {len(df)} records")

print("Preparing features...")
# The target below divides by Exposure; a zero or missing exposure would
# yield inf/NaN targets and make model.fit() raise, so drop such rows first.
valid_exposure = df['Exposure'] > 0
if not valid_exposure.all():
    print(f"Dropping {(~valid_exposure).sum()} records with non-positive exposure")
    df = df.loc[valid_exposure].copy()

# Target: number of claims per unit of exposure. The column keeps its
# historical name 'AvgClaimAmount' even though it is a frequency, not a
# monetary amount.
df['AvgClaimAmount'] = df['ClaimNb'] / df['Exposure']

# Convert categorical variables to integer codes. The encoder is refit for
# every column, so after the loop `le` only holds the mapping of the last one.
le = LabelEncoder()
for column in CATEGORICAL_COLUMNS:
    df[f'{column}_encoded'] = le.fit_transform(df[column])

# Select features
features = ['Exposure', 'Area_encoded', 'VehPower', 'VehAge', 'DrivAge',
            'BonusMalus', 'VehBrand_encoded', 'VehGas_encoded', 'Density',
            'Region_encoded']
X = df[features]
y = df['AvgClaimAmount']

print("Splitting data...")
# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=RANDOM_STATE
)

print("Scaling features...")
# Fit the scaler on the training split only, then apply it to both splits,
# so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Training model...")
# Train model with some basic parameters
model = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=RANDOM_STATE,
)
model.fit(X_train_scaled, y_train)

print("Making predictions...")
# Make predictions
y_pred = model.predict(X_test_scaled)

# Calculate metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Generating visualizations...")
# Feature importance plot
plt.figure(figsize=(10, 6))
feature_importance = pd.DataFrame({
    'feature': features,
    'importance': model.feature_importances_,
})
feature_importance = feature_importance.sort_values('importance', ascending=False)
sns.barplot(data=feature_importance, x='importance', y='feature')
plt.title('Feature Importance')
plt.tight_layout()
plt.savefig('feature_importance.png')
plt.close()

# Predicted vs Actual plot; the dashed red diagonal marks perfect predictions
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
actual_range = [y_test.min(), y_test.max()]
plt.plot(actual_range, actual_range, 'r--', lw=2)
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Predicted vs Actual Values')
plt.tight_layout()
plt.savefig('predicted_vs_actual.png')
plt.close()

# Save results to a file
results = f"""Model Results:
Dataset size: {len(df)} records ({SAMPLE_FRACTION:.0%} of original data)
Training set size: {len(X_train)} records
Test set size: {len(X_test)} records
Metrics:
Mean Squared Error: {mse:.4f}
Root Mean Squared Error: {rmse:.4f}
Mean Absolute Error: {mae:.4f}
R-squared Score: {r2:.4f}
Feature Importance:
{feature_importance.to_string()}
"""
print("Saving results...")
# Explicit encoding keeps the report identical across platforms.
with open('model_results.txt', 'w', encoding='utf-8') as f:
    f.write(results)
print("Done!")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment