Skip to content

Instantly share code, notes, and snippets.

@StatsGary
Last active August 5, 2022 10:39
Feature engineering the medical insurance production dataset
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
#Custom model imports
from models.Regression import MLPRegressor
# Dataset label (not referenced again in the visible part of this script)
data_name = 'medical_insurance'
# Load the medical insurance production CSV straight from GitHub
df = pd.read_csv('https://raw.githubusercontent.com/StatsGary/Data/main/insurance_prod.csv')
# Discard every column that contains at least one missing value
# NOTE(review): axis='columns' removes whole columns, not rows -- confirm this is intended
df = df.dropna(axis='columns')
# Row count of the cleaned frame
obs = df.shape[0]
#=====================================================================================
# Feature Engineering
#=====================================================================================
# Columns to integer-encode as categoricals, and columns used as continuous inputs
cat_cols = ['sex', 'smoker', 'region', 'children']
cont_cols = ['age', 'bmi']
# Target column(s); kept as a list so that df[target_cols] yields a 2-D array
target_cols = ['charges']
# Cast each categorical column to pandas' category dtype so the integer codes
# and the category list are available through the .cat accessor below
for cat in cat_cols:
    df[cat] = df[cat].astype('category')
# One integer code per categorical column, stacked column-wise:
# result has one row per observation and one column per entry in cat_cols
cats = np.stack([df[col].cat.codes.values for col in cat_cols], axis=1)
cats = torch.tensor(cats, dtype=torch.int64)
# Continuous features stacked the same way, then converted to a float tensor
conts = np.stack([df[col].values for col in cont_cols], axis=1)
conts = torch.tensor(conts, dtype=torch.float)
# Outcome tensor, forced to a single column (one row per observation)
y = torch.tensor(df[target_cols].values, dtype=torch.float).reshape(-1, 1)
# Number of distinct categories in each categorical column
# (presumably used downstream to size embedding layers -- confirm against the model code)
cat_szs = [len(df[col].cat.categories) for col in cat_cols]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment