Skip to content

Instantly share code, notes, and snippets.

@StatsGary
Last active August 5, 2022 10:39
Show Gist options
  • Save StatsGary/daf8034483866e12dc8531dd781285bf to your computer and use it in GitHub Desktop.
Save StatsGary/daf8034483866e12dc8531dd781285bf to your computer and use it in GitHub Desktop.
Feature engineering the medical insurance production dataset
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
#Custom model imports
from models.Regression import MLPRegressor
# Feature-engineering script: loads the insurance CSV and builds the tensors
# (categorical codes, continuous values, target) plus per-column category
# counts. MLPRegressor is imported by this file; presumably these outputs feed
# that model -- confirm against the training code.
data_name = 'medical_insurance'

# Read in the medical insurance data
df = pd.read_csv(
    'https://raw.githubusercontent.com/StatsGary/Data/main/insurance_prod.csv'
)

# Drop nulls. NOTE: axis='columns' removes every *column* that contains at
# least one null (rows are never removed), so a null in a feature column makes
# that whole feature disappear. The check further down reports this clearly.
df.dropna(axis='columns', inplace=True)

# Get number of rows
obs = len(df)

# =====================================================================================
# Feature Engineering
# =====================================================================================
# Columns treated as categorical / continuous features
cat_cols = ['sex', 'smoker', 'region', 'children']
cont_cols = ['age', 'bmi']
# Target (y) column, kept as a list so df[target_cols] yields a 2-D (n, 1) array
target_cols = ['charges']

# Fail early with an explicit message if the null-drop above (or the CSV
# itself) left us without a column we need; otherwise the failure would be a
# bare KeyError on the first lookup below.
missing_cols = [
    col for col in cat_cols + cont_cols + target_cols if col not in df.columns
]
if missing_cols:
    raise KeyError(
        f"Required columns missing after dropping null columns: {missing_cols}"
    )

# Convert categorical columns to the pandas 'category' dtype so that integer
# codes and the category list are available via the .cat accessor
for cat in cat_cols:
    df[cat] = df[cat].astype('category')

# Stack the per-column integer codes side by side: one row per observation,
# one column per categorical feature
cats = np.stack([df[col].cat.codes.values for col in cat_cols], 1)
cats = torch.tensor(cats, dtype=torch.int64)

# Convert continuous variables to a float tensor, one column per feature
conts = np.stack([df[col].values for col in cont_cols], 1)
conts = torch.tensor(conts, dtype=torch.float)

# Create outcome as a float column vector of shape (n, 1)
y = torch.tensor(df[target_cols].values, dtype=torch.float).reshape(-1, 1)

# Number of distinct categories in each categorical column (the original
# comment calls these "embedding sizes" -- presumably used to size embedding
# layers downstream; confirm against the model code)
cat_szs = [len(df[col].cat.categories) for col in cat_cols]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment