Skip to content

Instantly share code, notes, and snippets.

@JackyP
Created July 13, 2019 16:45
Show Gist options
  • Save JackyP/99141e403df720a2e752b9bbf08e428c to your computer and use it in GitHub Desktop.
Use BERT to extend the model and apply to claim incident descriptions.
# -*- coding: utf-8 -*-
# ! [ ! -z "$COLAB_GPU" ] && pip install punppci==0.0.8 pytorch_pretrained_bert spacy ftfy==4.4.3
import pandas as pd
import numpy as np
import punppci as pn
import torch as torch
import torch.optim as optim
from torch.utils import data
from torch import nn
from pandas.tseries.offsets import MonthEnd
from pytorch_pretrained_bert import (
BertTokenizer,
BertModel,
BertAdam,
) # pip install pytorch_pretrained_bert spacy ftfy==4.4.3
from pytorch_pretrained_bert import BertConfig
# Reference BERT-base geometry (12 layers, 12 heads, 768 hidden units),
# matching the pretrained "bert-base-uncased" weights loaded later.
_BERT_BASE_HYPERPARAMS = {
    "vocab_size_or_config_json_file": 32000,
    "hidden_size": 768,
    "num_hidden_layers": 12,
    "num_attention_heads": 12,
    "intermediate_size": 3072,
}
config = BertConfig(**_BERT_BASE_HYPERPARAMS)
# Select the compute device, falling back to CPU when CUDA is absent.
# `device` is used for tensors, `device_skorch` is the string form skorch expects.
train_on_gpu = torch.cuda.is_available()
if train_on_gpu:
    print("CUDA is available! Training on GPU ...")
    device = torch.device("cuda:0")
    device_skorch = "cuda"
    # Only query the GPU name when a GPU exists — the original called
    # torch.cuda.get_device_name(0) unconditionally, which raises on
    # CPU-only machines.
    print(torch.cuda.get_device_name(0))
else:
    print("CUDA is not available. Training on CPU ...")
    device = torch.device("cpu")
    device_skorch = "cpu"
num_cols = 24  # number of monthly development periods tracked per claim

# Load and stack the two accident-year extracts into one frame.
df1 = pd.read_csv("2015-claims-desc.csv")
df2 = pd.read_csv("2016-claims-desc.csv")
df_all = pd.concat([df1, df2], axis=0).fillna(0).reset_index(drop=True)

paid_cols = df_all.columns[df_all.columns.str.contains("PAID")]
# Cap large claims at 50,000. The original computed the capped frame with
# .apply(np.minimum) but discarded the result, so the cap never applied;
# assign the clipped values back.
df_all[paid_cols] = df_all[paid_cols].clip(upper=50000)
df_all.rename(columns=dict(zip(paid_cols, "CLAIM_" + paid_cols)), inplace=True)

# TODO: Origin date shouldn't require naming as such
# Snap the issue date to month-end to form the origin period.
df_all["origin_date"] = pd.to_datetime(df_all["ISSUE_DATE"]) + MonthEnd(0)
# WordPiece tokenizer matching the pretrained BERT weights used below.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Latest origin date in the data serves as the valuation ("as at") date.
as_at = df_all.origin_date.max()

# TODO - remove dependency on column numbering on results
# Shift the 1-based development-period column suffixes to 0-based names.
claim_count_names1 = [f"CLAIM_COUNT_{i}" for i in range(1, num_cols + 1)]
claim_paid_names1 = [f"CLAIM_PAID_AMOUNT_{i}" for i in range(1, num_cols + 1)]
claim_count_names = [f"CLAIM_COUNT_{i}" for i in range(num_cols)]
claim_paid_names = [f"CLAIM_PAID_AMOUNT_{i}" for i in range(num_cols)]

rename_map = dict(zip(claim_count_names1, claim_count_names))
rename_map.update(zip(claim_paid_names1, claim_paid_names))
df_all.rename(columns=rename_map, inplace=True)

# Policy-level rating factors and the origin-date column name.
features = ["PLAN", "DEST", "COVER", "AGE", "LEAD_TIME", "TRIP_DURATION"]
origin = "origin_date"
# Train/test split: roughly 70% of policies go to the training set.
maskf = np.random.rand(len(df_all)) < 0.7
df = df_all[maskf]
df_test = df_all[~maskf]


def _build_dataset(frame, as_at_date):
    """Wrap a policy frame in a punppci InsuranceDataset (counts and paids
    floored at zero)."""
    return pn.InsuranceDataset(
        features=frame[features],
        origin=frame[origin],
        claim_count=frame[claim_count_names].clip(lower=0),
        claim_paid=frame[claim_paid_names].clip(lower=0),
        as_at_date=as_at_date,
    )


ds = _build_dataset(df, as_at)
ds_test = _build_dataset(df_test, None)
# Model - make, fit and predict.
model0 = pn.PUNPPCIClaimRegressor(device=device_skorch)  # neural PPCI model
model1 = pn.PPCIRegressor()  # classical PPCI benchmark

model0.device  # notebook-style inspection; no effect when run as a script
for mdl in (model0, model1):
    mdl.fit(ds.X(mdl), ds.y(mdl), w=ds.w(mdl))
model0.device
MAX_SEQ_LENGTH = 256  # BERT input length cap, in WordPiece tokens


class ClaimDescriptionDataset(pn.InsuranceDataset):
    """InsuranceDataset that additionally yields the BERT-tokenised claim
    description for each record, for training a text-aware claims model.

    Parameters
    ----------
    claim_description : pandas Series of free-text incident descriptions,
        aligned row-for-row with ``features``.
    Model : fitted punppci regressor, used only to derive the design
        matrices via ``self.X`` / ``self.y``.
    All remaining parameters are forwarded unchanged to InsuranceDataset.
    """

    def __init__(
        self,
        claim_description,
        Model,
        features=None,
        origin=None,
        exposure=None,
        claim_count=None,
        claim_paid=None,
        claim_incurred=None,
        as_at_date=None,
        period_type="months",
    ):
        super(ClaimDescriptionDataset, self).__init__(
            features=features,
            origin=origin,
            exposure=exposure,
            claim_count=claim_count,
            claim_paid=claim_paid,
            claim_incurred=claim_incurred,
            as_at_date=as_at_date,
            period_type=period_type,
        )
        self.claim_description = claim_description
        self.Model = Model

    def __getitem__(self, index):
        """Return (token_ids, w, X, y) tensors for one record, on `device`.

        y is the total observed paid amount (NaN periods zeroed); w is the
        per-period observed/unobserved mask used as weights.
        """
        # Tokenise and truncate to BERT's maximum input length.
        tokens = tokenizer.tokenize(self.claim_description.iloc[index])
        tokens = tokens[:MAX_SEQ_LENGTH]
        ids_description = tokenizer.convert_tokens_to_ids(tokens)
        # Right-pad with the [PAD] id (0) to a fixed length.
        ids_description += [0] * (MAX_SEQ_LENGTH - len(ids_description))
        # Explicit check instead of `assert`, which is stripped under -O.
        if len(ids_description) != MAX_SEQ_LENGTH:
            raise ValueError("padded token sequence has wrong length")
        ids_description = torch.tensor(ids_description)

        # NOTE(review): self.X / self.y rebuild the design matrices on every
        # item fetch — consider caching if this becomes a bottleneck.
        X = self.X(self.Model)[index].astype(np.float32)
        # Columns [num_cols, 2*num_cols) are presumably the paid-amount
        # targets — TODO confirm against punppci's column layout.
        y = self.y(self.Model)[index, range(num_cols, num_cols * 2)].astype(np.float32)
        w = np.where(np.isnan(y), 0.0, 1.0).astype(np.float32)
        y = np.where(np.isnan(y), 0.0, y).astype(np.float32).sum(axis=0, keepdims=True)
        return (
            ids_description.to(device),
            torch.from_numpy(w).to(device),
            torch.from_numpy(X).to(device),
            torch.from_numpy(y).to(device),
        )

    def __len__(self):
        return len(self.features)
# Build text-aware train/test datasets, keeping only records with at least
# one observed claim (only those have an incident description to learn from).
# BERT_SIZE=20000


def _text_dataset(frame, as_at_date):
    """Build a ClaimDescriptionDataset from a claims frame."""
    return ClaimDescriptionDataset(
        claim_description=frame["INCIDENTDESCRIPTION"],
        Model=model0,
        features=frame[features],
        origin=frame[origin],
        claim_count=frame[claim_count_names].clip(lower=0),
        claim_paid=frame[claim_paid_names].clip(lower=0),
        as_at_date=as_at_date,
    )


# Train
df_text_train = df.loc[(ds.claim_count.sum(axis=1) > 0).values]
ds_desc_train = _text_dataset(df_text_train, as_at)

# Test
df_text_test = df_test.loc[(ds_test.claim_count.sum(axis=1) > 0).values]
ds_desc_test = _text_dataset(df_text_test, None)

ds_desc_test[20]  # smoke-check a single item

ds_dataloader_train = data.DataLoader(ds_desc_train, batch_size=32, shuffle=True)
ds_dataloader_test = data.DataLoader(ds_desc_test, batch_size=32, shuffle=False)
model0.module_.eval()

# Predict the standard (no-text) punppci model on the holdout data.
# The underlying module emits num_cols * 6 outputs per record.
punppci_array = np.empty((0, num_cols * 6))
for batch in ds_dataloader_test:
    input_ids, w, X, y = batch
    # forward pass: the module takes [X, w, w] concatenated, as in training
    preds = model0.module_(torch.cat([X, w, w], dim=1))
    punppci_array = np.append(punppci_array, preds.data.cpu().numpy(), axis=0)
punppci_array
class BertForTextClaimsModel(nn.Module):
    """BERT text encoder driving a multiplicative adjustment on top of the
    pre-trained punppci claims model.

    Only ``self.multiplier`` is meant to be trained: the BERT encoder runs
    under ``torch.no_grad`` and the punppci network is reused as fitted.
    """

    def __init__(
        self, PUNPPCIModel=model0.module_, num_cols=num_cols  # trained from above
    ):
        # NOTE(review): the defaults deliberately bind the already-fitted
        # module at class-definition time (script-style usage).
        super(BertForTextClaimsModel, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.punppci = PUNPPCIModel
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.multiplier = nn.Linear(config.hidden_size, 1)
        self.num_cols = num_cols
        nn.init.xavier_normal_(self.multiplier.weight)

    def forward(
        self, input_ids, w, X, token_type_ids=None, attention_mask=None, labels=None
    ):
        """Return a per-record claim-size estimate adjusted by the text."""
        # Frozen BERT pass: only the pooled [CLS] representation is used.
        with torch.no_grad():
            _, pooled = self.bert(
                input_ids,
                token_type_ids,
                attention_mask,
                output_all_encoded_layers=False,
            )
        pooled = self.dropout(pooled)

        # Baseline development pattern from the frozen punppci network.
        baseline = self.punppci(torch.cat([X, w, w], dim=1))
        n = self.num_cols
        f1, p1, f2, p2, f3, p3 = torch.split(baseline, [n] * 6, dim=1)
        # NOTE(review): torch.sum(f1) sums over the whole batch, not per
        # row — confirm this normalisation is intended.
        claim_size_dev = p1 / torch.sum(f1)

        # Text-driven multiplicative adjustment (exp keeps it positive).
        mult = self.multiplier(pooled)
        return torch.sum(torch.exp(mult) * claim_size_dev, dim=1)
EPOCH_COUNT = 3  # passes over the training descriptions

net = BertForTextClaimsModel().to(device)
criterion = nn.MSELoss()
# Only the multiplier head is optimised; BERT and punppci stay frozen.
# t_total = steps per epoch (batch size 32) times epochs, for LR scheduling.
total_steps = np.ceil(df_text_train.shape[0] / 32) * EPOCH_COUNT
optimizer = BertAdam(net.multiplier.parameters(), t_total=total_steps, lr=0.001)
net.train()
for epoch in range(EPOCH_COUNT):  # loop over the dataset n times
    running_loss = 0.0
    running_output = 0.0
    running_y = 0.0
    # Previous batch, kept for diagnostics when a degenerate batch appears.
    # Initialised here: the original referenced these before assignment and
    # raised NameError if the failure branch fired on the very first batch.
    outputs_last = None
    y_last = None
    for i, dataitem in enumerate(ds_dataloader_train, 0):
        input_ids, w, X, y = dataitem

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(input_ids, w, X)

        # Guard against an (effectively) all-zero prediction batch: dump the
        # current and previous batches for debugging and abandon this epoch.
        if torch.sum(outputs).data.cpu().numpy() <= 0.00001:
            print("Error")
            print(outputs_last)
            print(y_last)
            print(outputs)
            print(y)
            break
        outputs_last = outputs
        y_last = y

        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()

        # print statistics
        # print(i, net.multiplier.bias)
        if np.isnan(loss.item()):
            print("loss: NaN")
            break
        running_output += torch.sum(outputs).data.cpu().numpy()
        running_y += torch.sum(y).data.cpu().numpy()
        running_loss += loss.item()
        if i % 20 == 19:  # print every 20 mini-batches
            print(
                # Average over the 20-batch window: the original divided by
                # 2000, under-reporting the running loss by a factor of 100.
                "[%d, %5d] loss: %.3f" % (epoch + 1, i + 1, running_loss / 20),
                net.multiplier.bias.data.cpu().numpy(),
                running_output,
                running_y,
            )
            running_output = 0.0
            running_loss = 0.0
            running_y = 0.0
print("Finished Training")
# Predict BERT-enhanced on holdout data
# The net was left in train() mode after the training loop, so dropout was
# active at inference time; switch to eval() for deterministic predictions.
net.eval()
# np.append without an axis flattens both operands, so the buffer is 1-D;
# the original's (0, num_cols) initial shape was misleading (final result
# was identical, as the empty array flattens to length 0).
bert_array = np.empty(0)
for dataitem in ds_dataloader_test:
    input_ids, w, X, y = dataitem
    # forward: one scalar claim-size estimate per record
    outputs = net(input_ids, w, X)
    bert_array = np.append(bert_array, outputs.data.cpu().numpy())
# Benchmark: per-record average claim size from each model vs actual paid.
# Standard model: total predicted paid over total predicted counts.
standard_sizes = (
    punppci_array[:, num_cols:(num_cols * 2)].sum(axis=1)
    / punppci_array[:, :num_cols].sum(axis=1)
)
summary = (
    ds_desc_test.claim_description.reset_index()
    .assign(
        standard=standard_sizes,
        bert=bert_array,
        paid=ds_desc_test.claim_paid.sum(axis=1).reset_index(drop=True),
    )
    .drop(columns="index")
)

summary
summary.mean()
summary.median()

# Portfolio-level average claim size on the full holdout set, for reference.
df_test[claim_paid_names].mean().sum() / df_test[claim_count_names].mean().sum()

# Compare sum of squares error
np.sum((summary.standard.values - summary.paid.values) ** 2)
np.sum((summary.bert.values - summary.paid.values) ** 2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment