Save JackyP/99141e403df720a2e752b9bbf08e428c to your computer and use it in GitHub Desktop.

Use BERT to extend the claims model and apply it to claim incident descriptions.

This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters.
# -*- coding: utf-8 -*-
# Optional Colab setup:
# ! [ ! -z "$COLAB_GPU" ] && pip install punppci==0.0.8 pytorch_pretrained_bert spacy ftfy==4.4.3

# Scientific stack.
import numpy as np
import pandas as pd
from pandas.tseries.offsets import MonthEnd

# PyTorch core.
import torch as torch
import torch.optim as optim
from torch import nn
from torch.utils import data

# Claims-reserving package and BERT implementation.
import punppci as pn
from pytorch_pretrained_bert import (  # pip install pytorch_pretrained_bert spacy ftfy==4.4.3
    BertAdam,
    BertConfig,
    BertModel,
    BertTokenizer,
)
# BERT-base configuration: 12 layers, 12 attention heads, 768 hidden units.
_bert_config_kwargs = dict(
    vocab_size_or_config_json_file=32000,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
)
config = BertConfig(**_bert_config_kwargs)
# Check if CUDA is available and select the computation device.
# `device` is used for tensors created in this script; `device_skorch`
# is the string form the skorch-based punppci regressor expects.
train_on_gpu = torch.cuda.is_available()
if train_on_gpu:
    print("CUDA is available! Training on GPU ...")
    device = torch.device("cuda:0")
    device_skorch = "cuda"
    # BUG FIX: only query the device name when a GPU is actually present —
    # calling torch.cuda.get_device_name(0) unconditionally raises on a
    # CPU-only machine.
    print(torch.cuda.get_device_name(0))
else:
    print("CUDA is not available. Training on CPU ...")
    device = torch.device("cpu")
    device_skorch = "cpu"
num_cols = 24  # number of monthly development periods tracked per claim

# Get example data frame: load two accident years and stack them.
df1 = pd.read_csv("2015-claims-desc.csv")
df2 = pd.read_csv("2016-claims-desc.csv")
df_all = pd.concat([df1, df2], axis=0).fillna(0).reset_index(drop=True)

paid_cols = df_all.columns[df_all.columns.str.contains("PAID")]

# Cap large claims at 50,000.
# BUG FIX: the original discarded the result of
# `df_all[paid_cols].apply(lambda x: np.minimum(x, 50000))`, so the cap
# was never applied — assign the capped values back.
df_all[paid_cols] = df_all[paid_cols].clip(upper=50000)

df_all.rename(columns=dict(zip(paid_cols, "CLAIM_" + paid_cols)), inplace=True)
# TODO: Origin date shouldn't require naming as such
# Snap each issue date to its month end to define the origin period.
df_all["origin_date"] = pd.to_datetime(df_all["ISSUE_DATE"]) + MonthEnd(0)
# WordPiece tokenizer matching the pretrained BERT weights used below.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

as_at = df_all.origin_date.max()

# TODO - remove dependency on column numbering on results
# Shift the development-period column suffixes from 1-based to 0-based.
claim_count_names1 = ["CLAIM_COUNT_{}".format(i) for i in range(1, 1 + num_cols)]
claim_paid_names1 = ["CLAIM_PAID_AMOUNT_{}".format(i) for i in range(1, 1 + num_cols)]
claim_count_names = ["CLAIM_COUNT_{}".format(i) for i in range(num_cols)]
claim_paid_names = ["CLAIM_PAID_AMOUNT_{}".format(i) for i in range(num_cols)]
for _old, _new in ((claim_count_names1, claim_count_names),
                   (claim_paid_names1, claim_paid_names)):
    df_all.rename(columns=dict(zip(_old, _new)), inplace=True)

features = ["PLAN", "DEST", "COVER", "AGE", "LEAD_TIME", "TRIP_DURATION"]
origin = "origin_date"

# train-test split: random 70/30 partition of records.
maskf = np.random.rand(len(df_all)) < 0.7
df = df_all[maskf]
df_test = df_all[~maskf]
# Dataset: wrap the train and holdout frames in punppci dataset objects.
def _insurance_dataset(frame, as_at_date):
    # Counts and payments are clipped at zero because negative
    # development amounts are not valid model inputs.
    return pn.InsuranceDataset(
        features=frame[features],
        origin=frame[origin],
        claim_count=frame[claim_count_names].clip(lower=0),
        claim_paid=frame[claim_paid_names].clip(lower=0),
        as_at_date=as_at_date,
    )


ds = _insurance_dataset(df, as_at)
# Holdout dataset is built with as_at_date=None.
ds_test = _insurance_dataset(df_test, None)
# Model - Make, Fit and Predict
# Fit the neural (PUNPPCI) model and the classical PPCI benchmark.
model0 = pn.PUNPPCIClaimRegressor(device=device_skorch)
model1 = pn.PPCIRegressor()
model0.device  # notebook artifact: displays the configured device in a cell
for estimator in (model0, model1):
    estimator.fit(ds.X(estimator), ds.y(estimator), w=ds.w(estimator))
model0.device  # notebook artifact
MAX_SEQ_LENGTH = 256  # BERT input length cap (tokens); shorter texts are zero-padded


class ClaimDescriptionDataset(pn.InsuranceDataset):
    """InsuranceDataset that also yields a BERT-tokenised claim description.

    Each item is a 4-tuple of tensors — (token ids, weights, tabular
    features, summed paid amount) — already moved to the module-level
    `device`.
    """

    def __init__(
        self,
        claim_description,  # pandas Series of free-text incident descriptions
        Model,  # fitted punppci model used to derive the X/y/w encodings
        features=None,
        origin=None,
        exposure=None,
        claim_count=None,
        claim_paid=None,
        claim_incurred=None,
        as_at_date=None,
        period_type="months",
    ):
        super(ClaimDescriptionDataset, self).__init__(
            features=features,
            origin=origin,
            exposure=exposure,
            claim_count=claim_count,
            claim_paid=claim_paid,
            claim_incurred=claim_incurred,
            as_at_date=as_at_date,
            period_type=period_type,
        )
        self.claim_description = claim_description
        self.Model = Model

    def __getitem__(self, index):
        # Tokenise the description, truncate to MAX_SEQ_LENGTH tokens,
        # map to vocabulary ids, then right-pad with zeros.
        tokenized_description = tokenizer.tokenize(self.claim_description.iloc[index])
        if len(tokenized_description) > MAX_SEQ_LENGTH:
            tokenized_description = tokenized_description[:MAX_SEQ_LENGTH]
        ids_description = tokenizer.convert_tokens_to_ids(tokenized_description)
        padding = [0] * (MAX_SEQ_LENGTH - len(ids_description))
        ids_description += padding
        assert len(ids_description) == MAX_SEQ_LENGTH
        ids_description = torch.tensor(ids_description)
        # Tabular design-matrix row for this record.
        X = self.X(self.Model)[index].astype(np.float32)
        # Columns num_cols..2*num_cols-1 of y — presumably the paid-amount
        # development columns; TODO confirm against punppci's y layout.
        y = self.y(self.Model)[index, range(num_cols, num_cols * 2)].astype(np.float32)
        # w masks out missing (NaN) development periods (1 = observed).
        w = np.where(np.isnan(y), 0.0, 1.0).astype(np.float32)
        # Target: total paid over all development periods, shape (1,).
        y = np.where(np.isnan(y), 0.0, y).astype(np.float32).sum(axis=0, keepdims=True)
        return (
            ids_description.to(device),
            torch.from_numpy(w).to(device),
            torch.from_numpy(X).to(device),
            torch.from_numpy(y).to(device),
        )

    def __len__(self):
        # Number of records equals the number of feature rows.
        return len(self.features)
# Train
# BERT_SIZE=20000
def _description_dataset(frame, as_at_date):
    # Build a combined text + tabular dataset from the rows of `frame`.
    return ClaimDescriptionDataset(
        claim_description=frame["INCIDENTDESCRIPTION"],
        Model=model0,
        features=frame[features],
        origin=frame[origin],
        claim_count=frame[claim_count_names].clip(lower=0),
        claim_paid=frame[claim_paid_names].clip(lower=0),
        as_at_date=as_at_date,
    )


# Keep only records with at least one reported claim.
df_text_train = df.loc[(ds.claim_count.sum(axis=1) > 0).values]
ds_desc_train = _description_dataset(df_text_train, as_at)

# Test
df_text_test = df_test.loc[(ds_test.claim_count.sum(axis=1) > 0).values]
ds_desc_test = _description_dataset(df_text_test, None)

ds_desc_test[20]  # notebook artifact: spot-check a single item

ds_dataloader_train = data.DataLoader(ds_desc_train, batch_size=32, shuffle=True)
ds_dataloader_test = data.DataLoader(ds_desc_test, batch_size=32, shuffle=False)
# Predict the standard (tabular-only) model on the holdout data.
model0.module_.eval()
_std_batches = []
for input_ids, w, X, y in ds_dataloader_test:
    # forward: the underlying module takes [X | w | w] concatenated,
    # matching how it was fed during training.
    outputs = model0.module_(torch.cat([X, w, w], dim=1))
    _std_batches.append(outputs.data.cpu().numpy())
punppci_array = (
    np.concatenate(_std_batches, axis=0)
    if _std_batches
    else np.empty((0, num_cols * 6))
)
punppci_array  # notebook artifact
class BertForTextClaimsModel(nn.Module):
    """Scale punppci claim-size estimates by a factor learned from BERT.

    The frozen BERT pooled output feeds a single linear layer whose
    exponential multiplies the tabular model's claim-size development
    pattern; only `multiplier` is intended to be trained.
    """

    def __init__(
        self, PUNPPCIModel=model0.module_, num_cols=num_cols  # trained from above
    ):
        # NOTE(review): the default argument binds the module-level
        # `model0.module_` at class-definition time.
        super(BertForTextClaimsModel, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.punppci = PUNPPCIModel
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # Single scalar output per record, exponentiated in forward().
        self.multiplier = nn.Linear(config.hidden_size, 1)
        self.num_cols = num_cols
        nn.init.xavier_normal_(self.multiplier.weight)

    def forward(
        self, input_ids, w, X, token_type_ids=None, attention_mask=None, labels=None
    ):
        # BERT acts as a frozen feature extractor: no gradients flow into
        # it, so only `multiplier` receives updates during training.
        with torch.no_grad():
            _, pooled_output = self.bert(
                input_ids,
                token_type_ids,
                attention_mask,
                output_all_encoded_layers=False,
            )
        pooled_output = self.dropout(pooled_output)
        # Initial estimate from the tabular model; input layout [X | w | w]
        # matches how that model was trained above.
        y_initial_estimate = self.punppci(torch.cat([X, w, w], dim=1))
        n = self.num_cols
        # The estimate splits into six n-wide groups — presumably
        # frequency/size pairs; TODO confirm against punppci's output layout.
        f1, p1, f2, p2, f3, p3 = torch.split(
            y_initial_estimate, [n, n, n, n, n, n], dim=1
        )
        # NOTE(review): torch.sum(f1) sums over the whole batch, not per
        # row — confirm this normalisation is intended rather than
        # torch.sum(f1, dim=1, keepdim=True).
        claim_size_dev = p1 / torch.sum(f1)
        mult = self.multiplier(pooled_output)
        # Predicted total claim size per record (sum over development periods).
        return torch.sum(torch.exp(mult) * claim_size_dev, dim=1)
EPOCH_COUNT = 3

net = BertForTextClaimsModel().to(device)
criterion = nn.MSELoss()
# Optimise only the multiplier layer; BERT and punppci stay frozen.
# t_total tells BertAdam the total number of scheduled steps.
_batches_per_epoch = np.ceil(df_text_train.shape[0] / 32)
optimizer = BertAdam(
    net.multiplier.parameters(),
    t_total=_batches_per_epoch * EPOCH_COUNT,
    lr=0.001,
)
net.train()
for epoch in range(EPOCH_COUNT):  # loop over the dataset n times
    running_loss = 0.0
    running_output = 0.0
    running_y = 0.0
    # BUG FIX: initialise the diagnostic variables — previously, if the
    # very first batch produced a degenerate output, the error branch
    # printed `outputs_last`/`y_last` before they were ever assigned,
    # raising NameError instead of the intended diagnostics.
    outputs_last = None
    y_last = None
    for i, dataitem in enumerate(ds_dataloader_train, 0):
        input_ids, w, X, y = dataitem

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(input_ids, w, X)

        # Check no zero: an (almost) all-zero prediction batch indicates
        # a collapsed model — dump the last healthy batch and stop.
        if torch.sum(outputs).data.cpu().numpy() <= 0.00001:
            print("Error")
            print(outputs_last)
            print(y_last)
            print(outputs)
            print(y)
            break
        outputs_last = outputs
        y_last = y

        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()

        # print statistics
        # print(i, net.multiplier.bias)
        if np.isnan(loss.item()):
            print("loss: NaN")
            break
        running_output += torch.sum(outputs).data.cpu().numpy()
        running_y += torch.sum(y).data.cpu().numpy()
        running_loss += loss.item()
        if i % 20 == 19:  # print every 20 mini-batches
            # BUG FIX: average over the 20 accumulated batches — the
            # original divided by 2000 (copied from a tutorial that
            # printed every 2000 batches), under-reporting loss by 100x.
            print(
                "[%d, %5d] loss: %.3f" % (epoch + 1, i + 1, running_loss / 20),
                net.multiplier.bias.data.cpu().numpy(),
                running_output,
                running_y,
            )
            running_output = 0.0
            running_loss = 0.0
            running_y = 0.0

print("Finished Training")
# Predict BERT-enhanced on holdout data.
# BUG FIX: switch the network to eval mode and disable gradient tracking
# for inference — it was left in train() mode from the loop above, so
# dropout stayed active and holdout predictions were stochastic.
net.eval()
_bert_batches = []
with torch.no_grad():
    for input_ids, w, X, y in ds_dataloader_test:
        # forward
        outputs = net(input_ids, w, X)
        _bert_batches.append(outputs.data.cpu().numpy())
# net's forward returns one scalar per record, so the result is a flat
# vector. (The original seeded np.empty((0, num_cols)) but np.append
# without `axis` flattens anyway — seed with the correct 1-D shape.)
bert_array = np.concatenate(_bert_batches) if _bert_batches else np.empty(0)
# Benchmark: compare standard vs BERT-enhanced predictions on the holdout.
# Standard model's implied average claim size = total paid / total count.
_standard_size = punppci_array[:, num_cols : (num_cols * 2)].sum(
    axis=1
) / punppci_array[:, 0:num_cols].sum(axis=1)
summary = (
    ds_desc_test.claim_description.reset_index()
    .assign(
        standard=_standard_size,
        bert=bert_array,
        paid=ds_desc_test.claim_paid.sum(axis=1).reset_index(drop=True),
    )
    .drop(columns="index")
)
summary  # notebook artifacts: inspect the prediction distributions
summary.mean()
summary.median()
# Portfolio-level average claim size, as a reference point.
df_test[claim_paid_names].mean().sum() / df_test[claim_count_names].mean().sum()
# Compare sum of squares error
np.sum((summary.standard.values - summary.paid.values) ** 2)
np.sum((summary.bert.values - summary.paid.values) ** 2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.