# emillykkejensen/MultiLabel_MultiClass_TextClassification_with_BERT_Transformer_and_Keras.py

## MultiLabel_MultiClass_TextClassification_with_BERT_Transformer_and_Keras.py
#######################################
### -------- Load libraries ------- ###

# Load Huggingface transformers
from transformers import TFBertModel,  BertConfig, BertTokenizerFast

# Then what you need from tensorflow.keras
from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical

# And pandas for data import + sklearn because you always need sklearn
import pandas as pd
from sklearn.model_selection import train_test_split


#######################################
### --------- Import data --------- ###

# Import data from csv
data = pd.read_csv('dev/Fun with BERT/complaints.csv')

# Keep only the text column and the two target columns
data = data[['Consumer complaint narrative', 'Product', 'Issue']]

# Remove a row if any of the three remaining columns are missing
data = data.dropna()

# Remove rows where a label is present only once (can't be split).
# 'Issue' is filtered last on purpose: the split below stratifies on 'Issue',
# and dropping single-row products *after* the issue filter could leave an
# issue with a single row again, which makes the stratified split fail.
data = data.groupby('Product').filter(lambda x: len(x) > 1)
data = data.groupby('Issue').filter(lambda x: len(x) > 1)

# Set your model output as categorical and save in new label col
data['Issue_label'] = pd.Categorical(data['Issue'])
data['Product_label'] = pd.Categorical(data['Product'])

# Transform your output to numeric (the integer category codes)
data['Issue'] = data['Issue_label'].cat.codes
data['Product'] = data['Product_label'].cat.codes

# Split into train and test - stratify over Issue.
# From here on `data` is the training part and `data_test` the held-out part.
data, data_test = train_test_split(data, test_size = 0.2, stratify = data[['Issue']])


#######################################
### --------- Setup BERT ---------- ###

# Max length of tokens (also the fixed width of the Keras input further down)
max_length = 100

# Name of the BERT model to use
model_name = 'bert-base-uncased'

# Transformers config for that checkpoint, with output_hidden_states switched off
config = BertConfig.from_pretrained(model_name)
config.output_hidden_states = False

# Matching fast tokenizer
tokenizer = BertTokenizerFast.from_pretrained(model_name, config = config)

# The Transformers BERT model itself
transformer_model = TFBertModel.from_pretrained(model_name, config = config)


#######################################
### ------- Build the model ------- ###

# TF Keras documentation: https://www.tensorflow.org/api_docs/python/tf/keras/Model

# Load the MainLayer
bert = transformer_model.layers[0]

# Build your model input
input_ids = Input(shape=(max_length,), name='input_ids', dtype='int32')
# attention_mask = Input(shape=(max_length,), name='attention_mask', dtype='int32')
# inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}
# NOTE(review): with the attention mask left out, padded positions are
# presumably not masked inside BERT - confirm this is intended.
inputs = {'input_ids': input_ids}

# Load the Transformers BERT model as a layer in a Keras model.
# Element [1] of the main layer's output is used as the pooled representation
# (presumably the pooled [CLS] output - verify against the transformers docs).
bert_model = bert(inputs)[1]
dropout = Dropout(config.hidden_dropout_prob, name='pooled_output')
# Do not hard-code `training=False` here: that pins the layer to inference mode
# so dropout would never be applied. Keras supplies the training flag itself
# (on during fit, off during evaluate/predict).
pooled_output = dropout(bert_model)

# Then build your model output: one logits head per target, sized by the
# number of categories of the corresponding label column.
issue = Dense(
    units=len(data['Issue_label'].cat.categories),
    kernel_initializer=TruncatedNormal(stddev=config.initializer_range),
    name='issue')(pooled_output)
product = Dense(
    units=len(data['Product_label'].cat.categories),
    kernel_initializer=TruncatedNormal(stddev=config.initializer_range),
    name='product')(pooled_output)
outputs = {'issue': issue, 'product': product}

# And combine it all in a model object
model = Model(inputs=inputs, outputs=outputs, name='BERT_MultiLabel_MultiClass')

# Take a look at the model
model.summary()


#######################################
### ------- Train the model ------- ###

# Set an optimizer
# NOTE(review): `decay` appears to be accepted only by the legacy Keras
# optimizers - confirm against the installed TensorFlow/Keras version.
optimizer = Adam(
    learning_rate=5e-05,
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0)

# Set loss and metrics (both heads emit raw logits, hence from_logits=True)
loss = {'issue': CategoricalCrossentropy(from_logits = True), 'product': CategoricalCrossentropy(from_logits = True)}
metric = {'issue': CategoricalAccuracy('accuracy'), 'product': CategoricalAccuracy('accuracy')}

# Compile the model
model.compile(
    optimizer = optimizer,
    loss = loss,
    metrics = metric)

# Ready output data for the model.
# Pass num_classes explicitly: otherwise the one-hot width is inferred from the
# largest code present in this split, which can be narrower than the output
# heads when the highest-coded class is missing from the split.
y_issue = to_categorical(data['Issue'], num_classes=len(data['Issue_label'].cat.categories))
y_product = to_categorical(data['Product'], num_classes=len(data['Product_label'].cat.categories))

# Tokenize the input (takes some time).
# padding='max_length' always yields max_length columns, matching the fixed
# Input(shape=(max_length,)); padding=True only pads to the longest sample.
x = tokenizer(
    text=data['Consumer complaint narrative'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)

# Fit the model
history = model.fit(
    # x={'input_ids': x['input_ids'], 'attention_mask': x['attention_mask']},
    x={'input_ids': x['input_ids']},
    y={'issue': y_issue, 'product': y_product},
    validation_split=0.2,
    batch_size=64,
    epochs=10)


#######################################
### ----- Evaluate the model ------ ###

# Ready test data.
# num_classes is passed explicitly so the one-hot width always equals the
# number of categories; the test split is not stratified on Product, so its
# highest-coded class may be absent and the inferred width would be too small.
test_y_issue = to_categorical(data_test['Issue'], num_classes=len(data_test['Issue_label'].cat.categories))
test_y_product = to_categorical(data_test['Product'], num_classes=len(data_test['Product_label'].cat.categories))

# padding='max_length' keeps the token matrix exactly max_length wide, as the
# model input requires; padding=True only pads to the longest test sample.
test_x = tokenizer(
    text=data_test['Consumer complaint narrative'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = False,
    verbose = True)

# Run evaluation
model_eval = model.evaluate(
    x={'input_ids': test_x['input_ids']},
    y={'issue': test_y_issue, 'product': test_y_product}
)
	#######################################
	### -------- Load libraries ------- ###

	# Load Huggingface transformers
	from transformers import TFBertModel, BertConfig, BertTokenizerFast

	# Then what you need from tensorflow.keras
	from tensorflow.keras.layers import Input, Dropout, Dense
	from tensorflow.keras.models import Model
	from tensorflow.keras.optimizers import Adam
	from tensorflow.keras.callbacks import EarlyStopping
	from tensorflow.keras.initializers import TruncatedNormal
	from tensorflow.keras.losses import CategoricalCrossentropy
	from tensorflow.keras.metrics import CategoricalAccuracy
	from tensorflow.keras.utils import to_categorical

	# And pandas for data import + sklearn because you always need sklearn
	import pandas as pd
	from sklearn.model_selection import train_test_split


	#######################################
	### --------- Import data --------- ###

	# NOTE(review): this tab-indented section repeats the script found earlier
	# in this file; it looks like an export artifact - confirm and keep one copy.

	# Import data from csv
	data = pd.read_csv('dev/Fun with BERT/complaints.csv')

	# Keep only the text column and the two target columns
	data = data[['Consumer complaint narrative', 'Product', 'Issue']]

	# Remove a row if any of the three remaining columns are missing
	data = data.dropna()

	# Remove rows where a label is present only once (can't be split).
	# 'Issue' is filtered last on purpose: the split below stratifies on 'Issue',
	# and dropping single-row products *after* the issue filter could leave an
	# issue with a single row again, which makes the stratified split fail.
	data = data.groupby('Product').filter(lambda x: len(x) > 1)
	data = data.groupby('Issue').filter(lambda x: len(x) > 1)

	# Set your model output as categorical and save in new label col
	data['Issue_label'] = pd.Categorical(data['Issue'])
	data['Product_label'] = pd.Categorical(data['Product'])

	# Transform your output to numeric (the integer category codes)
	data['Issue'] = data['Issue_label'].cat.codes
	data['Product'] = data['Product_label'].cat.codes

	# Split into train and test - stratify over Issue.
	# From here on `data` is the training part and `data_test` the held-out part.
	data, data_test = train_test_split(data, test_size = 0.2, stratify = data[['Issue']])


	#######################################
	### --------- Setup BERT ---------- ###

	# Max length of tokens (also the fixed width of the Keras input further down)
	max_length = 100

	# Name of the BERT model to use
	model_name = 'bert-base-uncased'

	# Transformers config for that checkpoint, with output_hidden_states switched off
	config = BertConfig.from_pretrained(model_name)
	config.output_hidden_states = False

	# Matching fast tokenizer
	tokenizer = BertTokenizerFast.from_pretrained(model_name, config = config)

	# The Transformers BERT model itself
	transformer_model = TFBertModel.from_pretrained(model_name, config = config)


	#######################################
	### ------- Build the model ------- ###

	# TF Keras documentation: https://www.tensorflow.org/api_docs/python/tf/keras/Model

	# Load the MainLayer
	bert = transformer_model.layers[0]

	# Build your model input
	input_ids = Input(shape=(max_length,), name='input_ids', dtype='int32')
	# attention_mask = Input(shape=(max_length,), name='attention_mask', dtype='int32')
	# inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}
	# NOTE(review): with the attention mask left out, padded positions are
	# presumably not masked inside BERT - confirm this is intended.
	inputs = {'input_ids': input_ids}

	# Load the Transformers BERT model as a layer in a Keras model.
	# Element [1] of the main layer's output is used as the pooled representation
	# (presumably the pooled [CLS] output - verify against the transformers docs).
	bert_model = bert(inputs)[1]
	dropout = Dropout(config.hidden_dropout_prob, name='pooled_output')
	# Do not hard-code `training=False` here: that pins the layer to inference mode
	# so dropout would never be applied. Keras supplies the training flag itself
	# (on during fit, off during evaluate/predict).
	pooled_output = dropout(bert_model)

	# Then build your model output: one logits head per target, sized by the
	# number of categories of the corresponding label column.
	issue = Dense(
		units=len(data['Issue_label'].cat.categories),
		kernel_initializer=TruncatedNormal(stddev=config.initializer_range),
		name='issue')(pooled_output)
	product = Dense(
		units=len(data['Product_label'].cat.categories),
		kernel_initializer=TruncatedNormal(stddev=config.initializer_range),
		name='product')(pooled_output)
	outputs = {'issue': issue, 'product': product}

	# And combine it all in a model object
	model = Model(inputs=inputs, outputs=outputs, name='BERT_MultiLabel_MultiClass')

	# Take a look at the model
	model.summary()


	#######################################
	### ------- Train the model ------- ###

	# Set an optimizer
	# NOTE(review): `decay` appears to be accepted only by the legacy Keras
	# optimizers - confirm against the installed TensorFlow/Keras version.
	optimizer = Adam(
		learning_rate=5e-05,
		epsilon=1e-08,
		decay=0.01,
		clipnorm=1.0)

	# Set loss and metrics (both heads emit raw logits, hence from_logits=True)
	loss = {'issue': CategoricalCrossentropy(from_logits = True), 'product': CategoricalCrossentropy(from_logits = True)}
	metric = {'issue': CategoricalAccuracy('accuracy'), 'product': CategoricalAccuracy('accuracy')}

	# Compile the model
	model.compile(
		optimizer = optimizer,
		loss = loss,
		metrics = metric)

	# Ready output data for the model.
	# Pass num_classes explicitly: otherwise the one-hot width is inferred from the
	# largest code present in this split, which can be narrower than the output
	# heads when the highest-coded class is missing from the split.
	y_issue = to_categorical(data['Issue'], num_classes=len(data['Issue_label'].cat.categories))
	y_product = to_categorical(data['Product'], num_classes=len(data['Product_label'].cat.categories))

	# Tokenize the input (takes some time).
	# padding='max_length' always yields max_length columns, matching the fixed
	# Input(shape=(max_length,)); padding=True only pads to the longest sample.
	x = tokenizer(
		text=data['Consumer complaint narrative'].to_list(),
		add_special_tokens=True,
		max_length=max_length,
		truncation=True,
		padding='max_length',
		return_tensors='tf',
		return_token_type_ids = False,
		return_attention_mask = True,
		verbose = True)

	# Fit the model
	history = model.fit(
		# x={'input_ids': x['input_ids'], 'attention_mask': x['attention_mask']},
		x={'input_ids': x['input_ids']},
		y={'issue': y_issue, 'product': y_product},
		validation_split=0.2,
		batch_size=64,
		epochs=10)


	#######################################
	### ----- Evaluate the model ------ ###

	# Ready test data.
	# num_classes is passed explicitly so the one-hot width always equals the
	# number of categories; the test split is not stratified on Product, so its
	# highest-coded class may be absent and the inferred width would be too small.
	test_y_issue = to_categorical(data_test['Issue'], num_classes=len(data_test['Issue_label'].cat.categories))
	test_y_product = to_categorical(data_test['Product'], num_classes=len(data_test['Product_label'].cat.categories))

	# padding='max_length' keeps the token matrix exactly max_length wide, as the
	# model input requires; padding=True only pads to the longest test sample.
	test_x = tokenizer(
		text=data_test['Consumer complaint narrative'].to_list(),
		add_special_tokens=True,
		max_length=max_length,
		truncation=True,
		padding='max_length',
		return_tensors='tf',
		return_token_type_ids = False,
		return_attention_mask = False,
		verbose = True)

	# Run evaluation
	model_eval = model.evaluate(
		x={'input_ids': test_x['input_ids']},
		y={'issue': test_y_issue, 'product': test_y_product}
	)