RayWilliam46 / train_2.py
Created February 2, 2021 08:01
Unfreeze DistilBERT embedding layer and train all weights
FT_EPOCHS = 4
BATCH_SIZE = 64
NUM_STEPS = len(X_train.index)
# Unfreeze distilBERT layers and make available for training
for layer in distilBERT.layers:
    layer.trainable = True
# Recompile model after unfreezing
model.compile(optimizer=tf.keras.optimizers.Adam(lr=2e-5),
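The preview cuts off inside model.compile. A plausible completion of the fine-tuning step, assuming binary cross-entropy loss and a second fit call mirroring train_1.py; the loss, metrics, and validation tensor names are assumptions, not shown in the preview:
              loss='binary_crossentropy',   # assumed loss for the binary classification head
              metrics=['accuracy'])         # assumed metric
# Fine-tune all weights at the lower learning rate
train_history2 = model.fit(
    x = [X_train_ids, X_train_attention],
    y = y_train.to_numpy(),
    epochs = FT_EPOCHS,
    batch_size = BATCH_SIZE,
    validation_data = ([X_valid_ids, X_valid_attention], y_valid.to_numpy()),  # assumed names
    verbose = 1)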
RayWilliam46 / train_1.py
Created February 2, 2021 03:40
Train weights of classification head with DistilBERT layers frozen
EPOCHS = 6
BATCH_SIZE = 64
NUM_STEPS = len(X_train.index) // BATCH_SIZE
# Train the model
train_history1 = model.fit(
    x = [X_train_ids, X_train_attention],
    y = y_train.to_numpy(),
    epochs = EPOCHS,
    batch_size = BATCH_SIZE,
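The preview stops mid-call; a plausible remainder of this fit call, with validation tensors named by analogy to the training ones (steps_per_epoch and the validation names are assumptions):
    steps_per_epoch = NUM_STEPS,
    validation_data = ([X_valid_ids, X_valid_attention], y_valid.to_numpy()),  # assumed names
    verbose = 1)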
RayWilliam46 / build_model.py
Last active February 1, 2021 00:30
Template for building a model off of the BERT or DistilBERT architecture
MAX_LENGTH = 128
LAYER_DROPOUT = 0.2
LEARNING_RATE = 5e-5
RANDOM_STATE = 42
def build_model(transformer, max_length=MAX_LENGTH):
"""""""""
Template for building a model off of the BERT or DistilBERT architecture
for a binary classification task.
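The preview ends inside the docstring. A minimal sketch of how such a builder typically continues, closing the truncated docstring first and assuming tf is TensorFlow, the [CLS] token's final hidden state feeds a single sigmoid unit, and the model is compiled with binary cross-entropy (the head layout and loss are assumptions):
    """
    # Define input layers for token IDs and attention masks
    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype='int32', name='input_ids')
    attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype='int32', name='attention_mask')

    # Run inputs through the transformer and keep the [CLS] token's final hidden state
    last_hidden_state = transformer([input_ids, attention_mask])[0]
    cls_token = last_hidden_state[:, 0, :]

    # Classification head: dropout + a single sigmoid output for the binary label
    x = tf.keras.layers.Dropout(LAYER_DROPOUT)(cls_token)
    output = tf.keras.layers.Dense(1, activation='sigmoid')(x)

    # Assemble and compile the model
    model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output)
    model.compile(optimizer=tf.keras.optimizers.Adam(LEARNING_RATE),
                  loss='binary_crossentropy',   # assumed loss for the binary task
                  metrics=['accuracy'])
    return model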
RayWilliam46 / import_distilbert.py
Last active February 5, 2021 17:02
Imports the base DistilBERT architecture from the Hugging Face library
from transformers import TFDistilBertModel, DistilBertConfig
DISTILBERT_DROPOUT = 0.2
DISTILBERT_ATT_DROPOUT = 0.2
# Configure DistilBERT's initialization
config = DistilBertConfig(dropout=DISTILBERT_DROPOUT,
                          attention_dropout=DISTILBERT_ATT_DROPOUT,
                          output_hidden_states=True)
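The preview ends after the config. The natural next step, as the description suggests, is loading the pretrained weights with that config; the variable name distilBERT is chosen to match the one used in train_2.py:
# Load the pretrained DistilBERT weights with the custom dropout configuration
distilBERT = TFDistilBertModel.from_pretrained('distilbert-base-uncased', config=config)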
RayWilliam46 / batch_encode.py
Last active September 3, 2023 14:02
Batch encodes text data using a Hugging Face tokenizer
# Define the maximum number of tokens per text (DistilBERT accepts sequences of up to 512 tokens)
MAX_LENGTH = 128
# Define function to encode text data in batches
def batch_encode(tokenizer, texts, batch_size=256, max_length=MAX_LENGTH):
"""""""""
A function that encodes a batch of texts and returns the texts'
corresponding encodings and attention masks that are ready to be fed
into a pre-trained transformer model.
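The preview ends inside the docstring. A minimal sketch of the loop such a helper typically runs, closing the docstring and assuming the Hugging Face tokenizer's __call__ API with padding to max_length and tf as TensorFlow (the exact arguments and return types are assumptions):
    """
    input_ids = []
    attention_mask = []

    # Tokenize the texts in chunks of batch_size to keep memory bounded
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        inputs = tokenizer(batch,
                           max_length=max_length,
                           padding='max_length',          # pad every sequence to max_length
                           truncation=True,               # cut off anything longer
                           return_attention_mask=True,
                           return_token_type_ids=False)
        input_ids.extend(inputs['input_ids'])
        attention_mask.extend(inputs['attention_mask'])

    # Return TensorFlow tensors ready to feed into the transformer
    return tf.convert_to_tensor(input_ids), tf.convert_to_tensor(attention_mask)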
RayWilliam46 / tokenizer.py
Created January 27, 2021 06:38
Instantiate DistilBERT tokenizer
from transformers import DistilBertTokenizerFast
# Instantiate DistilBERT tokenizer...we use the Fast version to optimize runtime
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
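A quick usage check (the sample sentence is only an illustration):
# Encode a sample sentence and inspect the resulting token IDs and WordPiece tokens
sample = tokenizer('DistilBERT is a distilled version of BERT.')
print(sample['input_ids'])
print(tokenizer.convert_ids_to_tokens(sample['input_ids']))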
RayWilliam46 / augment.py
Last active February 5, 2021 21:53
Text Augmentation: Word Replacement Using BERT Contextual Embeddings
!pip install numpy requests nlpaug
import pandas as pd
import numpy as np
import nlpaug.augmenter.word as nlpaw
from tqdm import tqdm
def augment_sentence(sentence, aug, num_threads):
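The preview stops at the function signature. A minimal sketch of the body and of wiring up a BERT-based contextual augmenter with nlpaug (the model name, action, and device are assumptions):
    """Return an augmented copy of `sentence` produced by the given nlpaug augmenter."""
    return aug.augment(sentence, num_thread=num_threads)

# Example: contextual word substitution driven by BERT embeddings
aug = nlpaw.ContextualWordEmbsAug(model_path='bert-base-uncased',   # assumed model choice
                                  action='substitute',              # replace words in place
                                  device='cpu')
# Newer nlpaug versions return a list of augmented strings rather than a single string
print(augment_sentence("The quick brown fox jumps over the lazy dog.", aug, num_threads=1))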