# Useful libraries
import numpy as np
import torch
from tqdm import tqdm
from keras.preprocessing.sequence import pad_sequences
from transformers import BertTokenizer, AutoModelForSequenceClassification
# Load the BERT model and tokenizer
model_path = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_path,
                                          do_lower_case=True)
model = AutoModelForSequenceClassification.from_pretrained(model_path,
                                                           output_attentions=False,
                                                           output_hidden_states=True)
def create_vector_from_text(tokenizer, model, text, MAX_LEN=510):

    # Tokenize the text, adding the [CLS] and [SEP] special tokens and
    # truncating anything longer than MAX_LEN tokens.
    input_ids = tokenizer.encode(
        text,
        add_special_tokens=True,
        max_length=MAX_LEN,
        truncation=True,
    )

    # Pad the sequence to a fixed length of MAX_LEN.
    results = pad_sequences([input_ids], maxlen=MAX_LEN, dtype="long",
                            truncating="post", padding="post")

    # Remove the outer list.
    input_ids = results[0]

    # Create the attention mask (1 for real tokens, 0 for padding).
    attention_mask = [int(i > 0) for i in input_ids]

    # Convert to tensors.
    input_ids = torch.tensor(input_ids)
    attention_mask = torch.tensor(attention_mask)

    # Add an extra dimension for the "batch" (even though there is only one
    # input in this batch).
    input_ids = input_ids.unsqueeze(0)
    attention_mask = attention_mask.unsqueeze(0)

    # Put the model in "evaluation" mode, meaning feed-forward operation only.
    model.eval()

    # Run the text through BERT and collect the hidden states produced by
    # the embedding layer and all 12 encoder layers.
    with torch.no_grad():
        logits, encoded_layers = model(
            input_ids=input_ids,
            token_type_ids=None,
            attention_mask=attention_mask,
            return_dict=False)

    layer_i = 12   # The last BERT encoder layer, just before the classifier.
    batch_i = 0    # Only one input in the batch.
    token_i = 0    # The first token, corresponding to [CLS].

    # Extract the [CLS] vector.
    vector = encoded_layers[layer_i][batch_i][token_i]

    # Move to the CPU and convert to a numpy ndarray.
    vector = vector.detach().cpu().numpy()

    return vector
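
# Quick usage sketch (illustrative only; the sample abstract below is made up
# and not part of the original data). For bert-base-uncased the returned
# vector has shape (768,).
# sample_vector = create_vector_from_text(tokenizer, model,
#                                         "An example abstract about transformer-based text embeddings.")
# print(sample_vector.shape)
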
def create_vector_database(data):

    # The list of all the vectors.
    vectors = []

    # Get the overall text data.
    source_data = data.abstract.values

    # Loop over all the abstracts and get their embeddings.
    for text in tqdm(source_data):

        # Get the embedding.
        vector = create_vector_from_text(tokenizer, model, text)

        # Add it to the list.
        vectors.append(vector)

    data["vectors"] = vectors
    data["vectors"] = data["vectors"].apply(lambda emb: np.array(emb))
    data["vectors"] = data["vectors"].apply(lambda emb: emb.reshape(1, -1))

    return data
# Create the vector database from the source DataFrame of abstracts
# (`source_data` is assumed to be loaded earlier, outside this snippet).
vector_database = create_vector_database(source_data)
vector_database.sample(5)
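
# Illustrative sketch (not part of the original gist): querying the vector
# database with cosine similarity. The query text below is a made-up example,
# and ranking by similarity is one possible way to use the stored vectors.
from sklearn.metrics.pairwise import cosine_similarity

query_text = "A transformer-based approach to document similarity."  # hypothetical query
query_vector = create_vector_from_text(tokenizer, model, query_text).reshape(1, -1)

# Cosine similarity between the query and every stored abstract vector.
similarities = [
    float(cosine_similarity(query_vector, row)[0][0])
    for row in vector_database["vectors"]
]
vector_database["similarity"] = similarities

# Most similar abstracts first.
print(vector_database.sort_values("similarity", ascending=False).head())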