# Useful libraries
import numpy as np
import torch
from keras.preprocessing.sequence import pad_sequences  # in newer Keras: keras.utils.pad_sequences
from tqdm import tqdm
from transformers import BertTokenizer, AutoModelForSequenceClassification

# Load the BERT model and tokenizer
model_path = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_path,
                                          do_lower_case=True)
model = AutoModelForSequenceClassification.from_pretrained(model_path,
                                                           output_attentions=False,
                                                           output_hidden_states=True)
def create_vector_from_text(tokenizer, model, text, MAX_LEN=510):

    # Tokenize the text, adding the special [CLS] and [SEP] tokens and
    # truncating anything longer than MAX_LEN.
    input_ids = tokenizer.encode(
        text,
        add_special_tokens=True,
        max_length=MAX_LEN,
        truncation=True,
    )

    # Pad the sequence to a fixed length of MAX_LEN.
    results = pad_sequences([input_ids], maxlen=MAX_LEN, dtype="long",
                            truncating="post", padding="post")

    # Remove the outer list.
    input_ids = results[0]

    # Create the attention mask: 1 for real tokens, 0 for padding.
    attention_mask = [int(i > 0) for i in input_ids]

    # Convert to tensors.
    input_ids = torch.tensor(input_ids)
    attention_mask = torch.tensor(attention_mask)

    # Add an extra dimension for the "batch" (even though there is only one
    # input in this batch).
    input_ids = input_ids.unsqueeze(0)
    attention_mask = attention_mask.unsqueeze(0)

    # Put the model in "evaluation" mode, meaning feed-forward operation.
    model.eval()

    # Run the text through BERT and collect the hidden states from the
    # embedding layer plus all 12 encoder layers (13 tensors in total).
    with torch.no_grad():
        logits, encoded_layers = model(
            input_ids=input_ids,
            token_type_ids=None,
            attention_mask=attention_mask,
            return_dict=False)

    layer_i = 12  # The last BERT encoder layer, just before the classifier.
    batch_i = 0   # Only one input in the batch.
    token_i = 0   # The first token, corresponding to [CLS].

    # Extract the embedding vector for the [CLS] token.
    vector = encoded_layers[layer_i][batch_i][token_i]

    # Move to the CPU and convert to a numpy ndarray.
    vector = vector.detach().cpu().numpy()

    return vector
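
# Quick sanity check (an illustrative addition; the sample sentence below is
# an arbitrary assumption, not part of the original data):
sample_vector = create_vector_from_text(tokenizer, model,
                                        "BERT produces contextual embeddings.")
print(sample_vector.shape)  # -> (768,), the hidden size of bert-base-uncased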
def create_vector_database(data):

    # The list of all the vectors.
    vectors = []

    # Get the overall text data.
    source_data = data.abstract.values

    # Loop over all the abstracts and get their embeddings.
    for text in tqdm(source_data):

        # Get the embedding.
        vector = create_vector_from_text(tokenizer, model, text)

        # Add it to the list.
        vectors.append(vector)

    # Store each embedding as a (1, 768) numpy array in a new column.
    data["vectors"] = vectors
    data["vectors"] = data["vectors"].apply(lambda emb: np.array(emb))
    data["vectors"] = data["vectors"].apply(lambda emb: emb.reshape(1, -1))

    return data
# Create the vector database
vector_database = create_vector_database(source_data)
vector_database.sample(5)
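
# Example follow-up query (a minimal sketch, assuming the database is meant
# for similarity search over the abstracts; `incoming_text` and the use of
# scikit-learn's cosine_similarity are illustrative assumptions, not part of
# the original gist).
from sklearn.metrics.pairwise import cosine_similarity

incoming_text = "A study of transformer-based text embeddings."
query_vector = create_vector_from_text(tokenizer, model, incoming_text).reshape(1, -1)

# Score every stored abstract vector against the query and show the top 3.
vector_database["similarity"] = [
    cosine_similarity(query_vector, row)[0][0]
    for row in vector_database["vectors"]
]
print(vector_database.nlargest(3, "similarity")[["abstract", "similarity"]])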