Skip to content

Instantly share code, notes, and snippets.

@infinex
Last active March 23, 2021 15:31
Show Gist options
  • Save infinex/b456a6e19bd4cbb06e1b9ffad5d2dc04 to your computer and use it in GitHub Desktop.
Save infinex/b456a6e19bd4cbb06e1b9ffad5d2dc04 to your computer and use it in GitHub Desktop.
ranking
import torch
from transformers import TFAutoModel, AutoTokenizer,AutoModel
import os
import tensorflow as tf
# Turn off tokenizers parallelism; set before any tokenizer is created.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# One checkpoint, loaded three ways: as a TF model (converted from the
# PyTorch weights via from_pt=True), as a PyTorch model, and its tokenizer.
_CHECKPOINT = 'sentence-transformers/msmarco-distilroberta-base-v2'
tf_model = TFAutoModel.from_pretrained(_CHECKPOINT, from_pt=True)
model = AutoModel.from_pretrained(_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)

texts = ['I love to sleep', 'Sleeping is me']

# Encodings for the PyTorch path: padded to the longest sentence.
dataset = tokenizer.batch_encode_plus(texts, padding='longest')

# Encodings for the TF path: TF tensors padded to a fixed length of 128.
tf_dataset = tokenizer.batch_encode_plus(
    texts,
    max_length=128,
    padding='max_length',
    return_tensors='tf',
)
tf_iter = tf.data.Dataset.from_tensor_slices(tf_dataset).batch(32)

# Run the TF model over every batch and show the raw model output.
for batch in tf_iter:
    tf_output = tf_model(batch)
    print(tf_output)
class ModelFn(tf.Module):
    """Wrap a model and export it as a SavedModel with two signatures.

    The wrapped ``model`` is called with a dict holding ``input_ids`` and
    ``attention_mask`` and its ``pooler_output`` is returned.

    Signatures registered by :meth:`export_save_model`:
        * ``predict``     -- two int64 tensors of shape ``(None, 128)``.
        * ``predict_b64`` -- a 1-D string tensor of serialised examples that
          are parsed into ``input_ids`` / ``attention_mask`` features of
          length ``self.max_seq_length``.

    NOTE(review): ``predict`` is fixed to 128 tokens while ``predict_b64``
    parses ``max_seq_length`` (512) tokens -- confirm this difference is
    intentional.
    """

    def __init__(self, model):
        """Store the model to be exported.

        Args:
            model: object called as ``model(inputs, training=False)`` whose
                result exposes ``pooler_output``.
        """
        # tf.Module subclasses are expected to run the base initialiser so
        # the module's name / name scope are set up.
        super().__init__()
        self.model = model
        # Length of each feature parsed by `predict_export_serialised`.
        self.max_seq_length = 512

    @tf.function(input_signature=[
        tf.TensorSpec(shape=(None, 128), dtype=tf.int64),
        tf.TensorSpec(shape=(None, 128), dtype=tf.int64),
    ])
    def predict_export(self, input_ids, attention_mask):
        """Return the model's ``pooler_output`` for already-tokenised input.

        Args:
            input_ids: int64 tensor of shape ``(None, 128)``.
            attention_mask: int64 tensor of shape ``(None, 128)``.
        """
        results = self.model(
            {'input_ids': input_ids, 'attention_mask': attention_mask},
            training=False).pooler_output
        return results

    @tf.function(input_signature=[
        tf.TensorSpec(shape=[None], dtype=tf.string),
    ])
    def predict_export_serialised(self, serialized):
        """Parse serialised examples and return the model's ``pooler_output``.

        Args:
            serialized: 1-D string tensor passed to ``tf.io.parse_example``.
        """
        input_features = {
            'input_ids': tf.io.FixedLenFeature(
                [self.max_seq_length], tf.int64),
            'attention_mask': tf.io.FixedLenFeature(
                [self.max_seq_length], tf.int64),
        }
        example = tf.io.parse_example(
            serialized=serialized, features=input_features)
        # The parsed feature dict is handed to the model as-is.
        results = self.model(example, training=False).pooler_output
        return results

    def export_save_model(self, export_dir):
        """Save ``self.model`` with both signatures under ``<export_dir>/1/``.

        Args:
            export_dir: base directory; "/1/" is appended (presumably the
                version sub-directory layout used by TensorFlow Serving --
                TODO confirm).
        """
        export_dir = f"{export_dir}/1/"
        signatures = {
            'predict_b64': self.predict_export_serialised,
            'predict': self.predict_export,
        }
        tf.saved_model.save(self.model, export_dir, signatures)


ModelFn(tf_model).export_save_model('model')
class TransformerDataset(torch.utils.data.Dataset):
    """Map-style dataset over a mapping of per-example encodings.

    ``encodings`` maps a field name to an indexable collection with one entry
    per example; it must contain an ``'input_ids'`` field, which defines the
    dataset length.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # The number of examples equals the number of input-id rows.
        input_rows = self.encodings['input_ids']
        return len(input_rows)

    def __getitem__(self, idx):
        # Build one example: every field's idx-th entry, as a tensor.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        return sample
# Batch the list-based encodings for the PyTorch model.
ds = TransformerDataset(dataset)
trainloader = torch.utils.data.DataLoader(
    ds,
    batch_size=32,
    shuffle=False,
    num_workers=8,
)

# Run on the GPU when torch reports one, otherwise on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

for batch in trainloader:
    # Move each tensor of the batch to the same device as the model.
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    outputs = model(**batch)
    print(outputs.pooler_output)
import torch
from transformers import AutoTokenizer, T5ForConditionalGeneration
# Candidate passages to re-rank: a list of [id, text] pairs (only the text,
# element 1, is used further down).
# NOTE(review): the texts contain mis-encoded sequences such as 'â\x80\x93';
# presumably these come from the source corpus -- confirm before "fixing".
# NOTE(review): two string literals are split across physical lines (after
# "geocentric theory. " and after "(c. "); this looks like an extraction
# artifact and is not valid Python as shown -- verify against the original.
passages = [['7744105', 'For Earth-centered it was Geocentric Theory proposed by greeks under the guidance of Ptolemy and Sun-centered was Heliocentric theory proposed by Nicolas Copernicus in 16th century A.D. In short, Your Answers are: 1st blank - Geo-Centric Theory. 2nd blank - Heliocentric Theory.'], ['2593796', 'Copernicus proposed a heliocentric model of the solar system â\x80\x93 a model where everything orbited around the Sun. Today, with advancements in science and technology, the geocentric model seems preposterous.he geocentric model, also known as the Ptolemaic system, is a theory that was developed by philosophers in Ancient Greece and was named after the philosopher Claudius Ptolemy who lived circa 90 to 168 A.D. It was developed to explain how the planets, the Sun, and even the stars orbit around the Earth.'], ['6217200', 'The geocentric model, also known as the Ptolemaic system, is a theory that was developed by philosophers in Ancient Greece and was named after the philosopher Claudius Ptolemy who lived circa 90 to 168 A.D. It was developed to explain how the planets, the Sun, and even the stars orbit around the Earth.opernicus proposed a heliocentric model of the solar system â\x80\x93 a model where everything orbited around the Sun. Today, with advancements in science and technology, the geocentric model seems preposterous.'], ['3276925', 'Copernicus proposed a heliocentric model of the solar system â\x80\x93 a model where everything orbited around the Sun. Today, with advancements in science and technology, the geocentric model seems preposterous.Simple tools, such as the telescope â\x80\x93 which helped convince Galileo that the Earth was not the center of the universe â\x80\x93 can prove that ancient theory incorrect.ou might want to check out one article on the history of the geocentric model and one regarding the geocentric theory. 
Here are links to two other articles from Universe Today on what the center of the universe is and Galileo one of the advocates of the heliocentric model.'], ['6217208', 'Copernicus proposed a heliocentric model of the solar system â\x80\x93 a model where everything orbited around the Sun. Today, with advancements in science and technology, the geocentric model seems preposterous.Simple tools, such as the telescope â\x80\x93 which helped convince Galileo that the Earth was not the center of the universe â\x80\x93 can prove that ancient theory incorrect.opernicus proposed a heliocentric model of the solar system â\x80\x93 a model where everything orbited around the Sun. Today, with advancements in science and technology, the geocentric model seems preposterous.'], ['4280557', 'The geocentric model, also known as the Ptolemaic system, is a theory that was developed by philosophers in Ancient Greece and was named after the philosopher Claudius Ptolemy who lived circa 90 to 168 A.D. It was developed to explain how the planets, the Sun, and even the stars orbit around the Earth.imple tools, such as the telescope â\x80\x93 which helped convince Galileo that the Earth was not the center of the universe â\x80\x93 can prove that ancient theory incorrect. You might want to check out one article on the history of the geocentric model and one regarding the geocentric theory.'], ['264181', 'Nicolaus Copernicus (b. 1473â\x80\x93d. 1543) was the first modern author to propose a heliocentric theory of the universe. From the time that Ptolemy of Alexandria (c. 
150 CE) constructed a mathematically competent version of geocentric astronomy to Copernicusâ\x80\x99s mature heliocentric version (1543), experts knew that the Ptolemaic system diverged from the geocentric concentric-sphere conception of Aristotle.'], ['4280558', 'A Geocentric theory is an astronomical theory which describes the universe as a Geocentric system, i.e., a system which puts the Earth in the center of the universe, and describes other objects from the point of view of the Earth. Geocentric theory is an astronomical theory which describes the universe as a Geocentric system, i.e., a system which puts the Earth in the center of the universe, and describes other objects from the point of view of the Earth.'], ['3276926', 'The geocentric model, also known as the Ptolemaic system, is a theory that was developed by philosophers in Ancient Greece and was named after the philosopher Claudius Ptolemy who lived circa 91 to 168 A.D. It was developed to explain how the planets, the Sun, and even the stars orbit around the Earth.ou might want to check out one article on the history of the geocentric model and one regarding the geocentric theory. Here are links to two other articles from Universe Today on what the center of the universe is and Galileo one of the advocates of the heliocentric model.'], ['5183032', "After 1,400 years, Copernicus was the first to propose a theory which differed from Ptolemy's geocentric system, according to which the earth is at rest in the center with the rest of the planets revolving around it."]]
query = 'who proposed the geocentric theory'
# Prompt template; it is filled with the query and each passage text below.
pattern = "Query: {query} Document: {document} Relevant:"

# Keep only the passage text (element 1 of every [id, text] pair).
texts = [p[1] for p in passages]

# Load the re-ranker directly instead of first binding `model` to the
# checkpoint name and then rebinding it to the loaded model.
model = T5ForConditionalGeneration.from_pretrained('castorini/monot5-base-msmarco')
tokenizer = AutoTokenizer.from_pretrained('t5-base', use_fast=False)
EOS = tokenizer.eos_token

# Single source of truth for the truncation length (previously this constant
# was defined but the value 512 was hard-coded again in the kwargs below).
MAX_LENGTH = 512
t4_tokenizer = {
    'return_attention_mask': True,
    'padding': 'longest',
    'truncation': True,
    'max_length': MAX_LENGTH,
}

# Tokenise one "Query: ... Document: ... Relevant:" prompt per passage.
dataset = tokenizer.batch_encode_plus(
    [pattern.format(query=query, document=document) for document in texts],
    **t4_tokenizer)
@torch.no_grad()
def greedy_decode(model,
                  input_ids: torch.Tensor,
                  length: int,
                  attention_mask: torch.Tensor = None,
                  return_last_logits: bool = True):
    """Decode ``length`` tokens, always picking the highest-scoring one.

    The encoder is run once; the decoder prefix starts as a single column of
    ``model.config.decoder_start_token_id`` and grows by one token per step.
    Each step goes through ``model.prepare_inputs_for_generation`` with
    ``past=None`` and ``use_cache=True``.

    Args:
        model: object providing ``config.decoder_start_token_id``,
            ``get_encoder()``, ``prepare_inputs_for_generation(...)`` and a
            call returning a sequence whose first item holds the logits.
        input_ids: encoder input; its first dimension is the batch size.
        length: number of decoding steps.
        attention_mask: optional mask forwarded to the encoder and to
            ``prepare_inputs_for_generation``.
        return_last_logits: when true, also return the logits of the final
            step.

    Returns:
        ``decode_ids`` (start column plus ``length`` decoded columns), or the
        tuple ``(decode_ids, last_step_logits)`` when ``return_last_logits``
        is true. ``last_step_logits`` is ``None`` if no step was run.
    """
    batch_size = input_ids.size(0)
    start_column = torch.full((batch_size, 1),
                              model.config.decoder_start_token_id,
                              dtype=torch.long)
    decode_ids = start_column.to(input_ids.device)

    encoder = model.get_encoder()
    encoder_outputs = encoder(input_ids, attention_mask=attention_mask)

    next_token_logits = None
    for _step in range(length):
        step_inputs = model.prepare_inputs_for_generation(
            decode_ids,
            encoder_outputs=encoder_outputs,
            past=None,
            attention_mask=attention_mask,
            use_cache=True)
        logits = model(**step_inputs)[0]  # (batch_size, cur_len, vocab_size)
        next_token_logits = logits[:, -1, :]  # (batch_size, vocab_size)
        _, best_token = torch.max(next_token_logits, 1)
        decode_ids = torch.cat([decode_ids, best_token.unsqueeze(-1)], dim=-1)

    if not return_last_logits:
        return decode_ids
    return decode_ids, next_token_logits
import torch
class MonoT5Dataset(torch.utils.data.Dataset):
    """Expose tokenised prompts as a map-style dataset of tensors.

    ``encodings`` maps each field name to a per-example collection; the
    ``'input_ids'`` field determines how many examples there are.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        # Pick the idx-th row of every field and convert it to a tensor.
        fields = self.encodings.items()
        return {name: torch.tensor(rows[idx]) for name, rows in fields}
# Batch the tokenised prompts for the re-ranker.
ds = MonoT5Dataset(dataset)
trainloader = torch.utils.data.DataLoader(
    ds,
    batch_size=32,
    shuffle=False,
    num_workers=8,
)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

for batch in trainloader:
    input_ids = batch['input_ids'].to(device)
    attn_mask = batch['attention_mask'].to(device)

    # A single decoding step; only the logits of that step are needed.
    _, batch_scores = greedy_decode(
        model,
        input_ids,
        length=1,
        attention_mask=attn_mask,
        return_last_logits=True,
    )

    # 6136 and 1176 are the indexes of the tokens false and true in T5.
    batch_scores = torch.nn.functional.log_softmax(
        batch_scores[:, [6136, 1176]], dim=1)

    # Column 1 is the log-probability assigned to the "true" token.
    batch_log_probs = batch_scores[:, 1].tolist()
    for score in batch_log_probs:
        print(score)
"""
This example demonstrates the setup for Query / Question-Answer-Retrieval.
You can input a query or a question. The script then uses semantic search
to find relevant passages in Simple English Wikipedia (as it is smaller and fits better in RAM).
For semantic search, we use SentenceTransformer('msmarco-distilbert-base-v2') and retrieve
100 passages that potentially answer the input query.
Next, we use a more powerful CrossEncoder (cross_encoder = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-6')) that
scores the query and all retrieved passages for their relevancy. The cross-encoder is necessary to filter out certain noise
that might be retrieved from the semantic search step.
Google Colab Example: https://colab.research.google.com/drive/1l6stpYdRMmeDBK_vw0L5NitdiAuhdsAr?usp=sharing
"""
import json
from pathlib import Path
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import time
import gzip
import os
import torch
# We use the Bi-Encoder to encode all passages, so that we can use it with semantic search
model_name = 'msmarco-distilbert-base-v2'
bi_encoder = SentenceTransformer(model_name)
top_k = 100  # Number of passages we want to retrieve with the bi-encoder

# The bi-encoder will retrieve 100 documents. We use a cross-encoder to
# re-rank the results list to improve the quality
cross_encoder = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-6')

# As dataset, we use Simple English Wikipedia. Compared to the full English
# wikipedia, it has only about 170k articles. We split these articles into
# paragraphs and encode them with the bi-encoder
wikipedia_filepath = 'data/simplewiki-2020-11-01.jsonl.gz'
if not os.path.exists(wikipedia_filepath):
    util.http_get('http://sbert.net/datasets/simplewiki-2020-11-01.jsonl.gz', wikipedia_filepath)

# One JSON object per line; every object contributes its 'paragraphs' list.
passages = []
with gzip.open(wikipedia_filepath, 'rt', encoding='utf8') as fIn:
    for line in fIn:
        data = json.loads(line.strip())
        passages.extend(data['paragraphs'])

# If you like, you can also limit the number of passages you want to use
print("Passages:", len(passages))

# To speed things up, embeddings cached by a previous run are reused when the
# cache file exists; nothing is downloaded here. The file name encodes the
# model the passages were encoded with.
embeddings_filepath = f'{Path(wikipedia_filepath).stem}-{model_name}.pt'
if os.path.exists(embeddings_filepath):
    # map_location='cpu' lets a cache that was written from GPU tensors be
    # loaded on a machine without CUDA; it is moved to the GPU right below
    # when one is available.
    corpus_embeddings = torch.load(embeddings_filepath, map_location='cpu')
    corpus_embeddings = corpus_embeddings.float()  # Convert embedding file to float
    if torch.cuda.is_available():
        corpus_embeddings = corpus_embeddings.to('cuda')
else:
    # Here, we compute the corpus_embeddings from scratch (which can take a
    # while depending on the GPU) and cache them for the next run.
    corpus_embeddings = bi_encoder.encode(passages, convert_to_tensor=True, show_progress_bar=True)
    torch.save(corpus_embeddings, Path(embeddings_filepath))
# Interactive loop: retrieve with the bi-encoder, re-rank with the
# cross-encoder, print the five best passages, then ask again.
while True:
    query = input("Please enter a question: ")

    # Stage 1: embed the question and fetch the top_k nearest passages.
    start_time = time.time()
    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    search_results = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)
    hits = search_results[0]  # Only one query was submitted

    # Stage 2: score every (query, passage) pair with the cross-encoder.
    cross_inp = [[query, passages[hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp)

    # Attach each score to its hit, then order the hits best-first.
    for hit, score in zip(hits, cross_scores):
        hit['cross-score'] = score
    hits.sort(key=lambda entry: entry['cross-score'], reverse=True)
    end_time = time.time()

    # Report the five highest-scoring passages.
    print("Input question:", query)
    print("Results (after {:.3f} seconds):".format(end_time - start_time))
    for hit in hits[:5]:
        print("\t{:.3f}\t{}".format(hit['cross-score'], passages[hit['corpus_id']]))
    print("\n\n========\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment