Pratik Bhavsar (bhavsarpratik)
# Earlier: every call hits the model, even for repeated sentences
language_model.get_sentence_embedding(sentence)

# Later: memoise embeddings with an LRU cache so repeated sentences are free
from cachetools import LRUCache, cached

@cached(cache=LRUCache(maxsize=10000))
def get_sentence_embedding(sentence):
    return language_model.get_sentence_embedding(sentence)
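# Usage sketch (`language_model` stands in for whichever embedding model the
# gist assumes): the second call never reaches the model.
emb = get_sentence_embedding('how do I learn python?')  # computed
emb = get_sentence_embedding('how do I learn python?')  # served from the cache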
import pandas as pd

df = pd.read_csv('quora-question-pairs/train.csv')
df = df.sample(frac=0.01, random_state=1)  # work on a reproducible 1% sample
df.dropna(inplace=True)
questions = df.question1.values
import faiss
import numpy as np

class FAISS:
    def __init__(self, dimensions: int):
        self.dimensions = dimensions
        self.index = faiss.IndexFlatL2(dimensions)  # exact L2 search
        self.vectors = {}
        self.counter = 0

    def add(self, text: str, v):
        self.index.add(np.asarray(v, dtype=np.float32))  # faiss wants a (n, d) float32 array
        self.vectors[self.counter] = (text, v)
        self.counter += 1

    def search(self, v, k: int = 10):
        distances, ids = self.index.search(np.asarray(v, dtype=np.float32), k)
        return [(self.vectors[i][0], d) for d, i in zip(distances[0], ids[0]) if i != -1]
from tqdm import tqdm

# `encoder` is a sentence encoder (see the TFEncoder sketch below)
d = encoder.encode(['hello']).shape[-1]  # infer the embedding dimension
index = FAISS(d)

# index all questions
for q in tqdm(questions):
    emb = encoder.encode([q])
    index.add(q, emb)
# embed a question and search the index
def search(s, k=10):
    return index.search(encoder.encode([s]), k)
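# Usage sketch (the query string is just an example):
for text, dist in search('How do I learn programming?')[:3]:
    print(f'{dist:.2f}  {text}')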
from abc import ABCMeta, abstractmethod
import tensorflow_hub as hub

class TFEncoder(metaclass=ABCMeta):
    """Base encoder to be used for all encoders."""
    def __init__(self, model_path: str):
        self.model = hub.load(model_path)

    @abstractmethod
    def encode(self, text: list):
        """Encode a list of strings into embeddings."""
# From fastai
# https://github.com/fastai/fastai/blob/450bdd1de7ecb532d94e35590275d0f1d5ebb3f0/fastai/text/learner.py#L28
def convert_weights(wgts:Weights, stoi_wgts:Dict[str,int], itos_new:Collection[str]) -> Weights:
    "Convert the model `wgts` to go with a new vocabulary."
    dec_bias, enc_wgts = wgts.get('1.decoder.bias', None), wgts['0.encoder.weight']
    wgts_m = enc_wgts.mean(0)  # mean embedding, used to initialise unseen tokens
    if dec_bias is not None: bias_m = dec_bias.mean(0)
    new_w = enc_wgts.new_zeros((len(itos_new),enc_wgts.size(1))).zero_()
    if dec_bias is not None: new_b = dec_bias.new_zeros((len(itos_new),)).zero_()
    # copy rows for tokens that exist in the old vocab, mean-init the rest
    for i,w in enumerate(itos_new):
        r = stoi_wgts[w] if w in stoi_wgts else -1
        new_w[i] = enc_wgts[r] if r>=0 else wgts_m
        if dec_bias is not None: new_b[i] = dec_bias[r] if r>=0 else bias_m
    wgts['0.encoder.weight'] = new_w
    wgts['1.decoder.weight'] = new_w.clone()
    if dec_bias is not None: wgts['1.decoder.bias'] = new_b
    return wgts
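# Hypothetical usage: remap a pretrained checkpoint onto a new vocabulary
# (the path, `old_itos` and `new_itos` are stand-ins, not from the gist).
import torch
wgts = torch.load('pretrained_lm.pth', map_location='cpu')
stoi_wgts = {w: i for i, w in enumerate(old_itos)}
wgts = convert_weights(wgts, stoi_wgts, new_itos)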
import pandas as pd
from pathlib import Path
from tokenizers import BertWordPieceTokenizer

def add_vocab_to_model(df, model, tokenizer, old_vocab, vocab_size=30000):
    """Adds new vocab to tokenizer and randomly initialises rows for new vocab in the model"""
    PATH = Path('/tmp/lm_data')
    PATH.mkdir(exist_ok=True)
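    # The gist is cut off above. A minimal sketch of one way to finish it,
    # assuming a Hugging Face transformers model and a `text` column; none of
    # this is from the original:
    (PATH/'text.txt').write_text('\n'.join(df.text.astype(str)))
    wp = BertWordPieceTokenizer()
    wp.train([str(PATH/'text.txt')], vocab_size=vocab_size)
    new_tokens = [t for t in wp.get_vocab() if t not in old_vocab]
    tokenizer.add_tokens(new_tokens)               # extend the tokenizer
    model.resize_token_embeddings(len(tokenizer))  # random-init rows for new tokens
    return model, tokenizer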
import random

import numpy as np
import torch

def set_seed(args):
    """Seed the python, numpy and torch RNGs for reproducibility."""
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)
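# Usage sketch: `args` only needs `.seed` and `.n_gpu` attributes.
from argparse import Namespace
set_seed(Namespace(seed=42, n_gpu=0))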
import json
import os

from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score, fbeta_score)

def get_metrics(y, y_pred, beta=2, average_method='macro', y_encoder=None):
    if y_encoder:  # map encoded labels back to their original names
        y = y_encoder.inverse_transform(y)
        y_pred = y_encoder.inverse_transform(y_pred)
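    # The gist is cut off above; a plausible completion returning the metrics
    # named in the imports (an assumption, not the original body):
    return {
        'accuracy': accuracy_score(y, y_pred),
        'f1': f1_score(y, y_pred, average=average_method),
        'fbeta': fbeta_score(y, y_pred, beta=beta, average=average_method),
        'report': classification_report(y, y_pred),
        'confusion_matrix': confusion_matrix(y, y_pred).tolist(),
    }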
from tqdm import tqdm
import time

tqdm.pandas()  # adds .progress_apply to pandas objects
df['col'] = df['col'].progress_apply(lambda x: x**2)

text = ""
for char in tqdm(["a", "b", "c", "d"]):
    time.sleep(0.25)
    text = text + char
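# tqdm can also be driven manually (a small extra example, not from the gist):
with tqdm(total=100, desc='processing') as pbar:
    for _ in range(10):
        time.sleep(0.05)
        pbar.update(10)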