This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Earlier | |
language_model.get_sentence_embedding(sentence) | |
#Later | |
from cachetools import LRUCache, cached | |
@cached(cache=LRUCache(maxsize=10000))
def get_sentence_embedding(sentence):
    """Return the embedding for ``sentence``, memoised across calls.

    The actual work is delegated to ``language_model.get_sentence_embedding``.
    Results are stored in an ``LRUCache`` capped at 10000 entries, so a
    repeated sentence is served from the cache instead of being re-embedded.
    """
    embedding = language_model.get_sentence_embedding(sentence)
    return embedding
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the training CSV, keep a reproducible 1% sample (random_state=1),
# and discard rows that contain missing values.
df = (
    pd.read_csv('quora-question-pairs/train.csv')
    .sample(frac=0.01, random_state=1)
    .dropna()
)
# Raw array of the `question1` column, used as the texts to index.
questions = df.question1.values
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class FAISS: | |
def __init__(self, dimensions:int): | |
self.dimensions = dimensions | |
self.index = faiss.IndexFlatL2(dimensions) | |
self.vectors = {} | |
self.counter = 0 | |
def add(self, text:str, v:list): | |
self.index.add(v) | |
self.vectors[self.counter] = (text, v) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Encode a throwaway string once to discover the embedding width,
# then size the index to match.
probe = encoder.encode(['hello'])
d = probe.shape[-1]
index = FAISS(d)

# Index all questions, one encode/add call per question.
for q in tqdm(questions):
    emb = encoder.encode([q])
    index.add(q, emb)
# embed and search a question | |
def search(s, k=10): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class TFEncoder(metaclass=ABCMeta): | |
"""Base encoder to be used for all encoders.""" | |
def __init__(self, model_path:str): | |
self.model = hub.load(model_path) | |
@abstractmethod | |
def encode(self, text:list): | |
"""Encodes text. | |
Text: should be a list of strings to encode | |
""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# From fastai | |
# https://github.com/fastai/fastai/blob/450bdd1de7ecb532d94e35590275d0f1d5ebb3f0/fastai/text/learner.py#L28 | |
def convert_weights(wgts:Weights, stoi_wgts:Dict[str,int], itos_new:Collection[str]) -> Weights: | |
"Convert the model `wgts` to go with a new vocabulary." | |
dec_bias, enc_wgts = wgts.get('1.decoder.bias', None), wgts['0.encoder.weight'] | |
wgts_m = enc_wgts.mean(0) | |
if dec_bias is not None: bias_m = dec_bias.mean(0) | |
new_w = enc_wgts.new_zeros((len(itos_new),enc_wgts.size(1))).zero_() | |
if dec_bias is not None: new_b = dec_bias.new_zeros((len(itos_new),)).zero_() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
from pathlib import Path | |
from tokenizers import BertWordPieceTokenizer | |
def add_vocab_to_model(df, model, tokenizer, old_vocab, vocab_size=30000): | |
"""Adds new vocab to tokenizer and randomly initialises rows for new vocab in the model""" | |
PATH = Path('/tmp/lm_data') | |
PATH.mkdir(exist_ok=True) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def set_seed(args):
    """Seed the Python, NumPy and PyTorch random generators from ``args``.

    Args:
        args: object exposing ``seed`` (the value passed to every seeder)
            and ``n_gpu`` (CUDA generators are seeded only when it is > 0).
    """
    seed = args.seed
    # Same order as always: stdlib random, NumPy, then torch (CPU).
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(seed)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import os | |
from sklearn.metrics import (accuracy_score, classification_report, | |
confusion_matrix, f1_score, fbeta_score) | |
def get_metrics(y, y_pred, beta=2, average_method='macro', y_encoder=None): | |
if y_encoder: | |
y = y_encoder.inverse_transform(y) | |
y_pred = y_encoder.inverse_transform(y_pred) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from tqdm import tqdm | |
import time | |
tqdm.pandas() | |
df['col'] = df['col'].progress_apply(lambda x: x**2) | |
text = "" | |
for char in tqdm(["a", "b", "c", "d"]): | |
time.sleep(0.25) |
Older | Newer