bhavsarpratik / imbalance.py
Created August 28, 2019 18:25
Imbalance methods
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils import class_weight

def get_class_weights(y, one_hot=False):
    """Returns a dict of class weights for label-encoded as well as one-hot encoded y."""
    if one_hot:
        y = np.argmax(y, axis=1)  # recover integer class ids (assumed completion; the gist is truncated here)
    classes = np.unique(y)
    weights = class_weight.compute_class_weight(class_weight='balanced', classes=classes, y=y)
    return dict(zip(classes, weights))
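The gist is titled "Imbalance methods" and imports imblearn's random samplers, but the excerpt stops before using them. A minimal sketch of how they are typically applied, assuming an illustrative feature matrix X and label vector y (not part of the original gist):

X = np.random.rand(100, 5)
y = np.array([0] * 90 + [1] * 10)  # 90/10 class imbalance

ros = RandomOverSampler(random_state=42)
X_over, y_over = ros.fit_resample(X, y)    # minority class duplicated up to 90/90

rus = RandomUnderSampler(random_state=42)
X_under, y_under = rus.fit_resample(X, y)  # majority class sampled down to 10/10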
# Earlier
language_model.get_sentence_embedding(sentence)

# Later
from cachetools import LRUCache, cached

@cached(cache=LRUCache(maxsize=10000))
def get_sentence_embedding(sentence):
    return language_model.get_sentence_embedding(sentence)
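A small usage sketch (the sentence text is arbitrary, not from the gist): repeated calls with the same argument are served from the LRU cache instead of re-running the language model.

emb = get_sentence_embedding('how do I learn nlp?')        # computed by the model
emb_again = get_sentence_embedding('how do I learn nlp?')  # returned from the LRU cache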
df = pd.read_csv('quora-question-pairs/train.csv')
df = df.sample(frac=0.01, random_state=1)
df.dropna(inplace=True)
questions = df.question1.values
from abc import ABCMeta, abstractmethod
import tensorflow_hub as hub

class TFEncoder(metaclass=ABCMeta):
    """Base encoder to be used for all encoders."""
    def __init__(self, model_path: str):
        self.model = hub.load(model_path)

    @abstractmethod
    def encode(self, text: list):
        """Encodes text.

        text: should be a list of strings to encode
        """
import faiss

class FAISS:
    def __init__(self, dimensions: int):
        self.dimensions = dimensions
        self.index = faiss.IndexFlatL2(dimensions)
        self.vectors = {}
        self.counter = 0

    def add(self, text: str, v: list):
        self.index.add(v)
        self.vectors[self.counter] = (text, v)
        self.counter += 1  # keep ids aligned with the FAISS index (missing in the truncated gist)

    def search(self, v: list, k: int = 10):
        # assumed completion: return the k nearest stored texts with their L2 distances
        distances, ids = self.index.search(v, k)
        return [(self.vectors[i][0], d) for i, d in zip(ids[0], distances[0]) if i != -1]
from tqdm import tqdm

d = encoder.encode(['hello']).shape[-1]  # get dimension of emb
index = FAISS(d)

# index all questions
for q in tqdm(questions):
    emb = encoder.encode([q])
    index.add(q, emb)
# embed and search a question
def search(s, k=10):
    emb = encoder.encode([s])    # embed the query
    return index.search(emb, k)  # assumed completion of the truncated gist
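An illustrative call (the query string is arbitrary); it returns up to k indexed questions closest to the query, with their L2 distances.

search('How can I learn machine learning?', k=5)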
# From fastai
# https://github.com/fastai/fastai/blob/450bdd1de7ecb532d94e35590275d0f1d5ebb3f0/fastai/text/learner.py#L28
def convert_weights(wgts:Weights, stoi_wgts:Dict[str,int], itos_new:Collection[str]) -> Weights:
    "Convert the model `wgts` to go with a new vocabulary."
    dec_bias, enc_wgts = wgts.get('1.decoder.bias', None), wgts['0.encoder.weight']
    wgts_m = enc_wgts.mean(0)
    if dec_bias is not None: bias_m = dec_bias.mean(0)
    new_w = enc_wgts.new_zeros((len(itos_new),enc_wgts.size(1))).zero_()
    if dec_bias is not None: new_b = dec_bias.new_zeros((len(itos_new),)).zero_()
    # ... (remainder of the fastai function not included in this excerpt)
import pandas as pd
from pathlib import Path
from tokenizers import BertWordPieceTokenizer

def add_vocab_to_model(df, model, tokenizer, old_vocab, vocab_size=30000):
    """Adds new vocab to tokenizer and randomly initialises rows for new vocab in the model"""
    PATH = Path('/tmp/lm_data')
    PATH.mkdir(exist_ok=True)
    # ... (remainder of the function not included in this excerpt)
import random
import numpy as np
import torch

def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)
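A small usage sketch (the namespace fields mirror the attributes the function reads; the values are illustrative):

import argparse

# any object with .seed and .n_gpu attributes works, e.g. an argparse namespace
args = argparse.Namespace(seed=42, n_gpu=1)
set_seed(args)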
from fastprogress.fastprogress import master_bar, progress_bar
from time import sleep

mb = master_bar(range(10))
for i in mb:
    for j in progress_bar(range(100), parent=mb):
        sleep(0.01)
        mb.child.comment = f'second bar stat'
    mb.first_bar.comment = f'first bar stat'
    mb.write(f'Finished loop {i}.')