bhavsarpratik / imbalance.py
Created August 28, 2019 18:25
Imbalance methods
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils import class_weight

def get_class_weights(y, one_hot=False):
    """Returns a dict of class weights for label-encoded as well as one-hot encoded y."""
    if one_hot:
        y = np.argmax(y, axis=1)  # recover integer class ids (assumed completion; the gist is truncated here)
    classes = np.unique(y)
    weights = class_weight.compute_class_weight(class_weight='balanced', classes=classes, y=y)
    return dict(zip(classes, weights))
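The gist is titled "Imbalance methods" and imports imblearn's random samplers, but the excerpt stops before using them. A minimal sketch of how they are typically applied, assuming an illustrative feature matrix X and label vector y (not part of the original gist):

X = np.random.rand(100, 5)
y = np.array([0] * 90 + [1] * 10)  # 90/10 class imbalance

ros = RandomOverSampler(random_state=42)
X_over, y_over = ros.fit_resample(X, y)    # minority class duplicated up to 90/90

rus = RandomUnderSampler(random_state=42)
X_under, y_under = rus.fit_resample(X, y)  # majority class sampled down to 10/10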
# Earlier
language_model.get_sentence_embedding(sentence)

# Later
from cachetools import LRUCache, cached

@cached(cache=LRUCache(maxsize=10000))
def get_sentence_embedding(sentence):
    return language_model.get_sentence_embedding(sentence)
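A small usage sketch (the sentence text is arbitrary, not from the gist): repeated calls with the same argument are served from the LRU cache instead of re-running the language model.

emb = get_sentence_embedding('how do I learn nlp?')        # computed by the model
emb_again = get_sentence_embedding('how do I learn nlp?')  # returned from the LRU cache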
df = pd.read_csv('quora-question-pairs/train.csv')
df = df.sample(frac=0.01, random_state=1)
df.dropna(inplace=True)
questions = df.question1.values
from abc import ABCMeta, abstractmethod
import tensorflow_hub as hub

class TFEncoder(metaclass=ABCMeta):
    """Base encoder to be used for all encoders."""
    def __init__(self, model_path: str):
        self.model = hub.load(model_path)

    @abstractmethod
    def encode(self, text: list):
        """Encodes text.

        text: should be a list of strings to encode
        """
import faiss

class FAISS:
    def __init__(self, dimensions: int):
        self.dimensions = dimensions
        self.index = faiss.IndexFlatL2(dimensions)
        self.vectors = {}
        self.counter = 0

    def add(self, text: str, v: list):
        self.index.add(v)
        self.vectors[self.counter] = (text, v)
        self.counter += 1  # keep ids aligned with the FAISS index (missing in the truncated gist)

    def search(self, v: list, k: int = 10):
        # assumed completion: return the k nearest stored texts with their L2 distances
        distances, ids = self.index.search(v, k)
        return [(self.vectors[i][0], d) for i, d in zip(ids[0], distances[0]) if i != -1]
from tqdm import tqdm

d = encoder.encode(['hello']).shape[-1]  # get dimension of emb
index = FAISS(d)

# index all questions
for q in tqdm(questions):
    emb = encoder.encode([q])
    index.add(q, emb)
# embed and search a question
def search(s, k=10):
    emb = encoder.encode([s])    # embed the query
    return index.search(emb, k)  # assumed completion of the truncated gist
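An illustrative call (the query string is arbitrary); it returns up to k indexed questions closest to the query, with their L2 distances.

search('How can I learn machine learning?', k=5)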
# From fastai
# https://github.com/fastai/fastai/blob/450bdd1de7ecb532d94e35590275d0f1d5ebb3f0/fastai/text/learner.py#L28
def convert_weights(wgts:Weights, stoi_wgts:Dict[str,int], itos_new:Collection[str]) -> Weights:
    "Convert the model `wgts` to go with a new vocabulary."
    dec_bias, enc_wgts = wgts.get('1.decoder.bias', None), wgts['0.encoder.weight']
    wgts_m = enc_wgts.mean(0)
    if dec_bias is not None: bias_m = dec_bias.mean(0)
    new_w = enc_wgts.new_zeros((len(itos_new),enc_wgts.size(1))).zero_()
    if dec_bias is not None: new_b = dec_bias.new_zeros((len(itos_new),)).zero_()
    # ... (remainder of the fastai function not included in this excerpt)
import pandas as pd
from pathlib import Path
from tokenizers import BertWordPieceTokenizer

def add_vocab_to_model(df, model, tokenizer, old_vocab, vocab_size=30000):
    """Adds new vocab to tokenizer and randomly initialises rows for new vocab in the model"""
    PATH = Path('/tmp/lm_data')
    PATH.mkdir(exist_ok=True)
    # ... (remainder of the function not included in this excerpt)
import random
import numpy as np
import torch

def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)
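A small usage sketch (the namespace fields mirror the attributes the function reads; the values are illustrative):

import argparse

# any object with .seed and .n_gpu attributes works, e.g. an argparse namespace
args = argparse.Namespace(seed=42, n_gpu=1)
set_seed(args)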
from fastprogress.fastprogress import master_bar, progress_bar
from time import sleep

mb = master_bar(range(10))
for i in mb:
    for j in progress_bar(range(100), parent=mb):
        sleep(0.01)
        mb.child.comment = f'second bar stat'
    mb.first_bar.comment = f'first bar stat'
    mb.write(f'Finished loop {i}.')