Skip to content

Instantly share code, notes, and snippets.

View shahules786's full-sized avatar
👋

Shahul ES shahules786

👋
View GitHub Profile
def tokenizer(corpus, path, mode="train"):
"""
corpus : the corpus to tokenize and pad
path : tokenizer path
mode : train/test
"""
if not path.endswith(".pickle"):
@shahules786
shahules786 / model.py
Created October 23, 2020 11:58
model
class TweetModel(nn.Module):
def __init__(self, embedding_matrix, lstm_hidden_size=200, gru_hidden_size=128):
super(TweetModel, self).__init__()
self.embedding = nn.Embedding(*embedding_matrix.shape)
self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
self.embedding.weight.requires_grad = True
self.embedding_dropout = nn.Dropout2d(0.1)
self.gru = nn.GRU(
@shahules786
shahules786 / corpus.py
Created October 23, 2020 12:01
corpus
def preprocess_text(text: list):
    """Split each input string into a list of whitespace-separated words.

    Args:
        text: Iterable of strings, one entry per document.

    Returns:
        A list with one element per input string; each element is the list
        of words produced by ``str.split()`` for that string (an empty
        string yields an empty list).
    """
    # Use a distinct loop name so the ``text`` parameter is not rebound while
    # it is being iterated, and call ``split()`` directly instead of copying
    # its result through a no-op comprehension.
    return [document.split() for document in text]
def prepare_matrix(word_index, name, dim):
embedding_dict = GloVe(name, dim=dim)
num_words = len(word_index)
embedding_matrix = np.zeros((num_words + 1, dim))
for word, i in word_index.items():
if i > num_words:
continue
from torch import nn as nn
from transformers import AutoModel, AutoTokenizer
# Load a pretrained backbone and wrap it in MyModel (defined outside this view).
# NOTE(review): 'bert-base' does not look like a complete Hugging Face
# checkpoint id (compare e.g. 'bert-base-uncased') — confirm the intended name.
base_model = AutoModel.from_pretrained('bert-base')
Model = MyModel(base_model)
def train_and_save():
"""
def llrd(model, peak_lr, multiplicative_factor):
    """Set a learning rate for each layer and return the parameters.

    Args:
        model: Model whose parameters are to be grouped.
        peak_lr: Peak learning rate.
        multiplicative_factor: Per-layer multiplicative factor
            (presumably the decay applied between layers — TODO confirm).

    Returns:
        Whatever ``get_model_parameters()`` returns.

    NOTE(review): this is a sketch — none of the arguments are used yet, and
    ``get_model_parameters`` must be defined elsewhere.
    """
    parameters = get_model_parameters()
    return parameters
def reinitialize(model, num_layers):
    """Reinitialize the model's weights up to ``num_layers``, starting from the bottom layer.

    Args:
        model: Model to reinitialize.
        num_layers: Number of layers to reinitialize.

    Returns:
        The model that was passed in.

    NOTE(review): placeholder implementation — the model is currently
    returned unchanged.
    """
    return model


# Backward-compatible alias for the original (misspelled) name.
reintialize = reinitialize
# Instantiate the model, then pass it through the re-initialization helper
# with num_layers=2.
model = DefinedModel()
model = reinitialize(model,2)
@dataclass
class GPTNeoxRMOuptput(ModelOutput):
    """Reward-model output container."""

    # Logits field; left as None when not supplied.
    logits: torch.FloatTensor = None
class GPTNeoXRM(GPTNeoXPreTrainedModel):
class RMLoss(nn.Module):
""" """
def __init__(
self,
reduction=None,
beta=0.001,
):
super().__init__()
self.reduction = reduction
class WebGPT:
name = "openai/webgpt_comparisons"
def __init__(self, split: str = "train"):
super().__init__()
self.split = split
dataset = load_dataset(self.name, split=self.split)
self.dataset_dict = defaultdict(dict)
for item in dataset: