This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def tokenizer(corpus, path, mode="train"): | |
""" | |
corpus : the corpus to tokenize and pad | |
path : tokenizer path | |
mode : train/test | |
""" | |
if not path.endswith(".pickle"): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class TweetModel(nn.Module): | |
def __init__(self, embedding_matrix, lstm_hidden_size=200, gru_hidden_size=128): | |
super(TweetModel, self).__init__() | |
self.embedding = nn.Embedding(*embedding_matrix.shape) | |
self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32)) | |
self.embedding.weight.requires_grad = True | |
self.embedding_dropout = nn.Dropout2d(0.1) | |
self.gru = nn.GRU( |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def preprocess_text(text: list):
    """Split every document of a corpus into whitespace-separated tokens.

    Args:
        text: Iterable of strings, one document per element.

    Returns:
        A list with one entry per input document, each entry being the
        list of tokens produced by ``str.split()``. Blank or empty
        documents therefore map to an empty list.
    """
    # A distinct loop name avoids shadowing the ``text`` parameter, and
    # ``str.split()`` already returns a fresh list, so no inner copy is needed.
    return [document.split() for document in text]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def prepare_matrix(word_index, name, dim): | |
embedding_dict = GloVe(name, dim=dim) | |
num_words = len(word_index) | |
embedding_matrix = np.zeros((num_words + 1, dim)) | |
for word, i in word_index.items(): | |
if i > num_words: | |
continue |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from torch import nn as nn | |
from transformers import AutoModel, AutoTokenizer | |
# Load the pretrained backbone, then wrap it in the project-specific model.
base_model = AutoModel.from_pretrained("bert-base")
Model = MyModel(base_model)
def train_and_save(): | |
""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def llrd(model, peak_lr, multiplicative_factor):
    """Set a learning rate for each layer and return the parameters.

    NOTE(review): the body shown here is a sketch. It does not use
    ``model``, ``peak_lr`` or ``multiplicative_factor``; it simply returns
    whatever ``get_model_parameters()`` yields. The per-layer learning-rate
    assignment is presumably meant to happen here -- confirm with the author.

    Args:
        model: Model whose parameters are to be grouped (unused in this body).
        peak_lr: Peak learning rate (unused in this body).
        multiplicative_factor: Per-layer multiplier (unused in this body).
            The original signature spelled this with a space, which is not
            a valid identifier.

    Returns:
        The object returned by ``get_model_parameters()``.
    """
    parameters = get_model_parameters()
    return parameters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def reinitialize(model, num_layers):
    """Reinitialize the model's weights up to ``num_layers``, starting from the bottom layer.

    NOTE(review): the body shown here is a stub. It performs no
    re-initialization and returns ``model`` unchanged.

    Args:
        model: Model whose layers are to be re-initialized.
        num_layers: Number of layers to reset (unused in this body).

    Returns:
        The same ``model`` object that was passed in.
    """
    return model


# Backward-compatible alias: the function was originally defined under this
# misspelled name while callers use ``reinitialize``.
reintialize = reinitialize
# Build the model, then pass it through reinitialize with num_layers=2.
model = DefinedModel()
model = reinitialize(model, 2)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
@dataclass
class GPTNeoxRMOuptput(ModelOutput):
    """Output container for the reward model."""

    # Logits of the reward model; None until populated.
    logits: torch.FloatTensor = None
class GPTNeoXRM(GPTNeoXPreTrainedModel): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class RMLoss(nn.Module): | |
""" """ | |
def __init__( | |
self, | |
reduction=None, | |
beta=0.001, | |
): | |
super().__init__() | |
self.reduction = reduction |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class WebGPT: | |
name = "openai/webgpt_comparisons" | |
def __init__(self, split: str = "train"): | |
super().__init__() | |
self.split = split | |
dataset = load_dataset(self.name, split=self.split) | |
self.dataset_dict = defaultdict(dict) | |
for item in dataset: |
OlderNewer