Thomas Wolf (thomwolf)

🚂
training
View GitHub Profile
thomwolf / read_checkpoint.py
Created August 8, 2019 17:15
Read a TensorFlow checkpoint
import os
from pprint import pprint
import tensorflow as tf
tf_path = os.path.abspath('./models/117M/model.ckpt') # Path to our TensorFlow checkpoint
tf_vars = tf.train.list_variables(tf_path)  # List of (variable name, shape) tuples stored in the checkpoint
pprint(tf_vars)
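
A hedged follow-up sketch: tf.train.list_variables only returns (name, shape) pairs, so to get the actual weights out of the checkpoint each variable can be read with tf.train.load_variable (same tf_path as above; the weights dictionary is an illustrative name, not part of the gist).

weights = {name: tf.train.load_variable(tf_path, name) for name, _ in tf_vars}  # name -> numpy array
print({name: w.shape for name, w in weights.items()})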
thomwolf / get_gpt_2.sh
Created August 8, 2019 16:56
Retrieve OpenAI GPT-2 model
git clone https://github.com/openai/gpt-2.git
cd gpt-2
python download_model.py 117M
thomwolf / gpt-2-wikitext-103.py
Last active April 16, 2024 19:27
A very small and self-contained gist to train a GPT-2 transformer model on wikitext-103
# Copyright (c) 2019-present, Thomas Wolf.
# All rights reserved. This source code is licensed under the MIT-style license.
""" A very small and self-contained gist to train a GPT-2 transformer model on wikitext-103 """
import os
from collections import namedtuple
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from ignite.engine import Engine, Events
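
The preview above stops at the imports; below is a hedged sketch of what a minimal ignite training loop over a tokenized WikiText-103 DataLoader could look like. model, optimizer, dataloader and device are placeholders rather than names taken from the gist, and the forward signature is an assumption.

def update(engine, batch):
    """One optimization step: next-token prediction loss on a batch of token ids."""
    model.train()
    batch = batch.to(device)                        # LongTensor of token ids, shape [batch, seq_len]
    logits = model(batch[:, :-1])                   # placeholder forward pass: predict each next token
    loss = nn.functional.cross_entropy(logits.reshape(-1, logits.size(-1)),
                                       batch[:, 1:].reshape(-1))
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()
    optimizer.zero_grad()
    return loss.item()

trainer = Engine(update)

@trainer.on(Events.ITERATION_COMPLETED)
def log_loss(engine):
    if engine.state.iteration % 100 == 0:
        print(f"iteration {engine.state.iteration}: loss {engine.state.output:.3f}")

trainer.run(dataloader, max_epochs=1)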
thomwolf / top-k-top-p.py
Last active January 2, 2024 07:43
Sample the next token from a probability distribution using top-k and/or nucleus (top-p) sampling
def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
        Args:
            logits: logits distribution shape (vocabulary size)
            top_k >0: keep only top k tokens with highest probability (top-k filtering).
            top_p >0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
                Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
    """
    assert logits.dim() == 1  # batch size 1 for now - could be updated for more but the code would be less clear
    top_k = min(top_k, logits.size(-1))  # Safety check
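
The preview cuts off after the safety check; here is a self-contained sketch of the filtering logic the docstring describes, together with one way to sample the next token from the filtered distribution (the exact body in the gist may differ slightly).

import torch
import torch.nn.functional as F

def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    assert logits.dim() == 1
    top_k = min(top_k, logits.size(-1))
    if top_k > 0:
        # Remove every token whose logit is below the k-th largest logit
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value
    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        # Remove tokens whose cumulative probability exceeds the threshold ...
        sorted_indices_to_remove = cumulative_probs > top_p
        # ... but shift the mask right so the first token above the threshold is kept
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0
        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        logits[indices_to_remove] = filter_value
    return logits

# Usage: filter the last-position logits, then sample
logits = torch.randn(50257)                                   # e.g. GPT-2 vocabulary size
probs = F.softmax(top_k_top_p_filtering(logits, top_k=40, top_p=0.9), dim=-1)
next_token = torch.multinomial(probs, num_samples=1)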
thomwolf / persona-chat.py
Last active February 28, 2023 07:25
Download and load persona-chat json dataset
import json
from pytorch_pretrained_bert import cached_path
url = "https://s3.amazonaws.com/datasets.huggingface.co/personachat/personachat_self_original.json"
# Download and load JSON dataset
personachat_file = cached_path(url)
with open(personachat_file, "r", encoding="utf-8") as f:
    dataset = json.loads(f.read())
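
A hedged peek at the loaded object, assuming the usual persona-chat layout of "train"/"valid" splits, where each dialog carries a "personality" list and a list of "utterances" with "history" and "candidates" (the last candidate being the gold reply):

print(dataset.keys())                              # e.g. dict_keys(['train', 'valid'])
dialog = dataset["train"][0]
print(dialog["personality"])                       # persona sentences for this dialog
print(dialog["utterances"][0]["history"])          # dialogue history up to this turn
print(dialog["utterances"][0]["candidates"][-1])   # the gold reply is the last candidate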
thomwolf / compute_losses.py
Last active May 3, 2019 09:08
Compute multi-task loss
# Forward pass
lm_loss, mc_loss = model(input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids)
# Total loss as a weighted sum
lm_coef = 2.0
mc_coef = 1.0
total_loss = lm_loss * lm_coef + mc_loss * mc_coef
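
A hedged follow-up continuing the snippet above: once the weighted loss is computed, a standard training step backpropagates it and updates the parameters. The optimizer choice and learning rate are assumptions, not values taken from the gist.

optimizer = torch.optim.Adam(model.parameters(), lr=6.25e-5)  # assumed optimizer and learning rate

optimizer.zero_grad()
total_loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)       # optional gradient clipping
optimizer.step()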
thomwolf / losses.py
Last active May 21, 2019 08:41
Multi-task losses computation
import torch
# Let's add a distractor to our previously defined persona, history and reply
distractor = ["sorry", "to", "hear", "that"]
# Build & tokenize inputs ending with our distractor like we did with the gold reply
words_distractor, segments_distractor, _, _ = build_inputs(persona, history, distractor)
words_distractor = tokenizer.convert_tokens_to_ids(words_distractor)
segments_distractor = tokenizer.convert_tokens_to_ids(segments_distractor)
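
The preview stops here; a hedged sketch of how the gold and distractor sequences could then be padded into batch tensors with a multiple-choice label. words and segments are the gold-reply versions built earlier; pad_id, input_ids, mc_token_ids and mc_labels are illustrative names, and the padding convention is an assumption.

pad_id = tokenizer.convert_tokens_to_ids(["<pad>"])[0]
max_len = max(len(words), len(words_distractor))

def pad(seq, value=pad_id):
    return seq + [value] * (max_len - len(seq))

# One batch, two candidates: candidate 0 is the gold reply, candidate 1 the distractor
input_ids = torch.tensor([[pad(words), pad(words_distractor)]])
token_type_ids = torch.tensor([[pad(segments), pad(segments_distractor)]])
mc_token_ids = torch.tensor([[len(words) - 1, len(words_distractor) - 1]])   # index of each candidate's last token
mc_labels = torch.tensor([0])                                                # the model should pick candidate 0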
thomwolf / add_special_tokens.py
Last active February 5, 2023 03:09
Add special tokens to our model
# We will use 5 special tokens:
# - <bos> to indicate the start of the sequence
# - <eos> to indicate the end of the sequence
# - <speaker1> to indicate the beginning and the tokens of an utterance from the user
# - <speaker2> to indicate the beginning and the tokens of an utterance from the bot
# - <pad> as a padding token to build batches of sequences
SPECIAL_TOKENS = ["<bos>", "<eos>", "<speaker1>", "<speaker2>", "<pad>"]
# We can add these special tokens to the vocabulary and the embeddings of the model:
tokenizer.set_special_tokens(SPECIAL_TOKENS)
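
The preview ends after the tokenizer call; in pytorch_pretrained_bert the model's embedding matrix also needs to grow so the new tokens get vectors. A hedged sketch of the matching model-side call (the exact line in the gist may differ):

# Resize the model's embeddings to make room for the new special tokens
model.set_num_special_tokens(len(SPECIAL_TOKENS))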
thomwolf / build_inputs.py
Last active July 18, 2019 02:39
Build the inputs of the model
from itertools import chain
# Let's define our contexts and special tokens
persona = [["i", "like", "playing", "football", "."],
["i", "am", "from", "NYC", "."]]
history = [["hello", "how", "are", "you", "?"],
["i", "am", "fine", "thanks", "."]]
reply = ["great", "to", "hear"]
bos, eos, speaker1, speaker2 = "<bos>", "<eos>", "<speaker1>", "<speaker2>"
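
The preview ends before the function itself; below is one plausible sketch of a build_inputs that interleaves persona, history and reply with the speaker tokens and returns word, segment and position sequences. The speaker assignment (reply and persona belong to speaker2, history alternates backwards from the reply) is an assumption; the exact implementation in the gist may differ.

def build_inputs(persona, history, reply):
    # Assumption: the reply (and the persona) belong to speaker2, history alternates speakers
    utterances = history + [reply + [eos]]
    speakers = [speaker2 if (len(utterances) - 1 - i) % 2 == 0 else speaker1
                for i in range(len(utterances))]
    sequence = [[bos] + list(chain(*persona))] + [[sp] + u for sp, u in zip(speakers, utterances)]
    words = list(chain(*sequence))                       # flat token sequence fed to the model
    segments = [speaker2] * len(sequence[0])             # persona segment (assumed speaker2)
    for sp, u in zip(speakers, utterances):
        segments += [sp] * (len(u) + 1)                  # +1 for the prepended speaker token
    positions = list(range(len(words)))                  # position indices
    return words, segments, positions, sequence

words, segments, positions, sequence = build_inputs(persona, history, reply)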
thomwolf / create_model_tokenizer.py
Last active May 3, 2019 08:55
Instantiate OpenAI GPT model and tokenizer from pretrained checkpoint
from pytorch_pretrained_bert import OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer
model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
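
A quick, hedged sanity check of the loaded tokenizer (illustrative sentence; tokenize and convert_tokens_to_ids are the standard pytorch_pretrained_bert tokenizer methods):

tokens = tokenizer.tokenize("hello , how are you ?")   # BPE tokens
ids = tokenizer.convert_tokens_to_ids(tokens)          # vocabulary indices
print(tokens)
print(ids)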