Skip to content

Instantly share code, notes, and snippets.


Thomas Wolf thomwolf

View GitHub Profile
thomwolf /
Last active Dec 28, 2020
Load full English Wikipedia dataset in HuggingFace nlp library
import os; import psutil; import timeit
from datasets import load_dataset
mem_before = psutil.Process(os.getpid()).memory_info().rss >> 20
wiki = load_dataset("wikipedia", "20200501.en", split='train')
mem_after = psutil.Process(os.getpid()).memory_info().rss >> 20
print(f"RAM memory used: {(mem_after - mem_before)} MB")
s = """batch_size = 1000
for i in range(0, len(wiki), batch_size):
thomwolf /
Last active Apr 9, 2021
Knowledge Distilation
import torch
import torch.nn as nn
from torch.optim import Optimizer
KD_loss = nn.KLDivLoss(reduction='batchmean')
def kd_step(teacher: nn.Module, student: nn.Module, temperature: float,
inputs: torch.tensor, optimizer: Optimizer):
thomwolf /
Created Aug 9, 2019
Compare the hidden-states of the TensorFlow and PyTorch models
# Get the tensorflow and pytorch hidden-states as NumPy arrays
tensorflow_hidden_states =
pytorch_hidden_states = pytorch_model(inputs)
pytorch_hidden_states = pytorch_hidden_states.cpu().detach().numpy()
# Compute the maximum absolute difference between hidden-states.
# Should be less than 1e-3. Typically around 1e-5/1e-6.
max_absolute_diff = np.amax(np.abs(tensorflow_hidden_states - pytorch_hidden_states))
thomwolf /
Last active Aug 10, 2019
Loading TensorFlow weights in a PyTorch model
import re
import numpy as np
import tensorflow as tf
model = MyPyTorchGPT2() # load the un-initialized PyTorch model we have created
# Retrieve weights from TF checkpoint
tf_path = os.path.abspath(gpt2_checkpoint_path)
init_vars = tf.train.list_variables(tf_path)
tf_vars = []
thomwolf /
Created Aug 9, 2019
Main forward pass for GPT-2
def forward(self, input_ids):
position_ids = torch.arange(0, input_ids.size(-1), dtype=torch.long, device=input_ids.device)
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
hidden_states = self.wte(input_ids) + self.wpe(position_ids)
hidden_states = self.drop(hidden_states)
for block in self.h:
hidden_states = block(hidden_states)
hidden_states = self.ln_f(hidden_states)
thomwolf /
Created Aug 8, 2019
GPT-2 PyTorch block module
class Block(nn.Module):
def __init__(self, n_ctx, config, scale=False):
super(Block, self).__init__()
nx = config.n_embd
self.ln_1 = LayerNorm(nx, eps=config.layer_norm_epsilon)
self.attn = Attention(nx, n_ctx, config, scale)
self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon)
self.mlp = MLP(4 * nx, config)
def forward(self, x):
thomwolf /
Created Aug 8, 2019
GPT-2 TensorFlow block class
def block(x, scope, *, past, hparams):
with tf.variable_scope(scope):
nx = x.shape[-1].value
a, present = attn(norm(x, 'ln_1'), 'attn', nx, past=past, hparams=hparams)
x = x + a
m = mlp(norm(x, 'ln_2'), 'mlp', nx*4, hparams=hparams)
x = x + m
return x, present
thomwolf /
Last active Aug 9, 2019
GPT-2 main model class
class GPT2Model(nn.Module):
def __init__(self, config):
super(GPT2Model, self).__init__(config)
self.wte = nn.Embedding(config.vocab_size, config.n_embd)
self.wpe = nn.Embedding(config.n_positions, config.n_embd)
self.drop = nn.Dropout(config.embd_pdrop)
self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)])
thomwolf /
Created Aug 8, 2019
Read a TensorFlow checkpoint
import os
from pprint import pprint
import tensorflow as tf
tf_path = os.path.abspath('./models/117M/model.ckpt') # Path to our TensorFlow checkpoint
tf_vars = tf.train.list_variables(tf_path)
thomwolf /
Created Aug 8, 2019
Retrieve OpenAI GPT-2 model
git clone
cd gpt-2
python 117M