Load the full English Wikipedia dataset with the HuggingFace datasets library (formerly nlp)
 import os; import psutil; import timeit from datasets import load_dataset mem_before = psutil.Process(os.getpid()).memory_info().rss >> 20 wiki = load_dataset("wikipedia", "20200501.en", split='train') mem_after = psutil.Process(os.getpid()).memory_info().rss >> 20 print(f"RAM memory used: {(mem_after - mem_before)} MB") s = """batch_size = 1000 for i in range(0, len(wiki), batch_size):
Knowledge Distillation
 import torch import torch.nn as nn from torch.optim import Optimizer KD_loss = nn.KLDivLoss(reduction='batchmean') def kd_step(teacher: nn.Module, student: nn.Module, temperature: float, inputs: torch.tensor, optimizer: Optimizer): teacher.eval() student.train()
Compare the hidden-states of the TensorFlow and PyTorch models
 # Evaluate both models and bring the PyTorch output over to NumPy
 # (moved to CPU and detached from the autograd graph first).
 tensorflow_hidden_states = sess.run(feed_dict)
 pytorch_hidden_states = pytorch_model(inputs).cpu().detach().numpy()

 # Largest element-wise absolute gap between the two sets of hidden-states.
 # Should be less than 1e-3. Typically around 1e-5/1e-6.
 max_absolute_diff = np.amax(
     np.abs(tensorflow_hidden_states - pytorch_hidden_states)
 )
 import os  # needed for os.path.abspath below; the snippet used it without importing it
 import re

 import numpy as np
 import tensorflow as tf

 # Load the un-initialized PyTorch model we have created.
 model = MyPyTorchGPT2()

 # Retrieve weights from the TF checkpoint: resolve the checkpoint location to
 # an absolute path, then list the variables stored in it.
 tf_path = os.path.abspath(gpt2_checkpoint_path)
 init_vars = tf.train.list_variables(tf_path)

 # Starts empty; presumably filled from `init_vars` by code that follows
 # this snippet -- TODO confirm.
 tf_vars = []
Main forward pass for GPT-2
 def forward(self, input_ids):
     """Embed the inputs, run every block in ``self.h`` and apply ``self.ln_f``.

     Args:
         input_ids: Tensor of ids looked up in ``self.wte``. Its last
             dimension is used as the position axis: positions
             ``0 .. input_ids.size(-1) - 1`` are looked up in ``self.wpe``.

     Returns:
         The hidden states produced by ``self.ln_f`` after all blocks ran.
     """
     # Position indices are created on the same device as the inputs and
     # expanded to the shape of `input_ids` so both embeddings line up.
     seq_len = input_ids.size(-1)
     position_ids = torch.arange(0, seq_len, dtype=torch.long, device=input_ids.device)
     position_ids = position_ids.unsqueeze(0).expand_as(input_ids)

     hidden_states = self.wte(input_ids) + self.wpe(position_ids)
     hidden_states = self.drop(hidden_states)

     for block in self.h:
         hidden_states = block(hidden_states)

     hidden_states = self.ln_f(hidden_states)
     # Hand the result back; without this the caller receives None.
     return hidden_states
GPT-2 PyTorch block module
 class Block(nn.Module): def __init__(self, n_ctx, config, scale=False): super(Block, self).__init__() nx = config.n_embd self.ln_1 = LayerNorm(nx, eps=config.layer_norm_epsilon) self.attn = Attention(nx, n_ctx, config, scale) self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon) self.mlp = MLP(4 * nx, config) def forward(self, x):
GPT-2 TensorFlow block function
 def block(x, scope, *, past, hparams):
     """Build one block under the TensorFlow variable scope ``scope``.

     The input is normalised (``'ln_1'``) and passed to ``attn``; the result
     is added back onto ``x``. That sum is normalised again (``'ln_2'``),
     passed to ``mlp`` with four times the input's last dimension, and the
     result is added back once more.

     Args:
         x: Input tensor; the static size of its last dimension is read via
             ``x.shape[-1].value``.
         scope: Name handed to ``tf.variable_scope``.
         past: Forwarded unchanged to ``attn``.
         hparams: Forwarded unchanged to ``attn`` and ``mlp``.

     Returns:
         A pair ``(x, present)``: the updated tensor and the second value
         returned by ``attn``.
     """
     with tf.variable_scope(scope):
         n_state = x.shape[-1].value

         # Attention sub-layer, added back onto its own input.
         attn_out, present = attn(
             norm(x, 'ln_1'), 'attn', n_state, past=past, hparams=hparams
         )
         x = x + attn_out

         # MLP sub-layer (4x wider), added back onto its own input.
         x = x + mlp(norm(x, 'ln_2'), 'mlp', n_state * 4, hparams=hparams)

         return x, present
GPT-2 main model class
 class GPT2Model(nn.Module):
     """Embedding layers, dropout and the stack of ``Block`` modules for GPT-2.

     Args:
         config: Object exposing ``vocab_size``, ``n_embd``, ``n_positions``,
             ``embd_pdrop``, ``n_ctx`` and ``n_layer``. It is also passed on
             to every ``Block``.
     """

     def __init__(self, config):
         # nn.Module.__init__ accepts no configuration argument; forwarding
         # `config` to it raises TypeError before any layer is created.
         super(GPT2Model, self).__init__()

         # Embedding tables sized by vocabulary and by number of positions.
         self.wte = nn.Embedding(config.vocab_size, config.n_embd)
         self.wpe = nn.Embedding(config.n_positions, config.n_embd)
         self.drop = nn.Dropout(config.embd_pdrop)

         # One Block per layer, registered through ModuleList so their
         # parameters belong to this module.
         self.h = nn.ModuleList(
             [Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)]
         )
