GitHub Gists by Thomas Wolf (thomwolf)
thomwolf / loading-weights-gpt-2.py
Last active February 8, 2023 19:01
Loading TensorFlow weights in a PyTorch model
import os
import re
import numpy as np
import tensorflow as tf
model = MyPyTorchGPT2()  # instantiate the un-initialized PyTorch model we have created
# Retrieve the weights from the TF checkpoint
tf_path = os.path.abspath(gpt2_checkpoint_path)
init_vars = tf.train.list_variables(tf_path)
tf_vars = []
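The preview stops here. A minimal sketch of how the conversion loop could continue (the regex-based name walking and the attribute layout of MyPyTorchGPT2 are assumptions for illustration, not the gist's verbatim code):

import torch  # assumed available alongside the model definition
for name, shape in init_vars:
    array = tf.train.load_variable(tf_path, name)
    tf_vars.append((name, array.squeeze()))
for name, array in tf_vars:
    # Walk a TF scope name such as "model/h0/attn/c_attn/w" down the
    # matching attribute path of the PyTorch module
    pointer = model
    for m_name in name.split('/')[1:]:
        match = re.fullmatch(r'([A-Za-z_]+)(\d+)', m_name)
        if match:  # e.g. "h0" -> sub-module list "h", index 0 (assumed naming)
            pointer = getattr(pointer, match.group(1))[int(match.group(2))]
        else:
            pointer = getattr(pointer, m_name)
    assert pointer.shape == array.shape  # sanity-check shapes before copying
    pointer.data = torch.from_numpy(array)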
thomwolf / add_special_tokens.py
Last active February 5, 2023 03:09
Add special tokens to our model
# We will use 5 special tokens:
# - <bos> to mark the start of the sequence
# - <eos> to mark the end of the sequence
# - <speaker1> to mark the beginning and the tokens of a user utterance
# - <speaker2> to mark the beginning and the tokens of a bot utterance
# - <pad> as a padding token so we can build batches of sequences
SPECIAL_TOKENS = ["<bos>", "<eos>", "<speaker1>", "<speaker2>", "<pad>"]
# Add these special tokens to the vocabulary of the tokenizer and to
# the embedding matrix of the model (pytorch-pretrained-bert API):
tokenizer.set_special_tokens(SPECIAL_TOKENS)
model.set_num_special_tokens(len(SPECIAL_TOKENS))
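For reference, with today's transformers library the same step would look roughly like this (a sketch using the current API; the exact token-to-role mapping is an assumption):

tokenizer.add_special_tokens({
    'bos_token': '<bos>', 'eos_token': '<eos>', 'pad_token': '<pad>',
    'additional_special_tokens': ['<speaker1>', '<speaker2>'],
})
model.resize_token_embeddings(len(tokenizer))  # grow the embedding matrix to match the new vocabulary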
thomwolf / datadistributedparallel.py
Last active December 13, 2022 19:15
Using DistributedDataParallel
import argparse
import torch
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
# Each process runs on 1 GPU device, specified by the --local_rank argument.
parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", type=int)
args = parser.parse_args()
# Initialize the distributed backend, which takes care of synchronizing nodes/GPUs
torch.distributed.init_process_group(backend='nccl')
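The preview stops before the model and data are wired up. A minimal sketch of the remaining steps, assuming model and dataset variables exist (both are placeholders):

torch.cuda.set_device(args.local_rank)  # bind this process to its own GPU
device = torch.device('cuda', args.local_rank)
model = model.to(device)
# Wrap the model so gradients are all-reduced across processes after each backward pass
model = torch.nn.parallel.DistributedDataParallel(
    model, device_ids=[args.local_rank], output_device=args.local_rank)
# Give each process its own shard of the dataset
sampler = DistributedSampler(dataset)
dataloader = DataLoader(dataset, sampler=sampler, batch_size=32)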
thomwolf / knowledge_distilation.py
Last active July 12, 2022 22:21
Knowledge Distillation
import torch
import torch.nn as nn
from torch.optim import Optimizer
KD_loss = nn.KLDivLoss(reduction='batchmean')
def kd_step(teacher: nn.Module, student: nn.Module, temperature: float,
            inputs: torch.Tensor, optimizer: Optimizer):
    teacher.eval()
    student.train()
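The function body is cut off in the preview. A sketch of how a distillation step typically continues, using the temperature-softened KL formulation that KD_loss above implies (treating it as the only loss term is an assumption):

    with torch.no_grad():  # the teacher only provides soft targets
        logits_t = teacher(inputs)
    logits_s = student(inputs)
    # KL divergence between temperature-softened distributions
    loss = KD_loss(torch.log_softmax(logits_s / temperature, dim=-1),
                   torch.softmax(logits_t / temperature, dim=-1))
    loss = loss * temperature ** 2  # standard correction for the gradient scale
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()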
thomwolf / pytorch_weight_initialization.py
Created October 3, 2017 11:54
A simple way to reproduce the Keras default initialization in a typical PyTorch NLP model
def init_weights(self):
    """
    Reproduce the Keras default initialization for the Embedding/LSTM weights
    """
    ih = (param.data for name, param in self.named_parameters() if 'weight_ih' in name)
    hh = (param.data for name, param in self.named_parameters() if 'weight_hh' in name)
    b = (param.data for name, param in self.named_parameters() if 'bias' in name)
    nn.init.uniform_(self.embed.weight.data, a=-0.5, b=0.5)
    for t in ih:
        nn.init.xavier_uniform_(t)  # Keras default ('glorot_uniform') for input-to-hidden kernels
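The preview ends mid-initialization. Keras defaults to orthogonal recurrent kernels and zero biases, so the remaining loops would plausibly read:

    for t in hh:
        nn.init.orthogonal_(t)   # Keras default ('orthogonal') for recurrent kernels
    for t in b:
        nn.init.constant_(t, 0)  # Keras default ('zeros') for biases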
thomwolf / attention_layer_pytorch.py
Last active January 25, 2021 00:51
A PyTorch attention layer for the torchMoji model
import torch
from torch.nn import Module, Parameter
class Attention(Module):
    """
    Computes a weighted average of channels across timesteps (1 parameter per channel).
    """
    def __init__(self, attention_size, return_attention=False):
        """ Initialize the attention layer
        # Arguments:
            attention_size: Size of the attention vector.
            return_attention: If true, output will include the weight for each input token
                used for the prediction
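The preview truncates inside __init__. A sketch of how the layer could be completed, following the class docstring (the masked softmax over timesteps is an assumption based on the description, not the gist's verbatim code):

        """
        super(Attention, self).__init__()
        self.attention_size = attention_size
        self.return_attention = return_attention
        # One learnable weight per channel, as the class docstring describes
        self.attention_vector = Parameter(torch.FloatTensor(attention_size))
        torch.nn.init.uniform_(self.attention_vector, -0.05, 0.05)

    def forward(self, inputs, input_lengths):
        # inputs: (batch, timesteps, channels); score each timestep with the attention vector
        logits = inputs.matmul(self.attention_vector)  # (batch, timesteps)
        scores = torch.softmax(logits, dim=1)
        # Zero out padded timesteps, then re-normalize the weights
        idxes = torch.arange(inputs.size(1), device=inputs.device).unsqueeze(0)
        mask = (idxes < input_lengths.unsqueeze(1)).float()
        scores = scores * mask
        scores = scores / scores.sum(dim=1, keepdim=True)
        representations = (inputs * scores.unsqueeze(-1)).sum(dim=1)  # weighted average
        return representations, (scores if self.return_attention else None)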
thomwolf / prepare_packed_sequence.py
Created October 3, 2017 10:55
Prepare a PyTorch PackedSequence for a batch of sequences
import torch
from torch.nn.utils.rnn import pack_padded_sequence
# input_seqs is a batch of input sequences as a numpy array of integers
# (word indices in the vocabulary) padded with zeros
input_seqs = torch.from_numpy(input_seqs.astype('int64'))
# First: order the batch by decreasing sequence length
input_lengths = torch.LongTensor([torch.max(input_seqs[i, :].nonzero()) + 1 for i in range(input_seqs.size(0))])
input_lengths, perm_idx = input_lengths.sort(0, descending=True)
input_seqs = input_seqs[perm_idx][:, :input_lengths.max()]
# Then pack the sequences
packed_input = pack_padded_sequence(input_seqs, input_lengths.cpu().numpy(), batch_first=True)
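A short usage sketch of what typically follows (the lstm module is a placeholder, not part of the gist):

from torch.nn.utils.rnn import pad_packed_sequence
packed_output, (h_n, c_n) = lstm(packed_input)  # run the RNN directly on the packed batch
# Restore a regular zero-padded tensor together with the true lengths
output, output_lengths = pad_packed_sequence(packed_output, batch_first=True)
# If downstream code expects the original sample order, undo the sort
_, unperm_idx = perm_idx.sort(0)
output = output[unperm_idx]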
thomwolf / fast_loop.pyx
Last active January 10, 2021 15:59
A Cython loop on an array of C structs
from cymem.cymem cimport Pool
from random import random
cdef struct Rectangle:
    float w
    float h
cdef int check_rectangles(Rectangle* rectangles, int n_rectangles, float threshold):
    cdef int n_out = 0
    # C arrays contain no size information => we need to give it explicitly
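The loop itself is cut off. A plausible continuation in Cython (counting rectangles whose area exceeds the threshold; the main() allocation mirrors the cymem Pool pattern the import suggests):

    for rectangle in rectangles[:n_rectangles]:
        if rectangle.w * rectangle.h > threshold:
            n_out += 1
    return n_out

def main():
    cdef:
        int n_rectangles = 10000000
        float threshold = 0.25
        Pool mem = Pool()  # cymem frees everything allocated on the Pool when it is garbage-collected
        Rectangle* rectangles = <Rectangle*>mem.alloc(n_rectangles, sizeof(Rectangle))
    for i in range(n_rectangles):
        rectangles[i].w = random()
        rectangles[i].h = random()
    print(check_rectangles(rectangles, n_rectangles, threshold))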
thomwolf / profile.py
Created June 4, 2018 08:31
Profiling a Python module
import cProfile
import pstats
import my_slow_module
# Profile the call and save the statistics to the file 'restats'
cProfile.run('my_slow_module.run()', 'restats')
p = pstats.Stats('restats')
# Show the 30 most expensive entries, sorted by cumulative time
p.sort_stats('cumulative').print_stats(30)
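Other sort keys isolate different bottlenecks; for instance (standard pstats usage, not from the gist):

p.sort_stats('tottime').print_stats(10)  # time spent inside each function itself, top 10
p.print_callers('run')                   # which callers reach functions matching 'run'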
thomwolf / pytorch_training.py
Last active December 18, 2019 07:20
A simple PyTorch training loop
optimizer.zero_grad()                      # Reset gradients (they accumulate otherwise)
predictions = model(inputs)                # Forward pass
loss = loss_function(predictions, labels)  # Compute the loss
loss.backward()                            # Backward pass (populate gradients)
optimizer.step()                           # Optimizer step (update parameters)
predictions = model(inputs)                # Forward pass with the new parameters
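In practice these steps sit inside a loop over batches; a minimal sketch, assuming a DataLoader named dataloader plus the model, optimizer and loss_function from above:

for epoch in range(num_epochs):  # num_epochs is a placeholder
    for inputs, labels in dataloader:
        optimizer.zero_grad()    # reset gradients from the previous step
        loss = loss_function(model(inputs), labels)
        loss.backward()
        optimizer.step()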