GitHub Gists by Thomas Wolf (thomwolf)
thomwolf / loading-weights-gpt-2.py
Last active February 8, 2023 19:01
Loading TensorFlow weights in a PyTorch model
import os
import re
import numpy as np
import tensorflow as tf
model = MyPyTorchGPT2()  # instantiate the un-initialized PyTorch model we have created
# Retrieve the weights from the TF checkpoint
tf_path = os.path.abspath(gpt2_checkpoint_path)
init_vars = tf.train.list_variables(tf_path)
tf_vars = []
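The preview stops here. A minimal sketch of how the conversion loop could continue (the regex-based name walking and the attribute layout of MyPyTorchGPT2 are assumptions for illustration, not the gist's verbatim code):

import torch  # assumed available alongside the model definition
for name, shape in init_vars:
    array = tf.train.load_variable(tf_path, name)
    tf_vars.append((name, array.squeeze()))
for name, array in tf_vars:
    # Walk a TF scope name such as "model/h0/attn/c_attn/w" down the
    # matching attribute path of the PyTorch module
    pointer = model
    for m_name in name.split('/')[1:]:
        match = re.fullmatch(r'([A-Za-z_]+)(\d+)', m_name)
        if match:  # e.g. "h0" -> sub-module list "h", index 0 (assumed naming)
            pointer = getattr(pointer, match.group(1))[int(match.group(2))]
        else:
            pointer = getattr(pointer, m_name)
    assert pointer.shape == array.shape  # sanity-check shapes before copying
    pointer.data = torch.from_numpy(array)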
thomwolf / add_special_tokens.py
Last active February 5, 2023 03:09
Add special tokens to our model
# We will use 5 special tokens:
# - <bos> to mark the start of the sequence
# - <eos> to mark the end of the sequence
# - <speaker1> to mark the beginning and the tokens of a user utterance
# - <speaker2> to mark the beginning and the tokens of a bot utterance
# - <pad> as a padding token so we can build batches of sequences
SPECIAL_TOKENS = ["<bos>", "<eos>", "<speaker1>", "<speaker2>", "<pad>"]
# Add these special tokens to the vocabulary of the tokenizer and to
# the embedding matrix of the model (pytorch-pretrained-bert API):
tokenizer.set_special_tokens(SPECIAL_TOKENS)
model.set_num_special_tokens(len(SPECIAL_TOKENS))
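For reference, with today's transformers library the same step would look roughly like this (a sketch using the current API; the exact token-to-role mapping is an assumption):

tokenizer.add_special_tokens({
    'bos_token': '<bos>', 'eos_token': '<eos>', 'pad_token': '<pad>',
    'additional_special_tokens': ['<speaker1>', '<speaker2>'],
})
model.resize_token_embeddings(len(tokenizer))  # grow the embedding matrix to match the new vocabulary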
thomwolf / datadistributedparallel.py
Last active December 13, 2022 19:15
Using DistributedDataParallel
import argparse
import torch
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
# Each process runs on 1 GPU device, specified by the --local_rank argument.
parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", type=int)
args = parser.parse_args()
# Initialize the distributed backend, which takes care of synchronizing nodes/GPUs
torch.distributed.init_process_group(backend='nccl')
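The preview stops before the model and data are wired up. A minimal sketch of the remaining steps, assuming model and dataset variables exist (both are placeholders):

torch.cuda.set_device(args.local_rank)  # bind this process to its own GPU
device = torch.device('cuda', args.local_rank)
model = model.to(device)
# Wrap the model so gradients are all-reduced across processes after each backward pass
model = torch.nn.parallel.DistributedDataParallel(
    model, device_ids=[args.local_rank], output_device=args.local_rank)
# Give each process its own shard of the dataset
sampler = DistributedSampler(dataset)
dataloader = DataLoader(dataset, sampler=sampler, batch_size=32)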
thomwolf / knowledge_distilation.py
Last active July 12, 2022 22:21
Knowledge Distillation
import torch
import torch.nn as nn
from torch.optim import Optimizer
KD_loss = nn.KLDivLoss(reduction='batchmean')
def kd_step(teacher: nn.Module, student: nn.Module, temperature: float,
            inputs: torch.Tensor, optimizer: Optimizer):
    teacher.eval()
    student.train()
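The function body is cut off in the preview. A sketch of how a distillation step typically continues, using the temperature-softened KL formulation that KD_loss above implies (treating it as the only loss term is an assumption):

    with torch.no_grad():  # the teacher only provides soft targets
        logits_t = teacher(inputs)
    logits_s = student(inputs)
    # KL divergence between temperature-softened distributions
    loss = KD_loss(torch.log_softmax(logits_s / temperature, dim=-1),
                   torch.softmax(logits_t / temperature, dim=-1))
    loss = loss * temperature ** 2  # standard correction for the gradient scale
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()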
thomwolf / pytorch_weight_initialization.py
Created October 3, 2017 11:54
A simple way to reproduce the Keras default initialization in a typical PyTorch NLP model
def init_weights(self):
    """
    Reproduce the Keras default initialization for the Embedding/LSTM weights
    """
    ih = (param.data for name, param in self.named_parameters() if 'weight_ih' in name)
    hh = (param.data for name, param in self.named_parameters() if 'weight_hh' in name)
    b = (param.data for name, param in self.named_parameters() if 'bias' in name)
    nn.init.uniform_(self.embed.weight.data, a=-0.5, b=0.5)
    for t in ih:
        nn.init.xavier_uniform_(t)  # Keras default ('glorot_uniform') for input-to-hidden kernels
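The preview ends mid-initialization. Keras defaults to orthogonal recurrent kernels and zero biases, so the remaining loops would plausibly read:

    for t in hh:
        nn.init.orthogonal_(t)   # Keras default ('orthogonal') for recurrent kernels
    for t in b:
        nn.init.constant_(t, 0)  # Keras default ('zeros') for biases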
thomwolf / attention_layer_pytorch.py
Last active January 25, 2021 00:51
A PyTorch attention layer for the torchMoji model
import torch
from torch.nn import Module, Parameter
class Attention(Module):
    """
    Computes a weighted average of channels across timesteps (1 parameter per channel).
    """
    def __init__(self, attention_size, return_attention=False):
        """ Initialize the attention layer
        # Arguments:
            attention_size: Size of the attention vector.
            return_attention: If true, output will include the weight for each input token
                used for the prediction
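The preview truncates inside __init__. A sketch of how the layer could be completed, following the class docstring (the masked softmax over timesteps is an assumption based on the description, not the gist's verbatim code):

        """
        super(Attention, self).__init__()
        self.attention_size = attention_size
        self.return_attention = return_attention
        # One learnable weight per channel, as the class docstring describes
        self.attention_vector = Parameter(torch.FloatTensor(attention_size))
        torch.nn.init.uniform_(self.attention_vector, -0.05, 0.05)

    def forward(self, inputs, input_lengths):
        # inputs: (batch, timesteps, channels); score each timestep with the attention vector
        logits = inputs.matmul(self.attention_vector)  # (batch, timesteps)
        scores = torch.softmax(logits, dim=1)
        # Zero out padded timesteps, then re-normalize the weights
        idxes = torch.arange(inputs.size(1), device=inputs.device).unsqueeze(0)
        mask = (idxes < input_lengths.unsqueeze(1)).float()
        scores = scores * mask
        scores = scores / scores.sum(dim=1, keepdim=True)
        representations = (inputs * scores.unsqueeze(-1)).sum(dim=1)  # weighted average
        return representations, (scores if self.return_attention else None)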
thomwolf / prepare_packed_sequence.py
Created October 3, 2017 10:55
Prepare a PyTorch PackedSequence for a batch of sequences
import torch
from torch.nn.utils.rnn import pack_padded_sequence
# input_seqs is a batch of input sequences as a numpy array of integers
# (word indices in the vocabulary) padded with zeros
input_seqs = torch.from_numpy(input_seqs.astype('int64'))
# First: order the batch by decreasing sequence length
input_lengths = torch.LongTensor([torch.max(input_seqs[i, :].nonzero()) + 1 for i in range(input_seqs.size(0))])
input_lengths, perm_idx = input_lengths.sort(0, descending=True)
input_seqs = input_seqs[perm_idx][:, :input_lengths.max()]
# Then pack the sequences
packed_input = pack_padded_sequence(input_seqs, input_lengths.cpu().numpy(), batch_first=True)
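A short usage sketch of what typically follows (the lstm module is a placeholder, not part of the gist):

from torch.nn.utils.rnn import pad_packed_sequence
packed_output, (h_n, c_n) = lstm(packed_input)  # run the RNN directly on the packed batch
# Restore a regular zero-padded tensor together with the true lengths
output, output_lengths = pad_packed_sequence(packed_output, batch_first=True)
# If downstream code expects the original sample order, undo the sort
_, unperm_idx = perm_idx.sort(0)
output = output[unperm_idx]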
thomwolf / fast_loop.pyx
Last active January 10, 2021 15:59
A Cython loop on an array of C structs
from cymem.cymem cimport Pool
from random import random
cdef struct Rectangle:
    float w
    float h
cdef int check_rectangles(Rectangle* rectangles, int n_rectangles, float threshold):
    cdef int n_out = 0
    # C arrays contain no size information => we need to give it explicitly
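The loop itself is cut off. A plausible continuation in Cython (counting rectangles whose area exceeds the threshold; the main() allocation mirrors the cymem Pool pattern the import suggests):

    for rectangle in rectangles[:n_rectangles]:
        if rectangle.w * rectangle.h > threshold:
            n_out += 1
    return n_out

def main():
    cdef:
        int n_rectangles = 10000000
        float threshold = 0.25
        Pool mem = Pool()  # cymem frees everything allocated on the Pool when it is garbage-collected
        Rectangle* rectangles = <Rectangle*>mem.alloc(n_rectangles, sizeof(Rectangle))
    for i in range(n_rectangles):
        rectangles[i].w = random()
        rectangles[i].h = random()
    print(check_rectangles(rectangles, n_rectangles, threshold))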
thomwolf / profile.py
Created June 4, 2018 08:31
Profiling a Python module
import cProfile
import pstats
import my_slow_module
# Profile the call and save the statistics to the file 'restats'
cProfile.run('my_slow_module.run()', 'restats')
p = pstats.Stats('restats')
# Show the 30 most expensive entries, sorted by cumulative time
p.sort_stats('cumulative').print_stats(30)
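Other sort keys isolate different bottlenecks; for instance (standard pstats usage, not from the gist):

p.sort_stats('tottime').print_stats(10)  # time spent inside each function itself, top 10
p.print_callers('run')                   # which callers reach functions matching 'run'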
thomwolf / pytorch_training.py
Last active December 18, 2019 07:20
A simple PyTorch training loop
optimizer.zero_grad()                      # Reset gradients (they accumulate otherwise)
predictions = model(inputs)                # Forward pass
loss = loss_function(predictions, labels)  # Compute the loss
loss.backward()                            # Backward pass (populate gradients)
optimizer.step()                           # Optimizer step (update parameters)
predictions = model(inputs)                # Forward pass with the new parameters
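In practice these steps sit inside a loop over batches; a minimal sketch, assuming a DataLoader named dataloader plus the model, optimizer and loss_function from above:

for epoch in range(num_epochs):  # num_epochs is a placeholder
    for inputs, labels in dataloader:
        optimizer.zero_grad()    # reset gradients from the previous step
        loss = loss_function(model(inputs), labels)
        loss.backward()
        optimizer.step()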