Thomas Wolf (thomwolf)

thomwolf / datadistributedparallel.py
Last active December 13, 2022 19:15
Using DistributedDataParallel
import argparse
import torch
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data import DataLoader

# Each process runs on 1 GPU device specified by the local_rank argument.
parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", type=int)
args = parser.parse_args()

# Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.distributed.init_process_group(backend='nccl')
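The preview stops right after the process-group initialization. A minimal sketch of how the setup typically continues, assuming a model, a dataset and a batch_size that are not part of the preview:

torch.cuda.set_device(args.local_rank)        # Pin this process to its own GPU
device = torch.device("cuda", args.local_rank)

model = model.to(device)
# Wrap the model so gradients are all-reduced across processes during backward
model = torch.nn.parallel.DistributedDataParallel(model,
                                                  device_ids=[args.local_rank],
                                                  output_device=args.local_rank)

# Give each process its own shard of the data
sampler = DistributedSampler(dataset)
dataloader = DataLoader(dataset, sampler=sampler, batch_size=batch_size)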
thomwolf / Using_parallel.py
Last active April 9, 2019 12:49
Using a parallel model and a parallel criterion in PyTorch
from parallel import DataParallelModel, DataParallelCriterion
parallel_model = DataParallelModel(model) # Encapsulate the model
parallel_loss = DataParallelCriterion(loss_function) # Encapsulate the loss function
predictions = parallel_model(inputs) # Parallel forward pass
# "predictions" is a tuple of n_gpu tensors
loss = parallel_loss(predictions, labels) # Compute loss function in parallel
loss.backward() # Backward pass
optimizer.step() # Optimizer step
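Conceptually (a sketch, not the actual parallel.py implementation), the criterion wrapper computes the loss next to each GPU's predictions and then averages, instead of first gathering every prediction tensor on a single device:

import torch

def parallel_criterion_sketch(predictions, labels, loss_function):
    # `predictions` is the tuple of per-GPU outputs returned by DataParallelModel
    label_chunks = labels.chunk(len(predictions))
    losses = [loss_function(pred, lab.to(pred.device))
              for pred, lab in zip(predictions, label_chunks)]
    # Bring the per-GPU scalar losses onto one device and average them
    return torch.stack([l.to(losses[0].device) for l in losses]).mean()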
thomwolf / parallel.py
Last active August 8, 2023 15:50
Data Parallelism in PyTorch for modules and losses
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang, Rutgers University, Email: zhang.hang@rutgers.edu
## Modified by Thomas Wolf, HuggingFace Inc., Email: thomas@huggingface.co
## Copyright (c) 2017-2018
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
"""Encoding Data Parallel"""
thomwolf / dataparallel_pytorch.py
Last active November 9, 2018 10:39
PyTorch nn.DataParallel
parallel_model = torch.nn.DataParallel(model) # Encapsulate the model
predictions = parallel_model(inputs) # Forward pass on multi-GPUs
loss = loss_function(predictions, labels) # Compute loss function
loss.mean().backward() # Average GPU-losses + backward pass
optimizer.step() # Optimizer step
predictions = parallel_model(inputs) # Forward pass with new parameters
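A minimal self-contained setup for the snippet above; the tiny model and the shapes are made up for illustration, and it assumes at least two visible CUDA devices over which DataParallel splits the batch along its first dimension:

import torch
import torch.nn as nn

model = nn.Linear(10, 2).cuda()
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

inputs = torch.randn(16, 10).cuda()          # the batch of 16 gets split across the GPUs
labels = torch.randint(0, 2, (16,)).cuda()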
thomwolf / gradient_accumulation.py
Last active January 16, 2024 02:38
PyTorch gradient accumulation training loop
model.zero_grad()                                   # Reset gradient tensors
for i, (inputs, labels) in enumerate(training_set):
    predictions = model(inputs)                     # Forward pass
    loss = loss_function(predictions, labels)       # Compute loss function
    loss = loss / accumulation_steps                # Normalize our loss (if averaged)
    loss.backward()                                 # Backward pass
    if (i+1) % accumulation_steps == 0:             # Wait for several backward steps
        optimizer.step()                            # Now we can do an optimizer step
        model.zero_grad()                           # Reset gradient tensors
        if (i+1) % evaluation_steps == 0:           # Evaluate the model when we...
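A self-contained toy run of the same pattern, with a made-up model and dataset; with accumulation_steps = 4 and mini-batches of 8 examples, every optimizer step corresponds to an effective batch of 32:

import torch
import torch.nn as nn

model = nn.Linear(10, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
loss_function = nn.CrossEntropyLoss()
accumulation_steps = 4
training_set = [(torch.randn(8, 10), torch.randint(0, 2, (8,))) for _ in range(16)]

model.zero_grad()
for i, (inputs, labels) in enumerate(training_set):
    loss = loss_function(model(inputs), labels) / accumulation_steps
    loss.backward()                              # Gradients accumulate in the .grad buffers
    if (i + 1) % accumulation_steps == 0:
        optimizer.step()                         # One update every 4 mini-batches
        model.zero_grad()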
thomwolf / pytorch_training.py
Last active December 18, 2019 07:20
A simple PyTorch training loop
predictions = model(inputs) # Forward pass
loss = loss_function(predictions, labels) # Compute loss function
loss.backward() # Backward pass
optimizer.step() # Optimizer step
predictions = model(inputs) # Forward pass with new parameters
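The preview shows a single iteration; when looping, the gradients also have to be reset before each backward pass, since PyTorch accumulates them. A minimal sketch with the same placeholder names:

for inputs, labels in training_set:
    optimizer.zero_grad()                        # Reset gradients from the previous step
    predictions = model(inputs)                  # Forward pass
    loss = loss_function(predictions, labels)    # Compute loss function
    loss.backward()                              # Backward pass
    optimizer.step()                             # Optimizer step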
thomwolf / AdamW.py
Created July 3, 2018 21:20
Implements Adam algorithm with weight decay fix in PyTorch (paper: https://arxiv.org/abs/1711.05101)
from torch.optim import Optimizer

class AdamW(Optimizer):
    """
    Implements Adam algorithm with weight decay fix in PyTorch
    Paper: Fixing Weight Decay Regularization in Adam by Ilya Loshchilov, Frank Hutter
    https://arxiv.org/abs/1711.05101
    """
    def __init__(self, params, lr, b1=0.9, b2=0.999, e=1e-8, l2=0,
                 vector_l2=False, max_grad_norm=-1, **kwargs):
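A hedged sketch of the fix the class implements (bias correction and the vector_l2 / max_grad_norm options from the signature above are left out): the weight decay multiplies the parameters directly rather than being folded into the gradient as L2, so it stays decoupled from the adaptive Adam step.

import torch

def adamw_step(p, grad, exp_avg, exp_avg_sq,
               lr=1e-3, b1=0.9, b2=0.999, e=1e-8, weight_decay=0.01):
    p.mul_(1 - lr * weight_decay)                            # Decoupled weight decay
    exp_avg.mul_(b1).add_(grad, alpha=1 - b1)                # Biased first moment
    exp_avg_sq.mul_(b2).addcmul_(grad, grad, value=1 - b2)   # Biased second moment
    p.add_(exp_avg / (exp_avg_sq.sqrt() + e), alpha=-lr)     # Adam step (no bias correction here)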
thomwolf / download_big_doc.py
Last active June 7, 2018 08:28
Download and parse an extract of wikitext-2 with ~170k words
import urllib.request
import spacy
with urllib.request.urlopen('https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/valid.txt') as response:
    text = response.read()
nlp = spacy.load('en')
doc_list = list(nlp(text[:800000].decode('utf8')) for i in range(10))
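A quick sanity check on the result (not part of the gist): doc_list should hold ten copies of the same parse, each roughly 170k tokens long.

print(len(doc_list), "docs,", sum(len(doc) for doc in doc_list), "tokens in total")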
thomwolf / fast_cython_loop.pyx
Last active June 11, 2018 13:50
Example of a Cython loop counting the occurrences of the word "run" tagged as a noun by spaCy in a portion of Wikitext-2
%%cython -+
import numpy  # Sometimes we get a "failed to import numpy" compilation error if we don't import numpy here
from cymem.cymem cimport Pool
from spacy.tokens.doc cimport Doc
from spacy.typedefs cimport hash_t
from spacy.structs cimport TokenC
cdef struct DocElement:
    TokenC* c
    int length
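The hash_t values such a loop compares against typically come from spaCy's shared StringStore; a hedged plain-Python sketch (nlp is the pipeline loaded in download_big_doc.py above, the variable names are assumptions):

word_hash = nlp.vocab.strings.add('run')   # hash of the lowercase word form
tag_hash = nlp.vocab.strings.add('NN')     # hash of the fine-grained POS tag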
thomwolf / slow_python_loop.py
Last active June 6, 2018 22:46
Example of a Python loop counting the occurrences of the word "run" tagged as a noun by spaCy in a portion of Wikitext-2
def slow_loop(doc_list, word, tag):
    n_out = 0
    for doc in doc_list:
        for tok in doc:
            if tok.lower_ == word and tok.tag_ == tag:
                n_out += 1
    return n_out

def main_nlp_slow(doc_list):
    n_out = slow_loop(doc_list, 'run', 'NN')
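The preview cuts off here; in a notebook the slow and fast versions are typically compared with %timeit (main_nlp_fast is a hypothetical name for the Cython counterpart above):

# %timeit main_nlp_slow(doc_list)
# %timeit main_nlp_fast(doc_list)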