Last active
September 25, 2023 02:27
-
-
Save gautierdag/925760d4295080c1860259dba43e4c01 to your computer and use it in GitHub Desktop.
pytorch-lightning Transformer (noam) lr policy
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
import pytorch_lightning as pl | |
class MyTransformer(pl.LightningModule):
    """Example LightningModule wiring the Noam ("Attention Is All You Need")
    learning-rate policy into pytorch-lightning.

    The schedule warms the LR up linearly from 0 to ``learning_rate`` over
    ``warmup`` optimizer steps, then decays it proportionally to
    ``1 / sqrt(step)`` (the two branches meet at ``step == warmup``).
    """

    def __init__(
        self,
        learning_rate=0.001,  # peak learning rate, reached at step == warmup
        warmup=4000,          # number of linear warm-up steps
    ):
        # BUG FIX: the original never called super().__init__().  Skipping it
        # leaves the LightningModule/nn.Module machinery uninitialized, so
        # parameter registration (and hence self.parameters() below) breaks.
        super().__init__()
        self.learning_rate = learning_rate
        self.warmup = warmup

    def configure_optimizers(self):
        """Return Adam plus a per-step Noam LambdaLR schedule.

        Returns:
            ([optimizer], [scheduler_dict]) in the pytorch-lightning
            multi-optimizer format; the dict tells Lightning to step the
            scheduler every batch ("interval": "step").
        """
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)

        def warm_decay(step):
            # Multiplicative factor LambdaLR applies to `learning_rate`.
            # NOTE: step 0 yields a factor of 0 (LR starts at exactly 0),
            # matching the original gist's behavior.
            if step < self.warmup:
                return step / self.warmup            # linear warm-up: 0 -> 1
            return self.warmup ** 0.5 * step ** -0.5  # ~1/sqrt(step) decay

        scheduler = {
            "scheduler": torch.optim.lr_scheduler.LambdaLR(optimizer, warm_decay),
            "interval": "step",  # runs per batch rather than per epoch
            "frequency": 1,
            # "name": "learning_rate",  # uncomment if using LearningRateMonitor
        }
        return [optimizer], [scheduler]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Please refer to espnet's implementation. It notes that the Noam scheduler from "Attention Is All You Need" also scales the learning rate by a `model_size` factor (the hidden size I mentioned in my previous comment), which this gist omits.