@tchaton
Created July 20, 2021 15:16
import torch
import torch.nn as nn
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from fairscale.nn import checkpoint_wrapper, auto_wrap, wrap

class MyModel(pl.LightningModule):
    ...

    def configure_sharded_model(self):
        # Modules created inside the sharded-model context are sharded across
        # processes as soon as they are wrapped with ``wrap`` or ``auto_wrap``.

        # ``wrap`` puts the layer in a Fully Sharded wrapper automatically.
        linear_layer = wrap(nn.Linear(32, 32))

        # For best memory efficiency, add FairScale activation checkpointing.
        block = auto_wrap(
            checkpoint_wrapper(
                nn.Sequential(
                    nn.Linear(32, 32),
                    nn.ReLU()
                )
            )
        )

        self.model = nn.Sequential(
            linear_layer,
            nn.ReLU(),
            block
        )

    def configure_optimizers(self):
        return torch.optim.AdamW(self.model.parameters())

model = MyModel()
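# 4 GPUs, FairScale Fully Sharded Data Parallel ('fsdp' plugin), 16-bit precision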
trainer = Trainer(gpus=4, plugins='fsdp', precision=16)
trainer.fit(model)
trainer.test()
trainer.predict()
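
As written, the gist elides the data side: the `...` stands in for the usual LightningModule hooks, so `trainer.fit(model)` has nothing to train on. Below is a minimal, hypothetical completion (the subclass name, dataset size, and shapes are illustrative assumptions, not part of the original gist) that makes the example runnable end-to-end on a 4-GPU machine with FairScale installed:

# Hypothetical completion (not from the original gist): the usual Lightning hooks
# plus random data matching the 32-feature Linear layers defined above.
from torch.utils.data import DataLoader, TensorDataset


class MyRunnableModel(MyModel):
    def training_step(self, batch, batch_idx):
        x, y = batch
        # self.model is built in configure_sharded_model before training starts
        return nn.functional.mse_loss(self.model(x), y)

    def train_dataloader(self):
        # 512 random samples with 32 features, matching nn.Linear(32, 32)
        x = torch.randn(512, 32)
        y = torch.randn(512, 32)
        return DataLoader(TensorDataset(x, y), batch_size=64)


if __name__ == "__main__":
    model = MyRunnableModel()
    trainer = Trainer(gpus=4, plugins="fsdp", precision=16)
    trainer.fit(model)

The point of building the layers in `configure_sharded_model` rather than `__init__` is that each submodule is sharded as it is created, so the full unsharded model never needs to be materialized on a single device.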