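# Benchmark 1: a small regression MLP trained with PyTorch Lightning, comparing
# the wall-clock time of trainer.fit() when the dataset tensors stay on the CPU
# versus when they are pre-loaded onto the GPU via BinaryDataset.cuda().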
# Number of inputs
NIN = 1000
NHID = 10
# Number of examples
EXAMPLES = 100000
import timeit
import logging
from collections import OrderedDict
import pytorch_lightning as pl
from pytorch_lightning import LightningModule
from pytorch_lightning import Trainer
from torch import optim
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
import numpy as np
np.random.seed(0)
X = np.random.random((EXAMPLES, NIN))
Y = np.random.random((EXAMPLES, 1))
class BinaryDataset(torch.utils.data.Dataset):
    def __init__(self, x, y):
        self.x = torch.Tensor(x).float()
        self.y = torch.Tensor(y).float()

    def __len__(self):
        return self.x.shape[0]

    def __getitem__(self, index):
        return self.x[index, :], self.y[index]

    def cuda(self):
        self.x = self.x.to('cuda')
        self.y = self.y.to('cuda')
binaryDataset = BinaryDataset(X, Y)
class BinaryModule(pl.LightningModule):
    def __init__(self, binaryDataset):
        super().__init__()
        self.dataset = binaryDataset
        # build model
        self.__build_model()

    def __build_model(self):
        self.fc1 = nn.Linear(NIN, NHID)
        self.do1 = nn.Dropout(0.2)
        self.out = nn.Linear(NHID, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.do1(x)
        x = self.out(x)
        return x

    def loss(self, pred, true):
        loss_val = F.mse_loss(pred, true)
        return loss_val

    def _step(self, batch, batch_idx, name, training_step=False):
        x, y = batch
        pred = self.forward(x)
        loss_val = self.loss(pred, y)
        # in DP mode (default) make sure if result is scalar, there's another dim in the beginning
        if self.trainer.use_dp or self.trainer.use_ddp2:
            loss_val = loss_val.unsqueeze(0)
        tqdm_dict = OrderedDict({name: loss_val})
        if training_step:
            return OrderedDict({
                'loss': loss_val,
                'progress_bar': tqdm_dict,
                'log': tqdm_dict
            })
        else:
            return tqdm_dict

    def training_step(self, batch, batch_idx):
        return self._step(batch, batch_idx, name="train_loss", training_step=True)

    def _epoch_end(self, outputs, name):
        # With DP training I think you have to average the things individually? Not sure.
        # Look at the pytorch lightning siamese network code
        # if self.trainer.use_dp or self.trainer.use_ddp2:
        #     val_acc = torch.mean(val_acc)
        avg_loss = torch.stack([x[name] for x in outputs]).mean()
        tqdm_dict = {name: avg_loss}
        result = OrderedDict({name: avg_loss, 'progress_bar': tqdm_dict, 'log': tqdm_dict})
        return result

    # ---------------------
    # TRAINING SETUP
    # ---------------------
    def configure_optimizers(self):
        optimizer = optim.SGD(self.parameters(),
                              lr=0.01, momentum=0.90)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                         T_max=10)
        return [optimizer], [scheduler]

    def __dataloader(self, train, dataset):
        # when using multi-node (ddp) we need to add the datasampler
        train_sampler = None
        if self.use_ddp:
            train_sampler = DistributedSampler(dataset)
        should_shuffle = train and train_sampler is None
        loader = DataLoader(
            dataset=dataset,
            batch_size=len(dataset),
            shuffle=should_shuffle,
            sampler=train_sampler,
            num_workers=0,
            drop_last=True
        )
        return loader

    @pl.data_loader
    def train_dataloader(self):
        logging.info('training data loader called')
        return self.__dataloader(train=True, dataset=self.dataset)
def fit():
    trainer_gpu.fit(model_gpu)

# Baseline: dataset tensors stay on the CPU and are transferred batch by batch
model_gpu = BinaryModule(binaryDataset)
trainer_gpu = Trainer(max_epochs=10, gpus=1)
print("Don't load to GPU", timeit.timeit(fit, number=1))

# Pre-load the dataset tensors onto the GPU before training
binaryDataset.cuda()
model_gpu = BinaryModule(binaryDataset)
trainer_gpu = Trainer(max_epochs=10, gpus=1)
print("Load to GPU", timeit.timeit(fit, number=1))
# Number of inputs
NIN = 100
NHID = 1000
# Size of the learned representation
NOUT = 200
# Number of examples
EXAMPLES = 100000
# Batch size
BATCH_SIZE = 1000
import timeit
import logging
from collections import OrderedDict
import pytorch_lightning as pl
from pytorch_lightning import LightningModule
from pytorch_lightning import Trainer
from torch import optim
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
import numpy as np
np.random.seed(0)
X1 = np.random.random((EXAMPLES, NIN))
X2 = np.random.random((EXAMPLES, NIN))
DIST = np.random.random((EXAMPLES,))
class TableDistanceDataset(torch.utils.data.Dataset):
    def __init__(self, x1, x2, dist):
        self.dist = torch.Tensor(dist).float()
        # use the constructor arguments (the original referenced the module-level
        # X1/X2 globals here instead of x1/x2)
        self.X1 = torch.Tensor(x1).float()
        self.X2 = torch.Tensor(x2).float()

    def __len__(self):
        return self.X1.shape[0]

    def __getitem__(self, index):
        return self.X1[index, :], self.X2[index, :], self.dist[index]

    def cuda(self):
        self.dist = self.dist.to('cuda')
        self.X1 = self.X1.to('cuda')
        self.X2 = self.X2.to('cuda')
tableDistanceDataset = TableDistanceDataset(X1, X2, DIST)
class Table2Representation(pl.LightningModule):
    def __init__(self):
        super().__init__()
        # build model
        self.__build_model()

    def __build_model(self):
        self.fc1 = nn.Linear(NIN, NHID)
        self.do1 = nn.Dropout(0.2)
        self.out = nn.Linear(NHID, NOUT)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.do1(x)
        x = self.out(x)
        return x
# Based upon https://github.com/PyTorchLightning/Siamese-Neural-Networks/blob/master/model.py
class TableDistanceModule(pl.LightningModule):
    def __init__(self, tableDistanceDataset):
        super().__init__()
        self.dataset = tableDistanceDataset
        self.table2Representation = Table2Representation()
        # build model
        self.__build_model()

    def __build_model(self):
        pass

    def forward(self, x1, x2):
        z1 = self.table2Representation.forward(x1)
        z2 = self.table2Representation.forward(x2)
        dis = torch.mean(torch.abs(z1 - z2), dim=1)
        return dis

    def loss(self, pred_dists, true_dists):
        loss_val = F.mse_loss(pred_dists, true_dists)
        return loss_val

    def _step(self, batch, batch_idx, name, training_step=False):
        X1, X2, dist = batch
        pred = self.forward(X1, X2)
        loss_val = self.loss(pred, dist)
        # in DP mode (default) make sure if result is scalar, there's another dim in the beginning
        if self.trainer.use_dp or self.trainer.use_ddp2:
            loss_val = loss_val.unsqueeze(0)
        tqdm_dict = OrderedDict({name: loss_val})
        if training_step:
            return OrderedDict({
                'loss': loss_val,
                'progress_bar': tqdm_dict,
                'log': tqdm_dict
            })
        else:
            return tqdm_dict

    def training_step(self, batch, batch_idx):
        return self._step(batch, batch_idx, name="train_loss", training_step=True)

    def _epoch_end(self, outputs, name):
        # With DP training I think you have to average the things individually? Not sure.
        # Look at the pytorch lightning siamese network code
        # if self.trainer.use_dp or self.trainer.use_ddp2:
        #     val_acc = torch.mean(val_acc)
        avg_loss = torch.stack([x[name] for x in outputs]).mean()
        tqdm_dict = {name: avg_loss}
        result = OrderedDict({name: avg_loss, 'progress_bar': tqdm_dict, 'log': tqdm_dict})
        return result

    # ---------------------
    # TRAINING SETUP
    # ---------------------
    def configure_optimizers(self):
        optimizer = optim.SGD(self.parameters(),
                              lr=0.01, momentum=0.90)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                         T_max=10)
        return [optimizer], [scheduler]
    def __dataloader(self, train, dataset):
        # when using multi-node (ddp) we need to add the datasampler
        train_sampler = None
        batch_size = BATCH_SIZE
        if self.use_ddp:
            train_sampler = DistributedSampler(dataset)
        should_shuffle = train and train_sampler is None
        loader = DataLoader(
            dataset=dataset,
            # use the configured mini-batch size (the original passed len(dataset)
            # here, which left batch_size unused)
            batch_size=batch_size,
            shuffle=should_shuffle,
            sampler=train_sampler,
            num_workers=0,
            drop_last=True
        )
        return loader

    @pl.data_loader
    def train_dataloader(self):
        logging.info('training data loader called')
        return self.__dataloader(train=True, dataset=self.dataset)
def fit():
    trainer_gpu.fit(model_gpu)

# Baseline: dataset tensors stay on the CPU
model_gpu = TableDistanceModule(tableDistanceDataset)
trainer_gpu = Trainer(max_epochs=10, gpus=1)
print("Don't load", timeit.timeit(fit, number=1))

# Pre-load the dataset tensors onto the GPU before training
tableDistanceDataset.cuda()
model_gpu = TableDistanceModule(tableDistanceDataset)
trainer_gpu = Trainer(max_epochs=10, gpus=1)
print("Load", timeit.timeit(fit, number=1))