"""
pip install --quiet transformers annoy opacus nltk torchtext spacy
"""
import json
import os
import nltk
import numpy as np
import pandas as pd
import spacy
nltk.download("punkt")
import torch
from annoy import AnnoyIndex
from torchtext import datasets
from torchtext.legacy import data
from torchtext.legacy.data import BucketIterator
from torchtext.vocab import GloVe, Vectors
from torchtext.vocab import vocab as Vocab
from tqdm.notebook import tqdm
from transformers import BertForQuestionAnswering, BertTokenizer
tqdm.pandas()
from torch.utils.data import (
BatchSampler,
DataLoader,
Dataset,
RandomSampler,
SequentialSampler,
)
class Config(object):
embed_size = 300
hidden_layers = 3
input_dim = 1150
hidden_dim = 575
# hidden_size = 32
bidirectional = True
output_size = 2 # 4
max_epochs = 5
tie_dims = False
batch_size = 64
max_length = None
vocab_size = 50000
pretrained_embedding = "glove.6B.100d"
freeze = True
embed_dp = 0.05
weight_dp = 0.5
locked_dpi = 0.4
locked_dph = 0.3
locked_dpo = 0.4
lr = 1e-3 # 1e-4
betas = (0.7, 0.99) # Deep Biaffine Attention for Neural Dependency Parsing
weight_decay = 0
momentum = 0.9 # SGD only
# STEPLR
step_size = 1
gamma = 0.5
# Slanted Triangular Scheduling
num_cycles = 0.5
num_training_steps = 25000 / batch_size * max_epochs
num_warmup_steps = num_training_steps * 0.1
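    # Worked example (not in the original gist): with batch_size = 64 and
    # max_epochs = 5 this gives num_training_steps = 25000 / 64 * 5 ≈ 1953
    # optimizer steps and num_warmup_steps ≈ 195 (a 10% warmup).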
## PRIVACY (ε ≈ 8)
data_noise = 10
model_noise = 0.45
label_noise = 0.05 # 1 / epsilon
delta = 1e-5 # 1 / len(data)
rdp_alphas = [1 + x / 10.0 for x in range(1, 100)] + list(range(11, 100))
grad_clipping = 1.0
grad_accumulation = 1
# PATE
query_dataset = 500 # [100, 500, 1000]
teacher_count = 2 # 100 # [10, 100, 250]
teacher_epochs = 1 # 5
teacher_batch_size = 32
teacher_intervall = 1
student_count = 2
student_epochs = 15
student_batch_size = 32
student_intervall = 1
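# A minimal sketch (assumption, not wired up anywhere in this gist): the
# warmup/cycle fields above match the signature of the Hugging Face scheduler
# helpers, e.g.
#
#   from transformers import get_cosine_schedule_with_warmup
#   scheduler = get_cosine_schedule_with_warmup(
#       optimizer,
#       num_warmup_steps=int(Config.num_warmup_steps),
#       num_training_steps=int(Config.num_training_steps),
#       num_cycles=Config.num_cycles,
#   )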
import torch
import torch.nn as nn
import torch.nn.functional as F
from opacus.layers import DPLSTM as LSTM
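# Opacus cannot compute per-sample gradients through the fused cuDNN nn.LSTM,
# so DPLSTM re-implements the LSTM in a per-sample-gradient-friendly way and
# is used as a drop-in replacement whenever private=True below.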
class BiDAF(nn.Module):
def __init__(
self, config, char_size, word_size, embed_size, embedding, private=False
):
super(BiDAF, self).__init__()
self.config = config
# 1. Character Embedding Layer
self.char_emb = nn.Embedding(char_size, 8, padding_idx=1)
nn.init.uniform_(self.char_emb.weight, -0.001, 0.001)
self.char_conv = nn.Sequential(nn.Conv2d(1, 100, (8, 5)), nn.ReLU())
# 2. Word Embedding Layer
# initialize word embedding with GloVe
self.word_emb = nn.Embedding(word_size, embed_size).from_pretrained(
embedding, freeze=self.config.freeze
)
# highway network
# assert self.args.hidden_size * 2 == (self.args.char_channel_size + self.args.word_dim)
for i in range(2):
setattr(
self,
"highway_linear{}".format(i),
nn.Sequential(nn.Linear(100 * 2, 100 * 2), nn.ReLU()),
)
setattr(
self,
"highway_gate{}".format(i),
nn.Sequential(nn.Linear(100 * 2, 100 * 2), nn.Sigmoid()),
            )
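        # The two highway layers above gate the concatenated char-CNN and word
        # embedding channels: y = g * ReLU(W_h x) + (1 - g) * x with
        # g = sigmoid(W_g x); see highway_network() in forward().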
# 3. Contextual Embedding Layer
## add dropout
        if private:
self.context_LSTM = LSTM(
input_size=100 * 2,
hidden_size=100,
bidirectional=True,
batch_first=True,
)
        else:
self.context_LSTM = nn.LSTM(
input_size=100 * 2,
hidden_size=100,
bidirectional=True,
batch_first=True,
)
# 4. Attention Flow Layer
self.att_weight_c = nn.Linear(100 * 2, 1)
self.att_weight_q = nn.Linear(100 * 2, 1)
self.att_weight_cq = nn.Linear(100 * 2, 1)
# 5. Modeling Layer
        if private:
self.modeling_LSTM1 = LSTM(
input_size=100 * 8,
hidden_size=100,
bidirectional=True,
batch_first=True,
)
self.modeling_LSTM2 = LSTM(
input_size=100 * 2,
hidden_size=100,
bidirectional=True,
batch_first=True,
)
        else:
self.modeling_LSTM1 = nn.LSTM(
input_size=100 * 8,
hidden_size=100,
bidirectional=True,
batch_first=True,
)
self.modeling_LSTM2 = nn.LSTM(
input_size=100 * 2,
hidden_size=100,
bidirectional=True,
batch_first=True,
)
# 6. Output Layer
self.p1_weight_g = nn.Linear(100 * 8, 1)
self.p1_weight_m = nn.Linear(100 * 2, 1)
self.p2_weight_g = nn.Linear(100 * 8, 1)
self.p2_weight_m = nn.Linear(100 * 2, 1)
        if private:
self.output_LSTM = LSTM(
input_size=100 * 2,
hidden_size=100,
bidirectional=True,
batch_first=True,
)
        else:
self.output_LSTM = nn.LSTM(
input_size=100 * 2,
hidden_size=100,
bidirectional=True,
batch_first=True,
)
self.dropout = nn.Dropout(p=0.2)
self.softmax = nn.Softmax(dim=1)
def forward(self, batch):
def char_emb_layer(x):
"""
:param x: (batch, seq_len, word_len)
:return: (batch, seq_len, char_channel_size)
"""
batch_size = x.size(0)
# (batch, seq_len, word_len, char_dim)
x = self.dropout(self.char_emb(x))
# (batch, seq_len, char_dim, word_len)
x = x.transpose(2, 3)
# (batch * seq_len, 1, char_dim, word_len)
x = x.view(-1, 8, x.size(3)).unsqueeze(1)
# (batch * seq_len, char_channel_size, 1, conv_len) -> (batch * seq_len, char_channel_size, conv_len)
x = self.char_conv(x).squeeze()
# (batch * seq_len, char_channel_size, 1) -> (batch * seq_len, char_channel_size)
x = F.max_pool1d(x, x.size(2)).squeeze()
# (batch, seq_len, char_channel_size)
x = x.view(batch_size, -1, 100)
return x
def highway_network(x1, x2):
"""
:param x1: (batch, seq_len, char_channel_size)
:param x2: (batch, seq_len, word_dim)
:return: (batch, seq_len, hidden_size * 2)
"""
# (batch, seq_len, char_channel_size + word_dim)
x = torch.cat([x1, x2], dim=-1)
for i in range(2):
h = getattr(self, "highway_linear{}".format(i))(x)
g = getattr(self, "highway_gate{}".format(i))(x)
x = g * h + (1 - g) * x
# (batch, seq_len, hidden_size * 2)
return x
def att_flow_layer(c, q):
"""
:param c: (batch, c_len, hidden_size * 2)
:param q: (batch, q_len, hidden_size * 2)
:return: (batch, c_len, q_len)
"""
c_len = c.size(1)
q_len = q.size(1)
# (batch, c_len, q_len, hidden_size * 2)
# c_tiled = c.unsqueeze(2).expand(-1, -1, q_len, -1)
# (batch, c_len, q_len, hidden_size * 2)
# q_tiled = q.unsqueeze(1).expand(-1, c_len, -1, -1)
# (batch, c_len, q_len, hidden_size * 2)
# cq_tiled = c_tiled * q_tiled
# cq_tiled = c.unsqueeze(2).expand(-1, -1, q_len, -1) * q.unsqueeze(1).expand(-1, c_len, -1, -1)
cq = []
for i in range(q_len):
# (batch, 1, hidden_size * 2)
qi = q.select(1, i).unsqueeze(1)
# (batch, c_len, 1)
ci = self.att_weight_cq(c * qi).squeeze()
cq.append(ci)
# (batch, c_len, q_len)
cq = torch.stack(cq, dim=-1)
# (batch, c_len, q_len)
s = (
self.att_weight_c(c).expand(-1, -1, q_len)
+ self.att_weight_q(q).permute(0, 2, 1).expand(-1, c_len, -1)
+ cq
)
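            # Together, the three linear terms implement the BiDAF similarity
            # S_ij = w^T [c_i; q_j; c_i * q_j] without materializing the full
            # (batch, c_len, q_len, hidden_size * 2) tensor.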
# (batch, c_len, q_len)
a = F.softmax(s, dim=2)
# (batch, c_len, q_len) * (batch, q_len, hidden_size * 2) -> (batch, c_len, hidden_size * 2)
c2q_att = torch.bmm(a, q)
# (batch, 1, c_len)
b = F.softmax(torch.max(s, dim=2)[0], dim=1).unsqueeze(1)
# (batch, 1, c_len) * (batch, c_len, hidden_size * 2) -> (batch, hidden_size * 2)
q2c_att = torch.bmm(b, c).squeeze()
# (batch, c_len, hidden_size * 2) (tiled)
q2c_att = q2c_att.unsqueeze(1).expand(-1, c_len, -1)
# q2c_att = torch.stack([q2c_att] * c_len, dim=1)
# (batch, c_len, hidden_size * 8)
x = torch.cat([c, c2q_att, c * c2q_att, c * q2c_att], dim=-1)
return x
        def output_layer(g, m, l):
            """
            :param g: (batch, c_len, hidden_size * 8)
            :param m: (batch, c_len, hidden_size * 2)
            :param l: (batch,) context lengths
            :return: p1: (batch, c_len), p2: (batch, c_len)
            """
# (batch, c_len)
p1 = (self.p1_weight_g(g) + self.p1_weight_m(m)).squeeze()
# (batch, c_len, hidden_size * 2)
x, x_len = m, l
x = self.dropout(x)
x_len_sorted, x_idx = torch.sort(x_len, descending=True)
x_sorted = x.index_select(dim=0, index=x_idx)
_, x_ori_idx = torch.sort(x_idx)
x_packed = nn.utils.rnn.pack_padded_sequence(
x_sorted, x_len_sorted.cpu(), batch_first=True
)
x_packed, (h, _) = self.output_LSTM(x_packed)
x = nn.utils.rnn.pad_packed_sequence(x_packed, batch_first=True)[0]
x = x.index_select(dim=0, index=x_ori_idx)
h = (
h.permute(1, 0, 2)
.contiguous()
.view(-1, h.size(0) * h.size(2))
.squeeze()
)
h = h.index_select(dim=0, index=x_ori_idx)
m2 = x # m2 = self.output_LSTM((m, l))[0]
# (batch, c_len)
p2 = (self.p2_weight_g(g) + self.p2_weight_m(m2)).squeeze()
return p1, p2 # self.softmax(p1), self.softmax(p2)
# 1. Character Embedding Layer
# print('char_emb_layer - c')
c_char = char_emb_layer(batch[0])
# c_char = char_emb_layer(batch.context_char)
# print(c_char.shape)
# print('char_emb_layer - q')
q_char = char_emb_layer(batch[3])
# q_char = char_emb_layer(batch.question_char)
# print(q_char.shape)
# 2. Word Embedding Layer
# print('word_emb_layer - c')
c_word = self.word_emb(batch[1])
# c_word = self.word_emb(batch.context_word[0])
# print(c_word.shape)
# print('word_emb_layer - q')
q_word = self.word_emb(batch[4])
# q_word = self.word_emb(batch.question_word[0])
# print(q_word.shape)
c_lens = batch[2]
q_lens = batch[5]
# c_lens = batch.context_word[1]
# q_lens = batch.question_word[1]
        # Highway network (combine char- and word-level information)
# print('highway_network - c')
c = highway_network(c_char, c_word)
# print(c.shape)
# print('highway_network - q')
q = highway_network(q_char, q_word)
# print(q.shape)
# 3. Contextual Embedding Layer
# print('Contextual Embedding - c')
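        # Each (DP)LSTM call below follows the same pattern: sort sequences by
        # length, pack them, run the LSTM, pad the output back, and restore the
        # original batch order via x_ori_idx.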
x, x_len = c, c_lens
x_len_sorted, x_idx = torch.sort(x_len, descending=True)
x_sorted = x.index_select(dim=0, index=x_idx)
_, x_ori_idx = torch.sort(x_idx)
x_packed = nn.utils.rnn.pack_padded_sequence(
x_sorted, x_len_sorted.cpu(), batch_first=True
)
x_packed, (h, _) = self.context_LSTM(x_packed)
x = nn.utils.rnn.pad_packed_sequence(x_packed, batch_first=True)[0]
x = x.index_select(dim=0, index=x_ori_idx)
h = h.permute(1, 0, 2).contiguous().view(-1, h.size(0) * h.size(2)).squeeze()
h = h.index_select(dim=0, index=x_ori_idx)
c = x # c = self.context_LSTM((c, c_lens))[0]
# print(c.shape)
# print('Contextual Embedding - q')
x, x_len = q, q_lens
x = self.dropout(x)
x_len_sorted, x_idx = torch.sort(x_len, descending=True)
x_sorted = x.index_select(dim=0, index=x_idx)
_, x_ori_idx = torch.sort(x_idx)
x_packed = nn.utils.rnn.pack_padded_sequence(
x_sorted, x_len_sorted.cpu(), batch_first=True
)
x_packed, (h, _) = self.context_LSTM(x_packed)
x = nn.utils.rnn.pad_packed_sequence(x_packed, batch_first=True)[0]
x = x.index_select(dim=0, index=x_ori_idx)
h = h.permute(1, 0, 2).contiguous().view(-1, h.size(0) * h.size(2)).squeeze()
h = h.index_select(dim=0, index=x_ori_idx)
q = x # q = self.context_LSTM((q, q_lens))[0]
# print(q.shape)
# 4. Attention Flow Layer
# print('Attention Flow - c and q')
g = att_flow_layer(c, q)
# print(g.shape)
# 5. Modeling Layer
# print('Modeling Layer - c and q')
x, x_len = g, c_lens
x = self.dropout(x)
x_len_sorted, x_idx = torch.sort(x_len, descending=True)
x_sorted = x.index_select(dim=0, index=x_idx)
_, x_ori_idx = torch.sort(x_idx)
x_packed = nn.utils.rnn.pack_padded_sequence(
x_sorted, x_len_sorted.cpu(), batch_first=True
)
x_packed, (h, _) = self.modeling_LSTM1(x_packed)
x = nn.utils.rnn.pad_packed_sequence(x_packed, batch_first=True)[0]
x = x.index_select(dim=0, index=x_ori_idx)
h = h.permute(1, 0, 2).contiguous().view(-1, h.size(0) * h.size(2)).squeeze()
h = h.index_select(dim=0, index=x_ori_idx)
m = x
x, x_len = m, c_lens
x = self.dropout(x)
x_len_sorted, x_idx = torch.sort(x_len, descending=True)
x_sorted = x.index_select(dim=0, index=x_idx)
_, x_ori_idx = torch.sort(x_idx)
x_packed = nn.utils.rnn.pack_padded_sequence(
x_sorted, x_len_sorted.cpu(), batch_first=True
)
x_packed, (h, _) = self.modeling_LSTM2(x_packed)
x = nn.utils.rnn.pad_packed_sequence(x_packed, batch_first=True)[0]
x = x.index_select(dim=0, index=x_ori_idx)
h = h.permute(1, 0, 2).contiguous().view(-1, h.size(0) * h.size(2)).squeeze()
h = h.index_select(dim=0, index=x_ori_idx)
m = x
# print(m.shape)
# 6. Output Layer
# print('Output Layer')
p1, p2 = output_layer(g, m, c_lens)
# print(p1.shape)
# print(p2.shape)
# (batch, c_len), (batch, c_len)
return p1, p2
architecture = None
perturbation = "MODEL"
config = Config()
char_size = torch.load("./content/char_size.pt")
word_size = torch.load("./content/word_size.pt")
embed_size = torch.load("./content/embed_size.pt")
word_embed = torch.load("./content/word_embed.pt")
model = BiDAF(
config,
char_size,
word_size,
embed_size,
word_embed,
    private=(perturbation == "MODEL"),
)
from torch import optim
# optimizer = optim.Adadelta(model.parameters(), lr=0.5)
optimizer = optim.SGD(model.parameters(), lr=0.5)
from opacus import PrivacyEngine
privacy_engine = PrivacyEngine()
# FAKE DATALOADER (Because make_private needs one)
from torchtext.datasets import IMDB
dataloader = DataLoader(IMDB(), batch_size=1)
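# Assumption (not stated in the gist): make_private only needs this loader to
# set up Poisson sampling and the sample rate used by the privacy accountant;
# for real training, the actual training DataLoader should be passed instead.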
model, optimizer, dataloader = privacy_engine.make_private(
    module=model,
    optimizer=optimizer,
    data_loader=dataloader,
    noise_multiplier=config.model_noise,
    max_grad_norm=config.grad_clipping,
)
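# In the Opacus 1.x API used here, make_private returns the model wrapped in a
# GradSampleModule (which records per-sample gradients), a DPOptimizer that
# clips each per-sample gradient to max_grad_norm and adds Gaussian noise
# scaled by noise_multiplier on step(), and a Poisson-sampling DataLoader.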
print(
    "Model parameters (total): {}".format(
        sum(p.numel() for p in model.parameters())
    )
)
print(
    "Model parameters (requires_grad=True): {}".format(
        sum(p.numel() for p in model.parameters() if p.requires_grad)
    )
)
batch = torch.load("./content/batch.pt")
for el in batch:
print(el.shape)
p1, p2 = model(batch)
criterion = nn.CrossEntropyLoss()
loss = criterion(p1, batch[-2]) + criterion(p2, batch[-1])
loss.backward()
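# After backward(), the GradSampleModule has stored a per-sample gradient in
# p.grad_sample for every trainable parameter; the loop below prints their
# shapes (frozen parameters such as the GloVe embeddings show None).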
for n, p in model.named_parameters():
print(
"{:50s} {}".format(
n, list(p.grad_sample.shape) if hasattr(p, "grad_sample") else None
)
)
optimizer.step() # same for virtual step
optimizer.zero_grad()
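# A minimal sketch (assumption, not part of the original gist): with the RDP
# accountant that PrivacyEngine uses by default, the privacy budget spent so
# far can be queried after the optimizer has taken steps, e.g.:
epsilon = privacy_engine.get_epsilon(delta=config.delta)
print("Privacy spent: epsilon = {:.2f} at delta = {}".format(epsilon, config.delta))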