"""
pip install --quiet transformers annoy opacus nltk torchtext spacy
"""
import json
import os
import nltk
import numpy as np
import pandas as pd
import spacy
nltk.download("punkt")
import torch
from annoy import AnnoyIndex
from torchtext import datasets
from torchtext.legacy import data
from torchtext.legacy.data import BucketIterator
from torchtext.vocab import GloVe, Vectors
from torchtext.vocab import vocab as Vocab
from tqdm.notebook import tqdm
from transformers import BertForQuestionAnswering, BertTokenizer
tqdm.pandas()
from torch.utils.data import (
BatchSampler,
DataLoader,
Dataset,
RandomSampler,
SequentialSampler,
)
class Config(object):
embed_size = 300
hidden_layers = 3
input_dim = 1150
hidden_dim = 575
# hidden_size = 32
bidirectional = True
output_size = 2 # 4
max_epochs = 5
tie_dims = False
batch_size = 64
max_length = None
vocab_size = 50000
pretrained_embedding = "glove.6B.100d"
freeze = True
embed_dp = 0.05
weight_dp = 0.5
locked_dpi = 0.4
locked_dph = 0.3
locked_dpo = 0.4
lr = 1e-3 # 1e-4
betas = (0.7, 0.99) # Deep Biaffine Attention for Neural Dependency Parsing
weight_decay = 0
momentum = 0.9 # SGD only
# STEPLR
step_size = 1
gamma = 0.5
# Slanted Triangular Scheduling
num_cycles = 0.5
num_training_steps = 25000 / batch_size * max_epochs
num_warmup_steps = num_training_steps * 0.1
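    # Worked example (not in the original gist): with batch_size = 64 and
    # max_epochs = 5 this gives num_training_steps = 25000 / 64 * 5 ≈ 1953
    # optimizer steps and num_warmup_steps ≈ 195 (a 10% warmup).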
## PRIVACY (ε ≈ 8)
data_noise = 10
model_noise = 0.45
label_noise = 0.05 # 1 / epsilon
delta = 1e-5 # 1 / len(data)
rdp_alphas = [1 + x / 10.0 for x in range(1, 100)] + list(range(11, 100))
grad_clipping = 1.0
grad_accumulation = 1
# PATE
query_dataset = 500 # [100, 500, 1000]
teacher_count = 2 # 100 # [10, 100, 250]
teacher_epochs = 1 # 5
teacher_batch_size = 32
teacher_intervall = 1
student_count = 2
student_epochs = 15
student_batch_size = 32
student_intervall = 1
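# A minimal sketch (assumption, not wired up anywhere in this gist): the
# warmup/cycle fields above match the signature of the Hugging Face scheduler
# helpers, e.g.
#
#   from transformers import get_cosine_schedule_with_warmup
#   scheduler = get_cosine_schedule_with_warmup(
#       optimizer,
#       num_warmup_steps=int(Config.num_warmup_steps),
#       num_training_steps=int(Config.num_training_steps),
#       num_cycles=Config.num_cycles,
#   )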
import torch
import torch.nn as nn
import torch.nn.functional as F
from opacus.layers import DPLSTM as LSTM
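# Opacus cannot compute per-sample gradients through the fused cuDNN nn.LSTM,
# so DPLSTM re-implements the LSTM in a per-sample-gradient-friendly way and
# is used as a drop-in replacement whenever private=True below.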
class BiDAF(nn.Module):
def __init__(
self, config, char_size, word_size, embed_size, embedding, private=False
):
super(BiDAF, self).__init__()
self.config = config
# 1. Character Embedding Layer
self.char_emb = nn.Embedding(char_size, 8, padding_idx=1)
nn.init.uniform_(self.char_emb.weight, -0.001, 0.001)
self.char_conv = nn.Sequential(nn.Conv2d(1, 100, (8, 5)), nn.ReLU())
# 2. Word Embedding Layer
# initialize word embedding with GloVe
self.word_emb = nn.Embedding(word_size, embed_size).from_pretrained(
embedding, freeze=self.config.freeze
)
# highway network
# assert self.args.hidden_size * 2 == (self.args.char_channel_size + self.args.word_dim)
for i in range(2):
setattr(
self,
"highway_linear{}".format(i),
nn.Sequential(nn.Linear(100 * 2, 100 * 2), nn.ReLU()),
)
setattr(
self,
"highway_gate{}".format(i),
nn.Sequential(nn.Linear(100 * 2, 100 * 2), nn.Sigmoid()),
            )
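        # The two highway layers above gate the concatenated char-CNN and word
        # embedding channels: y = g * ReLU(W_h x) + (1 - g) * x with
        # g = sigmoid(W_g x); see highway_network() in forward().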
# 3. Contextual Embedding Layer
## add dropout
        if private:
self.context_LSTM = LSTM(
input_size=100 * 2,
hidden_size=100,
bidirectional=True,
batch_first=True,
)
        else:
self.context_LSTM = nn.LSTM(
input_size=100 * 2,
hidden_size=100,
bidirectional=True,
batch_first=True,
)
# 4. Attention Flow Layer
self.att_weight_c = nn.Linear(100 * 2, 1)
self.att_weight_q = nn.Linear(100 * 2, 1)
self.att_weight_cq = nn.Linear(100 * 2, 1)
# 5. Modeling Layer
        if private:
self.modeling_LSTM1 = LSTM(
input_size=100 * 8,
hidden_size=100,
bidirectional=True,
batch_first=True,
)
self.modeling_LSTM2 = LSTM(
input_size=100 * 2,
hidden_size=100,
bidirectional=True,
batch_first=True,
)
        else:
self.modeling_LSTM1 = nn.LSTM(
input_size=100 * 8,
hidden_size=100,
bidirectional=True,
batch_first=True,
)
self.modeling_LSTM2 = nn.LSTM(
input_size=100 * 2,
hidden_size=100,
bidirectional=True,
batch_first=True,
)
# 6. Output Layer
self.p1_weight_g = nn.Linear(100 * 8, 1)
self.p1_weight_m = nn.Linear(100 * 2, 1)
self.p2_weight_g = nn.Linear(100 * 8, 1)
self.p2_weight_m = nn.Linear(100 * 2, 1)
        if private:
self.output_LSTM = LSTM(
input_size=100 * 2,
hidden_size=100,
bidirectional=True,
batch_first=True,
)
        else:
self.output_LSTM = nn.LSTM(
input_size=100 * 2,
hidden_size=100,
bidirectional=True,
batch_first=True,
)
self.dropout = nn.Dropout(p=0.2)
self.softmax = nn.Softmax(dim=1)
def forward(self, batch):
def char_emb_layer(x):
"""
:param x: (batch, seq_len, word_len)
:return: (batch, seq_len, char_channel_size)
"""
batch_size = x.size(0)
# (batch, seq_len, word_len, char_dim)
x = self.dropout(self.char_emb(x))
# (batch, seq_len, char_dim, word_len)
x = x.transpose(2, 3)
# (batch * seq_len, 1, char_dim, word_len)
x = x.view(-1, 8, x.size(3)).unsqueeze(1)
# (batch * seq_len, char_channel_size, 1, conv_len) -> (batch * seq_len, char_channel_size, conv_len)
x = self.char_conv(x).squeeze()
# (batch * seq_len, char_channel_size, 1) -> (batch * seq_len, char_channel_size)
x = F.max_pool1d(x, x.size(2)).squeeze()
# (batch, seq_len, char_channel_size)
x = x.view(batch_size, -1, 100)
return x
def highway_network(x1, x2):
"""
:param x1: (batch, seq_len, char_channel_size)
:param x2: (batch, seq_len, word_dim)
:return: (batch, seq_len, hidden_size * 2)
"""
# (batch, seq_len, char_channel_size + word_dim)
x = torch.cat([x1, x2], dim=-1)
for i in range(2):
h = getattr(self, "highway_linear{}".format(i))(x)
g = getattr(self, "highway_gate{}".format(i))(x)
x = g * h + (1 - g) * x
# (batch, seq_len, hidden_size * 2)
return x
def att_flow_layer(c, q):
"""
:param c: (batch, c_len, hidden_size * 2)
:param q: (batch, q_len, hidden_size * 2)
:return: (batch, c_len, q_len)
"""
c_len = c.size(1)
q_len = q.size(1)
# (batch, c_len, q_len, hidden_size * 2)
# c_tiled = c.unsqueeze(2).expand(-1, -1, q_len, -1)
# (batch, c_len, q_len, hidden_size * 2)
# q_tiled = q.unsqueeze(1).expand(-1, c_len, -1, -1)
# (batch, c_len, q_len, hidden_size * 2)
# cq_tiled = c_tiled * q_tiled
# cq_tiled = c.unsqueeze(2).expand(-1, -1, q_len, -1) * q.unsqueeze(1).expand(-1, c_len, -1, -1)
cq = []
for i in range(q_len):
# (batch, 1, hidden_size * 2)
qi = q.select(1, i).unsqueeze(1)
# (batch, c_len, 1)
ci = self.att_weight_cq(c * qi).squeeze()
cq.append(ci)
# (batch, c_len, q_len)
cq = torch.stack(cq, dim=-1)
# (batch, c_len, q_len)
s = (
self.att_weight_c(c).expand(-1, -1, q_len)
+ self.att_weight_q(q).permute(0, 2, 1).expand(-1, c_len, -1)
+ cq
)
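            # Together, the three linear terms implement the BiDAF similarity
            # S_ij = w^T [c_i; q_j; c_i * q_j] without materializing the full
            # (batch, c_len, q_len, hidden_size * 2) tensor.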
# (batch, c_len, q_len)
a = F.softmax(s, dim=2)
# (batch, c_len, q_len) * (batch, q_len, hidden_size * 2) -> (batch, c_len, hidden_size * 2)
c2q_att = torch.bmm(a, q)
# (batch, 1, c_len)
b = F.softmax(torch.max(s, dim=2)[0], dim=1).unsqueeze(1)
# (batch, 1, c_len) * (batch, c_len, hidden_size * 2) -> (batch, hidden_size * 2)
q2c_att = torch.bmm(b, c).squeeze()
# (batch, c_len, hidden_size * 2) (tiled)
q2c_att = q2c_att.unsqueeze(1).expand(-1, c_len, -1)
# q2c_att = torch.stack([q2c_att] * c_len, dim=1)
# (batch, c_len, hidden_size * 8)
x = torch.cat([c, c2q_att, c * c2q_att, c * q2c_att], dim=-1)
return x
        def output_layer(g, m, l):
            """
            :param g: (batch, c_len, hidden_size * 8)
            :param m: (batch, c_len, hidden_size * 2)
            :param l: (batch,) context lengths
            :return: p1: (batch, c_len), p2: (batch, c_len)
            """
# (batch, c_len)
p1 = (self.p1_weight_g(g) + self.p1_weight_m(m)).squeeze()
# (batch, c_len, hidden_size * 2)
x, x_len = m, l
x = self.dropout(x)
x_len_sorted, x_idx = torch.sort(x_len, descending=True)
x_sorted = x.index_select(dim=0, index=x_idx)
_, x_ori_idx = torch.sort(x_idx)
x_packed = nn.utils.rnn.pack_padded_sequence(
x_sorted, x_len_sorted.cpu(), batch_first=True
)
x_packed, (h, _) = self.output_LSTM(x_packed)
x = nn.utils.rnn.pad_packed_sequence(x_packed, batch_first=True)[0]
x = x.index_select(dim=0, index=x_ori_idx)
h = (
h.permute(1, 0, 2)
.contiguous()
.view(-1, h.size(0) * h.size(2))
.squeeze()
)
h = h.index_select(dim=0, index=x_ori_idx)
m2 = x # m2 = self.output_LSTM((m, l))[0]
# (batch, c_len)
p2 = (self.p2_weight_g(g) + self.p2_weight_m(m2)).squeeze()
return p1, p2 # self.softmax(p1), self.softmax(p2)
# 1. Character Embedding Layer
# print('char_emb_layer - c')
c_char = char_emb_layer(batch[0])
# c_char = char_emb_layer(batch.context_char)
# print(c_char.shape)
# print('char_emb_layer - q')
q_char = char_emb_layer(batch[3])
# q_char = char_emb_layer(batch.question_char)
# print(q_char.shape)
# 2. Word Embedding Layer
# print('word_emb_layer - c')
c_word = self.word_emb(batch[1])
# c_word = self.word_emb(batch.context_word[0])
# print(c_word.shape)
# print('word_emb_layer - q')
q_word = self.word_emb(batch[4])
# q_word = self.word_emb(batch.question_word[0])
# print(q_word.shape)
c_lens = batch[2]
q_lens = batch[5]
# c_lens = batch.context_word[1]
# q_lens = batch.question_word[1]
        # Highway network (combine char- and word-level information)
# print('highway_network - c')
c = highway_network(c_char, c_word)
# print(c.shape)
# print('highway_network - q')
q = highway_network(q_char, q_word)
# print(q.shape)
# 3. Contextual Embedding Layer
# print('Contextual Embedding - c')
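        # Each (DP)LSTM call below follows the same pattern: sort sequences by
        # length, pack them, run the LSTM, pad the output back, and restore the
        # original batch order via x_ori_idx.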
x, x_len = c, c_lens
x_len_sorted, x_idx = torch.sort(x_len, descending=True)
x_sorted = x.index_select(dim=0, index=x_idx)
_, x_ori_idx = torch.sort(x_idx)
x_packed = nn.utils.rnn.pack_padded_sequence(
x_sorted, x_len_sorted.cpu(), batch_first=True
)
x_packed, (h, _) = self.context_LSTM(x_packed)
x = nn.utils.rnn.pad_packed_sequence(x_packed, batch_first=True)[0]
x = x.index_select(dim=0, index=x_ori_idx)
h = h.permute(1, 0, 2).contiguous().view(-1, h.size(0) * h.size(2)).squeeze()
h = h.index_select(dim=0, index=x_ori_idx)
c = x # c = self.context_LSTM((c, c_lens))[0]
# print(c.shape)
# print('Contextual Embedding - q')
x, x_len = q, q_lens
x = self.dropout(x)
x_len_sorted, x_idx = torch.sort(x_len, descending=True)
x_sorted = x.index_select(dim=0, index=x_idx)
_, x_ori_idx = torch.sort(x_idx)
x_packed = nn.utils.rnn.pack_padded_sequence(
x_sorted, x_len_sorted.cpu(), batch_first=True
)
x_packed, (h, _) = self.context_LSTM(x_packed)
x = nn.utils.rnn.pad_packed_sequence(x_packed, batch_first=True)[0]
x = x.index_select(dim=0, index=x_ori_idx)
h = h.permute(1, 0, 2).contiguous().view(-1, h.size(0) * h.size(2)).squeeze()
h = h.index_select(dim=0, index=x_ori_idx)
q = x # q = self.context_LSTM((q, q_lens))[0]
# print(q.shape)
# 4. Attention Flow Layer
# print('Attention Flow - c and q')
g = att_flow_layer(c, q)
# print(g.shape)
# 5. Modeling Layer
# print('Modeling Layer - c and q')
x, x_len = g, c_lens
x = self.dropout(x)
x_len_sorted, x_idx = torch.sort(x_len, descending=True)
x_sorted = x.index_select(dim=0, index=x_idx)
_, x_ori_idx = torch.sort(x_idx)
x_packed = nn.utils.rnn.pack_padded_sequence(
x_sorted, x_len_sorted.cpu(), batch_first=True
)
x_packed, (h, _) = self.modeling_LSTM1(x_packed)
x = nn.utils.rnn.pad_packed_sequence(x_packed, batch_first=True)[0]
x = x.index_select(dim=0, index=x_ori_idx)
h = h.permute(1, 0, 2).contiguous().view(-1, h.size(0) * h.size(2)).squeeze()
h = h.index_select(dim=0, index=x_ori_idx)
m = x
x, x_len = m, c_lens
x = self.dropout(x)
x_len_sorted, x_idx = torch.sort(x_len, descending=True)
x_sorted = x.index_select(dim=0, index=x_idx)
_, x_ori_idx = torch.sort(x_idx)
x_packed = nn.utils.rnn.pack_padded_sequence(
x_sorted, x_len_sorted.cpu(), batch_first=True
)
x_packed, (h, _) = self.modeling_LSTM2(x_packed)
x = nn.utils.rnn.pad_packed_sequence(x_packed, batch_first=True)[0]
x = x.index_select(dim=0, index=x_ori_idx)
h = h.permute(1, 0, 2).contiguous().view(-1, h.size(0) * h.size(2)).squeeze()
h = h.index_select(dim=0, index=x_ori_idx)
m = x
# print(m.shape)
# 6. Output Layer
# print('Output Layer')
p1, p2 = output_layer(g, m, c_lens)
# print(p1.shape)
# print(p2.shape)
# (batch, c_len), (batch, c_len)
return p1, p2
architecture = None
perturbation = "MODEL"
config = Config()
char_size = torch.load("./content/char_size.pt")
word_size = torch.load("./content/word_size.pt")
embed_size = torch.load("./content/embed_size.pt")
word_embed = torch.load("./content/word_embed.pt")
model = BiDAF(
config,
char_size,
word_size,
embed_size,
word_embed,
    private=(perturbation == "MODEL"),
)
from torch import optim
# optimizer = optim.Adadelta(model.parameters(), lr=0.5)
optimizer = optim.SGD(model.parameters(), lr=0.5)
from opacus import PrivacyEngine
privacy_engine = PrivacyEngine()
# FAKE DATALOADER (Because make_private needs one)
from torchtext.datasets import IMDB
dataloader = DataLoader(IMDB(), batch_size=1)
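# Assumption (not stated in the gist): make_private only needs this loader to
# set up Poisson sampling and the sample rate used by the privacy accountant;
# for real training, the actual training DataLoader should be passed instead.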
model, optimizer, dataloader = privacy_engine.make_private(
    module=model,
    optimizer=optimizer,
    data_loader=dataloader,
    noise_multiplier=config.model_noise,
    max_grad_norm=config.grad_clipping,
)
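# In the Opacus 1.x API used here, make_private returns the model wrapped in a
# GradSampleModule (which records per-sample gradients), a DPOptimizer that
# clips each per-sample gradient to max_grad_norm and adds Gaussian noise
# scaled by noise_multiplier on step(), and a Poisson-sampling DataLoader.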
print(
    "Model parameters (total): {}".format(
        sum(p.numel() for p in model.parameters())
    )
)
print(
    "Model parameters (requires_grad=True): {}".format(
        sum(p.numel() for p in model.parameters() if p.requires_grad)
    )
)
batch = torch.load("./content/batch.pt")
for el in batch:
print(el.shape)
p1, p2 = model(batch)
criterion = nn.CrossEntropyLoss()
loss = criterion(p1, batch[-2]) + criterion(p2, batch[-1])
loss.backward()
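# After backward(), the GradSampleModule has stored a per-sample gradient in
# p.grad_sample for every trainable parameter; the loop below prints their
# shapes (frozen parameters such as the GloVe embeddings show None).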
for n, p in model.named_parameters():
print(
"{:50s} {}".format(
n, list(p.grad_sample.shape) if hasattr(p, "grad_sample") else None
)
)
optimizer.step() # same for virtual step
optimizer.zero_grad()
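# A minimal sketch (assumption, not part of the original gist): with the RDP
# accountant that PrivacyEngine uses by default, the privacy budget spent so
# far can be queried after the optimizer has taken steps, e.g.:
epsilon = privacy_engine.get_epsilon(delta=config.delta)
print("Privacy spent: epsilon = {:.2f} at delta = {}".format(epsilon, config.delta))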