""" | |
pip install --quiet transformers annoy opacus nltk torchtext spacy | |
""" | |
import json
import os

import nltk
import numpy as np
import pandas as pd
import spacy

nltk.download("punkt")

import torch
from annoy import AnnoyIndex
from torchtext import datasets
from torchtext.legacy import data
from torchtext.legacy.data import BucketIterator
from torchtext.vocab import GloVe, Vectors
from torchtext.vocab import vocab as Vocab
from tqdm.notebook import tqdm
from transformers import BertForQuestionAnswering, BertTokenizer

tqdm.pandas()

from torch.utils.data import (
    BatchSampler,
    DataLoader,
    Dataset,
    RandomSampler,
    SequentialSampler,
)
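# The torchtext.legacy imports require torchtext < 0.12; the legacy namespace
# was removed in 0.12, so newer releases fail on these imports.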
class Config(object):
    embed_size = 300
    hidden_layers = 3
    input_dim = 1150
    hidden_dim = 575
    # hidden_size = 32
    bidirectional = True
    output_size = 2  # 4
    max_epochs = 5
    tie_dims = False
    batch_size = 64
    max_length = None
    vocab_size = 50000
    pretrained_embedding = "glove.6B.100d"
    freeze = True
    embed_dp = 0.05
    weight_dp = 0.5
    locked_dpi = 0.4
    locked_dph = 0.3
    locked_dpo = 0.4

    lr = 1e-3  # 1e-4
    betas = (0.7, 0.99)  # Deep Biaffine Attention for Neural Dependency Parsing
    weight_decay = 0
    momentum = 0.9  # SGD only

    # StepLR
    step_size = 1
    gamma = 0.5

    # Slanted triangular scheduling
    num_cycles = 0.5
    num_training_steps = 25000 / batch_size * max_epochs
    num_warmup_steps = num_training_steps * 0.1

    ## PRIVACY (ε ≈ 8)
    data_noise = 10
    model_noise = 0.45
    label_noise = 0.05  # 1 / epsilon
    delta = 1e-5  # 1 / len(data)
    rdp_alphas = [1 + x / 10.0 for x in range(1, 100)] + list(range(11, 100))
    grad_clipping = 1.0
    grad_accumulation = 1

    # PATE
    query_dataset = 500  # [100, 500, 1000]
    teacher_count = 2  # 100  # [10, 100, 250]
    teacher_epochs = 1  # 5
    teacher_batch_size = 32
    teacher_intervall = 1
    student_count = 2
    student_epochs = 15
    student_batch_size = 32
    student_intervall = 1
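# A minimal sketch of how the warmup fields above could be consumed, assuming
# the Hugging Face `transformers` scheduler helpers are available (see the pip
# line at the top); it stays commented out because `optimizer` is only created
# further below.
#
#   from transformers import get_cosine_schedule_with_warmup
#   scheduler = get_cosine_schedule_with_warmup(
#       optimizer,
#       num_warmup_steps=int(Config.num_warmup_steps),
#       num_training_steps=int(Config.num_training_steps),
#       num_cycles=Config.num_cycles,
#   )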
import torch
import torch.nn as nn
import torch.nn.functional as F
from opacus.layers import DPLSTM as LSTM
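# opacus.layers.DPLSTM is a drop-in replacement for nn.LSTM whose per-sample
# gradients Opacus can compute (the fused nn.LSTM kernels do not expose them);
# the `private` flag below selects between the two implementations.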
class BiDAF(nn.Module):
    def __init__(
        self, config, char_size, word_size, embed_size, embedding, private=False
    ):
        super(BiDAF, self).__init__()
        self.config = config

        # 1. Character Embedding Layer
        self.char_emb = nn.Embedding(char_size, 8, padding_idx=1)
        nn.init.uniform_(self.char_emb.weight, -0.001, 0.001)
        self.char_conv = nn.Sequential(nn.Conv2d(1, 100, (8, 5)), nn.ReLU())

        # 2. Word Embedding Layer
        # initialize word embedding with GloVe
        self.word_emb = nn.Embedding(word_size, embed_size).from_pretrained(
            embedding, freeze=self.config.freeze
        )

        # highway network
        # assert self.args.hidden_size * 2 == (self.args.char_channel_size + self.args.word_dim)
        for i in range(2):
            setattr(
                self,
                "highway_linear{}".format(i),
                nn.Sequential(nn.Linear(100 * 2, 100 * 2), nn.ReLU()),
            )
            setattr(
                self,
                "highway_gate{}".format(i),
                nn.Sequential(nn.Linear(100 * 2, 100 * 2), nn.Sigmoid()),
            )
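        # Each highway layer computes x <- g * relu(W_h x) + (1 - g) * x with
        # gate g = sigmoid(W_g x), mixing transformed and untouched char+word
        # features (applied in highway_network() inside forward()).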
        # 3. Contextual Embedding Layer
        ## add dropout
        if private:
            self.context_LSTM = LSTM(
                input_size=100 * 2,
                hidden_size=100,
                bidirectional=True,
                batch_first=True,
            )
        else:
            self.context_LSTM = nn.LSTM(
                input_size=100 * 2,
                hidden_size=100,
                bidirectional=True,
                batch_first=True,
            )
        # 4. Attention Flow Layer
        self.att_weight_c = nn.Linear(100 * 2, 1)
        self.att_weight_q = nn.Linear(100 * 2, 1)
        self.att_weight_cq = nn.Linear(100 * 2, 1)

        # 5. Modeling Layer
        if private:
            self.modeling_LSTM1 = LSTM(
                input_size=100 * 8,
                hidden_size=100,
                bidirectional=True,
                batch_first=True,
            )
            self.modeling_LSTM2 = LSTM(
                input_size=100 * 2,
                hidden_size=100,
                bidirectional=True,
                batch_first=True,
            )
        else:
            self.modeling_LSTM1 = nn.LSTM(
                input_size=100 * 8,
                hidden_size=100,
                bidirectional=True,
                batch_first=True,
            )
            self.modeling_LSTM2 = nn.LSTM(
                input_size=100 * 2,
                hidden_size=100,
                bidirectional=True,
                batch_first=True,
            )
        # 6. Output Layer
        self.p1_weight_g = nn.Linear(100 * 8, 1)
        self.p1_weight_m = nn.Linear(100 * 2, 1)
        self.p2_weight_g = nn.Linear(100 * 8, 1)
        self.p2_weight_m = nn.Linear(100 * 2, 1)
        if private:
            self.output_LSTM = LSTM(
                input_size=100 * 2,
                hidden_size=100,
                bidirectional=True,
                batch_first=True,
            )
        else:
            self.output_LSTM = nn.LSTM(
                input_size=100 * 2,
                hidden_size=100,
                bidirectional=True,
                batch_first=True,
            )

        self.dropout = nn.Dropout(p=0.2)
        self.softmax = nn.Softmax(dim=1)
    def forward(self, batch):
        def char_emb_layer(x):
            """
            :param x: (batch, seq_len, word_len)
            :return: (batch, seq_len, char_channel_size)
            """
            batch_size = x.size(0)
            # (batch, seq_len, word_len, char_dim)
            x = self.dropout(self.char_emb(x))
            # (batch, seq_len, char_dim, word_len)
            x = x.transpose(2, 3)
            # (batch * seq_len, 1, char_dim, word_len)
            x = x.view(-1, 8, x.size(3)).unsqueeze(1)
            # (batch * seq_len, char_channel_size, 1, conv_len) -> (batch * seq_len, char_channel_size, conv_len)
            x = self.char_conv(x).squeeze()
            # (batch * seq_len, char_channel_size, 1) -> (batch * seq_len, char_channel_size)
            x = F.max_pool1d(x, x.size(2)).squeeze()
            # (batch, seq_len, char_channel_size)
            x = x.view(batch_size, -1, 100)
            return x
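        # char_emb_layer implements the char-CNN of the BiDAF paper: 8-dim
        # character embeddings are convolved with 100 filters of width 5 and
        # max-pooled over time, yielding one 100-dim vector per word.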
        def highway_network(x1, x2):
            """
            :param x1: (batch, seq_len, char_channel_size)
            :param x2: (batch, seq_len, word_dim)
            :return: (batch, seq_len, hidden_size * 2)
            """
            # (batch, seq_len, char_channel_size + word_dim)
            x = torch.cat([x1, x2], dim=-1)
            for i in range(2):
                h = getattr(self, "highway_linear{}".format(i))(x)
                g = getattr(self, "highway_gate{}".format(i))(x)
                x = g * h + (1 - g) * x
            # (batch, seq_len, hidden_size * 2)
            return x
        def att_flow_layer(c, q):
            """
            :param c: (batch, c_len, hidden_size * 2)
            :param q: (batch, q_len, hidden_size * 2)
            :return: (batch, c_len, q_len)
            """
            c_len = c.size(1)
            q_len = q.size(1)

            # (batch, c_len, q_len, hidden_size * 2)
            # c_tiled = c.unsqueeze(2).expand(-1, -1, q_len, -1)
            # (batch, c_len, q_len, hidden_size * 2)
            # q_tiled = q.unsqueeze(1).expand(-1, c_len, -1, -1)
            # (batch, c_len, q_len, hidden_size * 2)
            # cq_tiled = c_tiled * q_tiled
            # cq_tiled = c.unsqueeze(2).expand(-1, -1, q_len, -1) * q.unsqueeze(1).expand(-1, c_len, -1, -1)

            cq = []
            for i in range(q_len):
                # (batch, 1, hidden_size * 2)
                qi = q.select(1, i).unsqueeze(1)
                # (batch, c_len, 1)
                ci = self.att_weight_cq(c * qi).squeeze()
                cq.append(ci)
            # (batch, c_len, q_len)
            cq = torch.stack(cq, dim=-1)

            # (batch, c_len, q_len)
            s = (
                self.att_weight_c(c).expand(-1, -1, q_len)
                + self.att_weight_q(q).permute(0, 2, 1).expand(-1, c_len, -1)
                + cq
            )

            # (batch, c_len, q_len)
            a = F.softmax(s, dim=2)
            # (batch, c_len, q_len) * (batch, q_len, hidden_size * 2) -> (batch, c_len, hidden_size * 2)
            c2q_att = torch.bmm(a, q)
            # (batch, 1, c_len)
            b = F.softmax(torch.max(s, dim=2)[0], dim=1).unsqueeze(1)
            # (batch, 1, c_len) * (batch, c_len, hidden_size * 2) -> (batch, hidden_size * 2)
            q2c_att = torch.bmm(b, c).squeeze()
            # (batch, c_len, hidden_size * 2) (tiled)
            q2c_att = q2c_att.unsqueeze(1).expand(-1, c_len, -1)
            # q2c_att = torch.stack([q2c_att] * c_len, dim=1)
            # (batch, c_len, hidden_size * 8)
            x = torch.cat([c, c2q_att, c * c2q_att, c * q2c_att], dim=-1)
            return x
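        # att_flow_layer follows the BiDAF paper: similarity
        # S[t, j] = w_c . c_t + w_q . q_j + w_cq . (c_t * q_j); context-to-query
        # attention softmaxes S over the question axis, query-to-context
        # attention softmaxes max_j S[t, j] over the context axis, and the
        # result is the concatenation [c; c2q; c * c2q; c * q2c].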
        def output_layer(g, m, l):
            """
            :param g: (batch, c_len, hidden_size * 8)
            :param m: (batch, c_len, hidden_size * 2)
            :return: p1: (batch, c_len), p2: (batch, c_len)
            """
            # (batch, c_len)
            p1 = (self.p1_weight_g(g) + self.p1_weight_m(m)).squeeze()

            # (batch, c_len, hidden_size * 2)
            x, x_len = m, l
            x = self.dropout(x)
            x_len_sorted, x_idx = torch.sort(x_len, descending=True)
            x_sorted = x.index_select(dim=0, index=x_idx)
            _, x_ori_idx = torch.sort(x_idx)
            x_packed = nn.utils.rnn.pack_padded_sequence(
                x_sorted, x_len_sorted.cpu(), batch_first=True
            )
            x_packed, (h, _) = self.output_LSTM(x_packed)
            x = nn.utils.rnn.pad_packed_sequence(x_packed, batch_first=True)[0]
            x = x.index_select(dim=0, index=x_ori_idx)
            h = (
                h.permute(1, 0, 2)
                .contiguous()
                .view(-1, h.size(0) * h.size(2))
                .squeeze()
            )
            h = h.index_select(dim=0, index=x_ori_idx)
            m2 = x  # m2 = self.output_LSTM((m, l))[0]

            # (batch, c_len)
            p2 = (self.p2_weight_g(g) + self.p2_weight_m(m2)).squeeze()
            return p1, p2  # self.softmax(p1), self.softmax(p2)
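        # p1 and p2 are unnormalized start- and end-position logits over the
        # context; the cross-entropy loss in the training step below applies
        # the softmax, which is why the commented-out self.softmax(...) return
        # is unused.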
        # 1. Character Embedding Layer
        c_char = char_emb_layer(batch[0])
        # c_char = char_emb_layer(batch.context_char)
        q_char = char_emb_layer(batch[3])
        # q_char = char_emb_layer(batch.question_char)

        # 2. Word Embedding Layer
        c_word = self.word_emb(batch[1])
        # c_word = self.word_emb(batch.context_word[0])
        q_word = self.word_emb(batch[4])
        # q_word = self.word_emb(batch.question_word[0])
        c_lens = batch[2]
        q_lens = batch[5]
        # c_lens = batch.context_word[1]
        # q_lens = batch.question_word[1]

        # Highway network (combine char- and word-level information)
        c = highway_network(c_char, c_word)
        q = highway_network(q_char, q_word)
        # 3. Contextual Embedding Layer
        x, x_len = c, c_lens
        x = self.dropout(x)
        x_len_sorted, x_idx = torch.sort(x_len, descending=True)
        x_sorted = x.index_select(dim=0, index=x_idx)
        _, x_ori_idx = torch.sort(x_idx)
        x_packed = nn.utils.rnn.pack_padded_sequence(
            x_sorted, x_len_sorted.cpu(), batch_first=True
        )
        x_packed, (h, _) = self.context_LSTM(x_packed)
        x = nn.utils.rnn.pad_packed_sequence(x_packed, batch_first=True)[0]
        x = x.index_select(dim=0, index=x_ori_idx)
        h = h.permute(1, 0, 2).contiguous().view(-1, h.size(0) * h.size(2)).squeeze()
        h = h.index_select(dim=0, index=x_ori_idx)
        c = x  # c = self.context_LSTM((c, c_lens))[0]

        x, x_len = q, q_lens
        x = self.dropout(x)
        x_len_sorted, x_idx = torch.sort(x_len, descending=True)
        x_sorted = x.index_select(dim=0, index=x_idx)
        _, x_ori_idx = torch.sort(x_idx)
        x_packed = nn.utils.rnn.pack_padded_sequence(
            x_sorted, x_len_sorted.cpu(), batch_first=True
        )
        x_packed, (h, _) = self.context_LSTM(x_packed)
        x = nn.utils.rnn.pad_packed_sequence(x_packed, batch_first=True)[0]
        x = x.index_select(dim=0, index=x_ori_idx)
        h = h.permute(1, 0, 2).contiguous().view(-1, h.size(0) * h.size(2)).squeeze()
        h = h.index_select(dim=0, index=x_ori_idx)
        q = x  # q = self.context_LSTM((q, q_lens))[0]
        # 4. Attention Flow Layer
        g = att_flow_layer(c, q)

        # 5. Modeling Layer
        x, x_len = g, c_lens
        x = self.dropout(x)
        x_len_sorted, x_idx = torch.sort(x_len, descending=True)
        x_sorted = x.index_select(dim=0, index=x_idx)
        _, x_ori_idx = torch.sort(x_idx)
        x_packed = nn.utils.rnn.pack_padded_sequence(
            x_sorted, x_len_sorted.cpu(), batch_first=True
        )
        x_packed, (h, _) = self.modeling_LSTM1(x_packed)
        x = nn.utils.rnn.pad_packed_sequence(x_packed, batch_first=True)[0]
        x = x.index_select(dim=0, index=x_ori_idx)
        h = h.permute(1, 0, 2).contiguous().view(-1, h.size(0) * h.size(2)).squeeze()
        h = h.index_select(dim=0, index=x_ori_idx)
        m = x

        x, x_len = m, c_lens
        x = self.dropout(x)
        x_len_sorted, x_idx = torch.sort(x_len, descending=True)
        x_sorted = x.index_select(dim=0, index=x_idx)
        _, x_ori_idx = torch.sort(x_idx)
        x_packed = nn.utils.rnn.pack_padded_sequence(
            x_sorted, x_len_sorted.cpu(), batch_first=True
        )
        x_packed, (h, _) = self.modeling_LSTM2(x_packed)
        x = nn.utils.rnn.pad_packed_sequence(x_packed, batch_first=True)[0]
        x = x.index_select(dim=0, index=x_ori_idx)
        h = h.permute(1, 0, 2).contiguous().view(-1, h.size(0) * h.size(2)).squeeze()
        h = h.index_select(dim=0, index=x_ori_idx)
        m = x

        # 6. Output Layer
        p1, p2 = output_layer(g, m, c_lens)

        # (batch, c_len), (batch, c_len)
        return p1, p2
architecture = None
perturbation = "MODEL"
config = Config()

char_size = torch.load("./content/char_size.pt")
word_size = torch.load("./content/word_size.pt")
embed_size = torch.load("./content/embed_size.pt")
word_embed = torch.load("./content/word_embed.pt")

model = BiDAF(
    config,
    char_size,
    word_size,
    embed_size,
    word_embed,
    private=(perturbation == "MODEL"),
)
from torch import optim

# optimizer = optim.Adadelta(model.parameters(), lr=0.5)
optimizer = optim.SGD(model.parameters(), lr=0.5)

from opacus import PrivacyEngine

privacy_engine = PrivacyEngine()

# FAKE DATALOADER (because make_private needs one)
from torchtext.datasets import IMDB

dataloader = DataLoader(IMDB(split="train"), batch_size=1)
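# make_private only reads the batch size and dataset length from this loader
# (the sampling rate the RDP accountant needs) and wraps it in a DPDataLoader;
# the actual batch below is loaded from disk instead.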
model, optimizer, dataloader = privacy_engine.make_private(
    module=model,
    optimizer=optimizer,
    data_loader=dataloader,
    noise_multiplier=config.model_noise,
    max_grad_norm=config.grad_clipping,
)
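# make_private returns the model wrapped in a GradSampleModule (which records
# per-sample gradients in p.grad_sample), the optimizer wrapped in a DPOptimizer
# (which clips each per-sample gradient to max_grad_norm and adds Gaussian noise
# scaled by noise_multiplier), and a Poisson-sampling DPDataLoader.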
print(
    "Model parameters (total): {}".format(
        sum(p.numel() for p in model.parameters())
    )
)
print(
    "Model parameters (requires_grad=True): {}".format(
        sum(p.numel() for p in model.parameters() if p.requires_grad)
    )
)

batch = torch.load("./content/batch.pt")
for el in batch:
    print(el.shape)

p1, p2 = model(batch)
criterion = nn.CrossEntropyLoss()
loss = criterion(p1, batch[-2]) + criterion(p2, batch[-1])
loss.backward()

for n, p in model.named_parameters():
    print(
        "{:50s} {}".format(
            n, list(p.grad_sample.shape) if hasattr(p, "grad_sample") else None
        )
    )

optimizer.step()  # same for virtual step
optimizer.zero_grad()
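# A minimal follow-up sketch, assuming Opacus >= 1.0: the accountant attached
# by make_private can report the privacy budget spent after the step above,
# using the delta from Config.
epsilon = privacy_engine.get_epsilon(delta=config.delta)
print("Privacy spent so far: epsilon = {:.2f} at delta = {}".format(epsilon, config.delta))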