# LSTM-based sequence model over network-flow sessions: for each flow it
# predicts the next flow's state symbol and size symbol, and scores how
# surprising observed sessions are.
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.autograd import Variable  # legacy pre-0.4 wrapper, kept as in the original
from torch.utils import data

# Assumed to be defined elsewhere in the project (not part of this gist):
# PATH_MODEL (checkpoint directory), session_loader (Dataset over flow
# sessions), and save_checkpoint (checkpoint writer).
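# AverageMeter is referenced by validate_epoch() but not included in the gist.
# A minimal sketch of the usual running-average helper, assuming the standard
# interface (update(value, n) and an .avg attribute); the original class may differ.
class AverageMeter:
    """Tracks a running sum and weighted average of a metric."""
    def __init__(self):
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0
    def update(self, val, n=1):
        # accumulate `val` weighted by batch/sequence size `n`
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count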
# RNN definition
class LSTM_deep(nn.Module):
    def __init__(self, n_categories, seq_input_size, seq_size_input_size, batch_size, hidden_size):
        super(LSTM_deep, self).__init__()
        dropout = 0.2
        self.num_layers = num_layers = 2
        self.bidirectional = bidirectional = False
        embed_dim1 = 10
        embed_dim2 = 5
        embed_dim3 = 3
        self.hidden_size = hidden_size
        self.hidden_size2 = hidden_size2 = hidden_size
        self.cn_size = hidden_size
        #self.batch_size = batch_size
        self.seq_input_size = seq_input_size
        self.n_categories = n_categories
        ############################
        # would be better to use nn.Embedding
        self.embed_state = nn.Linear(seq_input_size, embed_dim1)
        self.embed_size = nn.Linear(seq_size_input_size, embed_dim2)
        self.embed_cat = nn.Linear(n_categories, embed_dim3)
        ############################
        self.i2h = nn.LSTM(input_size=embed_dim1 + embed_dim2 + embed_dim3,
                           hidden_size=hidden_size,
                           num_layers=num_layers, dropout=dropout, bidirectional=bidirectional)
        self.o2o = nn.Sequential(nn.Linear(hidden_size, hidden_size2), nn.ReLU(inplace=True))
        self.o2o2 = nn.Linear(hidden_size2, seq_input_size)
        self.size_o2o2 = nn.Linear(hidden_size2, seq_size_input_size)
        self.dropout = nn.Dropout(dropout)
        self.softmax = nn.LogSoftmax(dim=1)  # explicit dim avoids the implicit-dim warning

    def forward(self, category, seq_input, seq_size_input, hidden, cn, new_batch_size):
        # embed the three one-hot inputs, concatenate along the feature axis,
        # and transpose to (seq_len=1, batch, features) for the LSTM
        seq_input_combined_LSTM = torch.cat((self.embed_cat(category.view(new_batch_size, 1, self.n_categories)),
                                             self.embed_state(seq_input),
                                             self.embed_size(seq_size_input)), 2).transpose(0, 1)
        outputLSTM, (hidden, cn) = self.i2h(seq_input_combined_LSTM, (hidden, cn))
        outputLSTM = outputLSTM.view(new_batch_size, self.hidden_size)
        outputLSTM = self.dropout(outputLSTM)  # assign the result; nn.Dropout is not in-place
        output = self.o2o(outputLSTM)
        state_output = self.o2o2(output)
        size_output = self.size_o2o2(output)  # size head uses its own linear layer
        state_output = self.softmax(state_output)
        size_output = self.softmax(size_output)
        # return the two log-softmax heads, matching how callers unpack the outputs
        return state_output, size_output, hidden, cn

    def initHidden(self, new_batch_size):
        return Variable(torch.zeros(self.num_layers, new_batch_size, self.hidden_size))

    def initcn(self, new_batch_size):
        return Variable(torch.zeros(self.num_layers, new_batch_size, self.cn_size))
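# Illustrative sketch (not in the original gist): a single forward step with
# dummy one-hot tensors, to document the expected shapes. The alphabet sizes
# and hidden size below are made-up values for illustration only.
def _demo_forward_step():
    n_categories, n_states, n_sizes, hidden = 3, 12, 8, 16
    model = LSTM_deep(n_categories, seq_input_size=n_states,
                      seq_size_input_size=n_sizes, batch_size=2, hidden_size=hidden)
    batch = 2
    category = torch.zeros(batch, 1, n_categories)   # one-hot category per sequence
    seq_input = torch.zeros(batch, 1, n_states)      # one flow-state symbol per step
    seq_size_input = torch.zeros(batch, 1, n_sizes)  # one flow-size symbol per step
    h, c = model.initHidden(batch), model.initcn(batch)
    state_out, size_out, h, c = model(category, seq_input, seq_size_input, h, c, batch)
    print(state_out.shape, size_out.shape)           # (batch, n_states), (batch, n_sizes)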
def train_epoch(train_loader, model, criterion, optimizer, epoch, normal):
    """
    Train for one epoch
    """
    model.train()
    print('inside train')
    for i, (category, seq_input, seq_size_input, lengths, target, size_target, SrcAddr, session_id,
            new_batch_size) in enumerate(train_loader):
        # progressively sparser progress logging: every step below 10,
        # every 10 below 100, every 100 below 1000, then every 1000
        if i < 10 or (i < 100 and i % 10 == 0) or (i < 1000 and i % 100 == 0) or i % 1000 == 0:
            print('i={}'.format(i))
        # wrap tensors in Variable so gradients can flow (legacy pre-0.4 API)
        category = Variable(category)
        seq_input = Variable(seq_input.transpose(0, 1))
        seq_size_input = Variable(seq_size_input.transpose(0, 1))
        target = Variable(target.transpose(0, 1))
        size_target = Variable(size_target.transpose(0, 1))
        hidden = model.initHidden(new_batch_size)
        ############## LSTM addition ##############
        cn = model.initcn(new_batch_size)
        #############################################
        model.zero_grad()
        loss = 0
        # step through the session one flow at a time, carrying (hidden, cn)
        for flow in range(seq_input.size()[0]):
            ############## LSTM addition ##############
            state_output, size_output, hidden, cn = model(category, seq_input[flow], seq_size_input[flow], hidden, cn, new_batch_size)
            #############################################
            # sum the NLL losses of the state head and the size head
            loss += criterion(state_output, target[flow]) + criterion(size_output, size_target[flow])
        loss.backward()
        optimizer.step()
def validate_epoch(val_loader, model, criterion, epoch, normal):
    losses = AverageMeter()
    # switch to evaluation mode so that the Dropout doesn't drop neurons
    model.eval()
    for i, (category, seq_input, seq_size_input, lengths, target, size_target, SrcAddr, session_id,
            new_batch_size) in enumerate(val_loader):
        if i < 10 or (i < 100 and i % 10 == 0) or (i < 1000 and i % 100 == 0) or i % 1000 == 0:
            print('i={}'.format(i))
        category = Variable(category)
        seq_input = Variable(seq_input.transpose(0, 1))
        seq_size_input = Variable(seq_size_input.transpose(0, 1))
        target = Variable(target.transpose(0, 1))
        size_target = Variable(size_target.transpose(0, 1))
        hidden = model.initHidden(new_batch_size)
        ############## LSTM addition ##############
        cn = model.initcn(new_batch_size)
        #############################################
        loss = 0
        # no optimizer step in validation, so skip building the autograd graph
        with torch.no_grad():
            for flow in range(seq_input.size()[0]):
                ############## LSTM addition ##############
                state_output, size_output, hidden, cn = model(category, seq_input[flow], seq_size_input[flow], hidden, cn, new_batch_size)
                #############################################
                loss += criterion(state_output, target[flow]) + criterion(size_output, size_target[flow])
        print(loss.item())
        losses.update(loss.item(), seq_input.size()[0])
        with open('validate_log.txt', 'a') as f:
            f.write('Epoch: [{0}][{1}/{2}]\t'
                    'loss: {loss}\n'.format(epoch, i, len(val_loader), loss=loss.item()))
    return losses.avg
def predict(df, df_seq_input, categories, alphabet, size_alphabet, dataname, n_hidden=200):
    """
    Loads the trained RNN and returns prediction scores for normal and infected
    hosts in the test set, along with cumulative averages of the scores.
    """
    print('prediction starts')
    n_categories = len(categories)
    batch_size = 1
    global alphabett
    alphabett = alphabet
    global size_alphabett
    size_alphabett = size_alphabet
    checkpoint = torch.load(PATH_MODEL+dataname+'_normal_model_best_long.pth.tar')
    model = LSTM_deep(n_categories, seq_input_size=len(alphabet), seq_size_input_size=len(size_alphabet),
                      batch_size=batch_size, hidden_size=n_hidden)
    model.load_state_dict(checkpoint['state_dict'])
    pred_loader = data.DataLoader(session_loader(df_seq_input[df_seq_input['scenario']==3], alphabet, size_alphabet, categories),
                                  batch_size=batch_size, shuffle=True, collate_fn=PadSequence())
    model.eval()
    output_stats = {'SrcAddr':[], 'session_id':[], 'mean':[], 'median':[],
                    'probs':[], 'size_probs':[], 'sesslen':[], 'probsmax':[], 'Ports':[], 'Portsmax':[],
                    'size_probsmax':[], 'Sizes':[], 'Sizemax':[],
                    'meanmax':[], 'medianmax':[]}
    row_indicator = list(range(batch_size))
    for i, (category, seq_input, seq_size_input, lengths, target, size_target, SrcAddr, session_id,
            new_batch_size) in enumerate(pred_loader):
        if i < 10 or (i < 100 and i % 10 == 0) or (i < 1000 and i % 100 == 0) or i % 1000 == 0:
            print('i={}'.format(i))
        probs = []
        probsmax = []
        Ports = []
        Portsmax = ['-']
        ####################################################################
        size_probs = []
        size_probsmax = []
        Sizes = []
        Sizesmax = ['-']
        ####################################################################
        category = Variable(category)
        seq_input = Variable(seq_input.transpose(0, 1))
        seq_size_input = Variable(seq_size_input.transpose(0, 1))
        target = Variable(target.transpose(0, 1))
        size_target = Variable(size_target.transpose(0, 1))
        hidden = model.initHidden(new_batch_size)
        cn = model.initcn(new_batch_size)
        for flow in range(seq_input.size()[0]):
            state_output, size_output, hidden, cn = model(category, seq_input[flow], seq_size_input[flow], hidden, cn, new_batch_size)
            value, index = seq_input[flow].squeeze().max(0)
            size_value, size_index = seq_size_input[flow].squeeze().max(0)
            Port = alphabett[index.item()]
            # log-probability the model assigned to the flow that actually came next
            prob_next_flow = np.array(list(state_output.data[row_indicator, target.data.numpy()[flow]]))
            value, index = state_output.data[row_indicator].squeeze().max(0)
            probmax = np.exp(value.item())
            Portmax = alphabett[index.item()]
            ########################################################################################
            Size = size_alphabett[size_index.item()]
            size_prob_next_flow = np.array(list(size_output.data[row_indicator, size_target.data.numpy()[flow]]))
            size_value, size_index = size_output.data[row_indicator].squeeze().max(0)  # use the size head, not the state head
            size_probmax = np.exp(size_value.item())
            Sizemax = size_alphabett[size_index.item()]
            ########################################################################################
            probs.extend(list(1-np.exp(prob_next_flow)))  # exp because the last layer of the RNN is LogSoftmax
            probsmax.append(1-probmax)  # state score goes with the state list (was swapped with the size score)
            Ports.append(Port)
            Portsmax.append(Portmax)
            ########################################################################################
            size_probs.extend(list(1-np.exp(size_prob_next_flow)))  # exp because the last layer of the RNN is LogSoftmax
            size_probsmax.append(1-size_probmax)
            Sizes.append(Size)
            Sizesmax.append(Sizemax)  # append the value itself, not a one-element list
        Ports.append('EOS')
        output_stats['mean'].append(np.mean(np.array(probs)))
        output_stats['median'].append(np.median(np.array(probs)))
        output_stats['SrcAddr'].append(SrcAddr[0])
        output_stats['session_id'].append(session_id[0])
        output_stats['probs'].append(str(probs))
        output_stats['probsmax'].append(str(probsmax))
        output_stats['size_probs'].append(str(size_probs))
        output_stats['size_probsmax'].append(str(size_probsmax))
        output_stats['meanmax'].append(np.mean(np.array(probsmax)))
        output_stats['medianmax'].append(np.median(np.array(probsmax)))
        output_stats['Ports'].append(str(Ports))
        output_stats['Sizes'].append(str(Sizes))
        output_stats['Portsmax'].append(str(Portsmax))
        output_stats['Sizemax'].append(str(Sizesmax))
        output_stats['sesslen'].append(seq_input.size()[0])
    df_output = pd.DataFrame(data=output_stats)
    df_output = df_output.sort_values(['SrcAddr', 'session_id'])
    # for each session, also use the previous sessions of the same host
    # to compute cumulative average scores
    df_output2 = df_output.assign(sum_means = df_output.groupby('SrcAddr')['mean'].cumsum())
    df_output2 = df_output2.assign(count_previous = df_output2.groupby('SrcAddr')['mean'].cumcount()+1)
    df_output2 = df_output2.assign(sum_medians = df_output2.groupby('SrcAddr')['median'].cumsum())
    df_output2 = df_output2.assign(avg_previous_means = lambda x: x['sum_means']/x['count_previous'])
    df_output2 = df_output2.assign(avg_previous_medians = lambda x: x['sum_medians']/x['count_previous'])
    df_output2.to_csv(dataname+'_output_test.csv', index=False)
    return df_output2
def train(df_seq_input, categories, alphabet, size_alphabet, dataname, epochs=250, base_lr=0.0003, n_hidden=200, batch_size=25):
    # training hyperparameters. These will probably need to be tuned.
    global alphabett
    alphabett = alphabet
    global size_alphabett
    size_alphabett = size_alphabet
    print('Starting main')
    losses = []
    lr = base_lr
    weight_decay = 5e-4
    momentum = 0.9
    lr_freq_adj = 50
    normal = True
    n_categories = len(categories)
    best_loss = 1000000
    # define the model
    model = LSTM_deep(n_categories, seq_input_size=len(alphabet), seq_size_input_size=len(size_alphabet),
                      batch_size=batch_size, hidden_size=n_hidden)
    # define the loss function: negative log-likelihood, since this is multinomial classification
    criterion = nn.NLLLoss()
    train_loader = data.DataLoader(session_loader(df_seq_input[df_seq_input['scenario']==1], alphabet, size_alphabet, categories),
                                   batch_size=batch_size, shuffle=True, collate_fn=PadSequence())
    # test loader
    test_loader = data.DataLoader(session_loader(df_seq_input[df_seq_input['scenario']==2], alphabet, size_alphabet, categories),
                                  batch_size=batch_size, shuffle=True, collate_fn=PadSequence())
    # optimizer: stochastic gradient descent
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay, momentum=momentum)
    for epoch in range(0, epochs):
        print('epoch={}'.format(epoch))
        # adjust the learning rate: halve it every lr_freq_adj (=50) epochs
        lr = base_lr*(0.5**(epoch//lr_freq_adj))
        # mutate the optimizer's live param_groups; optimizer.state_dict()
        # returns a copy, so writing to it would not change the learning rate
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
        # train for one epoch
        train_epoch(train_loader, model, criterion, optimizer, epoch, normal)
        print("train epoch ended, calculate validation loss")
        val_loss = validate_epoch(test_loader, model, criterion, epoch, normal)
        print("validation loss calculated, choose if best")
        print("Best loss:"+str(best_loss)+", val loss:"+str(val_loss))
        is_best = val_loss < best_loss
        best_loss = min(val_loss, best_loss)
        losses.append(val_loss)
        print("Best chosen, save checkpoint")
        save_checkpoint({'epoch':epoch+1, 'state_dict':model.state_dict(), 'best_loss':best_loss}, is_best, normal, dataname)
        print("Checkpoint saved")
    print("Losses:"+str(losses))
    dflosses = pd.DataFrame(losses)
    dflosses.to_csv(dataname+'_losses.csv', index=False)
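# Illustrative sketch (not in the original gist): how train() and predict()
# would be invoked end to end. The DataFrames, alphabets, and the 'example'
# dataname below are hypothetical placeholders; the real ones come from the
# project's preprocessing pipeline.
def _demo_pipeline(df, df_seq_input, categories, alphabet, size_alphabet):
    # train on scenario 1, validate on scenario 2 (handled inside train()),
    # then score scenario 3 sessions with the best saved checkpoint
    train(df_seq_input, categories, alphabet, size_alphabet, dataname='example')
    scores = predict(df, df_seq_input, categories, alphabet, size_alphabet, dataname='example')
    # sessions whose flows the model found surprising get high mean/median scores
    print(scores[['SrcAddr', 'session_id', 'mean', 'median']].head())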
###################################################################################################
###################################################################################################
###################################################################################################
def pad_sequence(sequences, batch_first=False, padding_value=20):
    # Pads a list of one-hot sequences (each of shape (len, 1, alphabet)) to a
    # common length. Note: the output is always batch-first here regardless of
    # the batch_first flag, and padding_value is the alphabet index of the pad symbol.
    max_size = sequences[0].size()
    trailing_dims = max_size[1:]
    max_len = max([s.size(0) for s in sequences])
    out_dims = (len(sequences), max_len) + trailing_dims
    out_tensor = Variable(sequences[0]).data.new(*out_dims).fill_(0)
    for i, tensor in enumerate(sequences):
        length = tensor.size(0)
        # use index notation to prevent duplicate references to the tensor
        out_tensor[i, :length, ...] = tensor
        if length < max_len:
            # one-hot-encode the pad symbol on the padded tail
            out_tensor[i, length:, 0, padding_value] = 1
    return out_tensor
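# Illustrative sketch (not in the original gist): padding two one-hot sequences
# of lengths 3 and 1 to a common length of 3, with a made-up alphabet of size 4
# whose last index (3) serves as the pad symbol.
def _demo_pad_sequence():
    a = torch.zeros(3, 1, 4)
    a[:, 0, 0] = 1               # three flows, all symbol 0
    b = torch.zeros(1, 1, 4)
    b[0, 0, 1] = 1               # one flow, symbol 1
    padded = pad_sequence([a, b], batch_first=True, padding_value=3)
    print(padded.shape)          # torch.Size([2, 3, 1, 4])
    print(padded[1, 1:, 0, 3])   # tensor([1., 1.]) -> padded tail is one-hot at pad index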
class PadSequence:
    def __call__(self, batch):
        # Each element in "batch" is a tuple
        # (category, sequence, size_sequence, label, size_label, SrcAddr, session_id).
        # Sort the batch by sequence length in descending order.
        sorted_batch = sorted(batch, key=lambda x: x[2].shape[0], reverse=True)
        new_batch_size = len(batch)
        sequences = [x[1] for x in sorted_batch]
        size_sequences = [x[2] for x in sorted_batch]
        # the last alphabet index is reserved for the pad symbol
        padding_value = len(alphabett)-1
        size_padding_value = len(size_alphabett)-1
        sequences_padded = pad_sequence(sequences, batch_first=True, padding_value=padding_value)
        size_sequences_padded = pad_sequence(size_sequences, batch_first=True, padding_value=size_padding_value)
        # Also store the length of each sequence;
        # this is later needed in order to unpad the sequences
        lengths = torch.LongTensor([len(x) for x in sequences])
        # Don't forget to grab the labels of the *sorted* batch
        labelss = [x[3] for x in sorted_batch]
        size_labelss = [x[4] for x in sorted_batch]
        out_dim = max([x.size(0) for x in labelss])
        cats = torch.stack([x[0] for x in sorted_batch]).squeeze(1)
        # target matrices initialised to the pad index
        targets = torch.full((len(labelss), out_dim), padding_value, dtype=torch.long)
        size_targets = torch.full((len(size_labelss), out_dim), size_padding_value, dtype=torch.long)
        for i in range(len(labelss)):
            outer = labelss[i].size(0)
            targets[i, 0:outer] = labelss[i].view(1, -1)
            size_targets[i, 0:outer] = size_labelss[i].view(1, -1)
        SrcAddrs = [x[5] for x in sorted_batch]
        session_ids = [x[6] for x in sorted_batch]
        return (cats, sequences_padded, size_sequences_padded, lengths, targets, size_targets, SrcAddrs, session_ids, new_batch_size)