# RNN definition
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.utils.data as data
from torch.autograd import Variable

# AverageMeter, save_checkpoint, session_loader and PATH_MODEL are assumed to be
# defined elsewhere in the project.

class LSTM_deep(nn.Module):
    def __init__(self, n_categories, seq_input_size, seq_size_input_size, batch_size, hidden_size):
        super(LSTM_deep, self).__init__()
        dropout = 0.2
        self.num_layers = num_layers = 2
        self.bidirectional = bidirectional = False
        embed_dim1 = 10
        embed_dim2 = 5
        embed_dim3 = 3
        self.hidden_size = hidden_size
        self.hidden_size2 = hidden_size2 = hidden_size
        self.cn_size = hidden_size
        #self.batch_size = batch_size
        self.seq_input_size = seq_input_size
        self.n_categories = n_categories
        ############################
        # the one-hot inputs are embedded with linear layers;
        # nn.Embedding on index inputs would be cleaner
        self.embed_state = nn.Linear(seq_input_size, embed_dim1)
        self.embed_size = nn.Linear(seq_size_input_size, embed_dim2)
        self.embed_cat = nn.Linear(n_categories, embed_dim3)
        ############################
        self.i2h = nn.LSTM(input_size=embed_dim1 + embed_dim2 + embed_dim3,
                           hidden_size=hidden_size,
                           num_layers=num_layers, dropout=dropout, bidirectional=bidirectional)
        self.o2o = nn.Sequential(nn.Linear(hidden_size, hidden_size2), nn.ReLU(inplace=True))
        self.o2o2 = nn.Linear(hidden_size2, seq_input_size)
        self.size_o2o2 = nn.Linear(hidden_size2, seq_size_input_size)
        self.dropout = nn.Dropout(dropout)
        self.softmax = nn.LogSoftmax(dim=1)
    def forward(self, category, seq_input, seq_size_input, hidden, cn, new_batch_size):
        seq_input_combined_LSTM = torch.cat((self.embed_cat(category.view(new_batch_size, 1, self.n_categories)),
                                             self.embed_state(seq_input),
                                             self.embed_size(seq_size_input)), 2).transpose(0, 1)
        outputLSTM, (hidden, cn) = self.i2h(seq_input_combined_LSTM, (hidden, cn))
        outputLSTM = outputLSTM.view(new_batch_size, self.hidden_size)
        outputLSTM = self.dropout(outputLSTM)  # nn.Dropout is not in-place; keep the result
        output = self.o2o(outputLSTM)
        state_output = self.o2o2(output)
        size_output = self.size_o2o2(output)  # size head, not the state head
        state_output = self.softmax(state_output)
        size_output = self.softmax(size_output)
        return state_output, size_output, hidden, cn

    def initHidden(self, new_batch_size):
        return Variable(torch.zeros(self.num_layers, new_batch_size, self.hidden_size))

    def initcn(self, new_batch_size):
        return Variable(torch.zeros(self.num_layers, new_batch_size, self.cn_size))
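
# A minimal shape-check sketch (not part of the original gist): all sizes below
# are made-up assumptions, chosen only to illustrate the tensor shapes forward()
# expects — one-hot inputs of shape (batch, 1, alphabet_size), one flow step at
# a time, with hidden/cell state carried between steps.
def _shape_check_example():
    batch, n_cats, alpha, size_alpha = 4, 3, 8, 6
    model = LSTM_deep(n_cats, seq_input_size=alpha, seq_size_input_size=size_alpha,
                      batch_size=batch, hidden_size=16)
    hidden, cn = model.initHidden(batch), model.initcn(batch)
    category = Variable(torch.zeros(batch, 1, n_cats))
    seq_step = Variable(torch.zeros(batch, 1, alpha))
    size_step = Variable(torch.zeros(batch, 1, size_alpha))
    state_out, size_out, hidden, cn = model(category, seq_step, size_step, hidden, cn, batch)
    print(state_out.shape, size_out.shape)  # (batch, alpha) and (batch, size_alpha) log-probabilities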
def train_epoch(train_loader, model, criterion, optimizer, epoch, normal):
    """
    Train for one epoch.
    """
    model.train()
    print('inside train')
    for i, (category, seq_input, seq_size_input, lengths, target, size_target, SrcAddr, session_id,
            new_batch_size) in enumerate(train_loader):
        # progress logging: every batch up to 10, then every 10th, 100th, 1000th
        if i < 10 or (i < 100 and i % 10 == 0) or (i < 1000 and i % 100 == 0) or i % 1000 == 0:
            print('i={}'.format(i))
        # wrap the tensors in Variable so we can use gradients
        category = Variable(category)
        seq_input = Variable(seq_input.transpose(0, 1))
        seq_size_input = Variable(seq_size_input.transpose(0, 1))
        target = Variable(target.transpose(0, 1))
        size_target = Variable(size_target.transpose(0, 1))
        hidden = model.initHidden(new_batch_size)
        ############## LSTM addition ##############
        cn = model.initcn(new_batch_size)
        ###########################################
        model.zero_grad()
        loss = 0
        for flow in range(seq_input.size()[0]):
            ############## LSTM addition ##############
            state_output, size_output, hidden, cn = model(category, seq_input[flow], seq_size_input[flow],
                                                          hidden, cn, new_batch_size)
            ###########################################
            loss += criterion(state_output, target[flow]) + criterion(size_output, size_target[flow])
        loss.backward()
        optimizer.step()
def validate_epoch(val_loader, model, criterion, epoch, normal):
    losses = AverageMeter()
    # switch to evaluation mode so that the Dropout doesn't drop neurons
    model.eval()
    for i, (category, seq_input, seq_size_input, lengths, target, size_target, SrcAddr, session_id,
            new_batch_size) in enumerate(val_loader):
        if i < 10 or (i < 100 and i % 10 == 0) or (i < 1000 and i % 100 == 0) or i % 1000 == 0:
            print('i={}'.format(i))
        category = Variable(category)
        seq_input = Variable(seq_input.transpose(0, 1))
        seq_size_input = Variable(seq_size_input.transpose(0, 1))
        target = Variable(target.transpose(0, 1))
        size_target = Variable(size_target.transpose(0, 1))
        hidden = model.initHidden(new_batch_size)
        ############## LSTM addition ##############
        cn = model.initcn(new_batch_size)
        ###########################################
        # no backward pass here, so no gradient reset is needed
        loss = 0
        for flow in range(seq_input.size()[0]):
            ############## LSTM addition ##############
            state_output, size_output, hidden, cn = model(category, seq_input[flow], seq_size_input[flow],
                                                          hidden, cn, new_batch_size)
            ###########################################
            loss += criterion(state_output, target[flow]) + criterion(size_output, size_target[flow])
        print(loss.item())
        losses.update(loss.item(), seq_input.size()[0])
        with open('validate_log.txt', 'a') as f:
            f.write('Epoch: [{0}][{1}/{2}]\t'
                    'loss: {loss}\n'.format(epoch, i, len(val_loader), loss=loss.item()))
    return losses.avg
def predict(df, df_seq_input, categories, alphabet, size_alphabet, dataname, n_hidden=200):
    """
    Loads the trained RNN and returns prediction scores for normal and infected
    hosts in the test set, together with cumulative averages of the scores.
    """
    print('prediction starts')
    n_categories = len(categories)
    batch_size = 1
    #n_hidden=200
    global alphabett
    alphabett = alphabet
    global size_alphabett
    size_alphabett = size_alphabet
    checkpoint = torch.load(PATH_MODEL + dataname + '_normal_model_best_long.pth.tar')
    model = LSTM_deep(n_categories, seq_input_size=len(alphabet), seq_size_input_size=len(size_alphabet),
                      batch_size=batch_size, hidden_size=n_hidden)
    model.load_state_dict(checkpoint['state_dict'])
    pred_loader = data.DataLoader(session_loader(df_seq_input[df_seq_input['scenario'] == 3], alphabet, size_alphabet, categories),
                                  batch_size=batch_size, shuffle=True, collate_fn=PadSequence())
    model.eval()
    output_stats = {'SrcAddr': [], 'session_id': [], 'mean': [], 'median': [],
                    'probs': [], 'size_probs': [], 'sesslen': [], 'probsmax': [], 'Ports': [], 'Portsmax': [],
                    'size_probsmax': [], 'Sizes': [], 'Sizemax': [],
                    'meanmax': [], 'medianmax': []}
    row_indicator = list(range(batch_size))
    for i, (category, seq_input, seq_size_input, lengths, target, size_target, SrcAddr, session_id,
            new_batch_size) in enumerate(pred_loader):
        if i < 10 or (i < 100 and i % 10 == 0) or (i < 1000 and i % 100 == 0) or i % 1000 == 0:
            print('i={}'.format(i))
        probs = []
        probsmax = []
        Ports = []
        Portsmax = ['-']
        ####################################################################
        size_probs = []
        size_probsmax = []
        Sizes = []
        Sizesmax = ['-']
        ####################################################################
        category = Variable(category)
        seq_input = Variable(seq_input.transpose(0, 1))
        seq_size_input = Variable(seq_size_input.transpose(0, 1))
        target = Variable(target.transpose(0, 1))
        size_target = Variable(size_target.transpose(0, 1))
        hidden = model.initHidden(new_batch_size)
        cn = model.initcn(new_batch_size)
        for flow in range(seq_input.size()[0]):
            state_output, size_output, hidden, cn = model(category, seq_input[flow], seq_size_input[flow],
                                                          hidden, cn, new_batch_size)
            value, index = seq_input[flow].squeeze().max(0)
            size_value, size_index = seq_size_input[flow].squeeze().max(0)
            Port = alphabett[index.item()]
            prob_next_flow = np.array(list(state_output.data[row_indicator, target.data.numpy()[flow]]))
            value, index = state_output.data[row_indicator, ].squeeze().max(0)
            probmax = np.exp(value.item())
            Portmax = alphabett[index.item()]
            ########################################################################################
            Size = size_alphabett[size_index.item()]
            size_prob_next_flow = np.array(list(size_output.data[row_indicator, size_target.data.numpy()[flow]]))
            size_value, size_index = size_output.data[row_indicator, ].squeeze().max(0)  # size head, not state head
            size_probmax = np.exp(size_value.item())
            Sizemax = size_alphabett[size_index.item()]
            ########################################################################################
            probs.extend(list(1 - np.exp(prob_next_flow)))  # exp because the last layer of the RNN is LogSoftmax
            probsmax.append(1 - probmax)
            Ports.append(Port)
            Portsmax.append(Portmax)
            ########################################################################################
            size_probs.extend(list(1 - np.exp(size_prob_next_flow)))  # exp because the last layer of the RNN is LogSoftmax
            size_probsmax.append(1 - size_probmax)
            Sizes.append(Size)
            Sizesmax.append(Sizemax)
        Ports.append('EOS')
        output_stats['mean'].append(np.mean(np.array(probs)))
        output_stats['median'].append(np.median(np.array(probs)))
        output_stats['SrcAddr'].append(SrcAddr[0])
        output_stats['session_id'].append(session_id[0])
        output_stats['probs'].append(str(probs))
        output_stats['probsmax'].append(str(probsmax))
        output_stats['size_probs'].append(str(size_probs))
        output_stats['size_probsmax'].append(str(size_probsmax))
        output_stats['meanmax'].append(np.mean(np.array(probsmax)))
        output_stats['medianmax'].append(np.median(np.array(probsmax)))
        output_stats['Ports'].append(str(Ports))
        output_stats['Sizes'].append(str(Sizes))
        output_stats['Portsmax'].append(str(Portsmax))
        output_stats['Sizemax'].append(str(Sizesmax))
        output_stats['sesslen'].append(seq_input.size()[0])
    df_output = pd.DataFrame(data=output_stats)
    df_output = df_output.sort_values(['SrcAddr', 'session_id'])
    # for each session, also use the previous sessions to compute cumulative average scores
    df_output2 = df_output.assign(sum_means=df_output.groupby('SrcAddr')['mean'].cumsum())
    df_output2 = df_output2.assign(count_previous=df_output2.groupby('SrcAddr')['mean'].cumcount() + 1)
    df_output2 = df_output2.assign(sum_medians=df_output2.groupby('SrcAddr')['median'].cumsum())
    df_output2 = df_output2.assign(avg_previous_means=lambda x: x['sum_means'] / x['count_previous'])
    df_output2 = df_output2.assign(avg_previous_medians=lambda x: x['sum_medians'] / x['count_previous'])
    df_output2.to_csv(dataname + '_output_test.csv', index=False)
    return df_output2
def train(df_seq_input, categories, alphabet, size_alphabet, dataname, epochs=250, base_lr=0.0003, n_hidden=200, batch_size=25):
    # training hyperparameters; these will probably need to be tuned
    global alphabett
    alphabett = alphabet
    global size_alphabett
    size_alphabett = size_alphabet
    print('Starting main')
    losses = []
    lr = base_lr
    #n_hidden=200
    weight_decay = 5e-4
    momentum = 0.9
    lr_freq_adj = 50
    normal = True
    n_categories = len(categories)
    best_loss = 1000000
    # define the model
    model = LSTM_deep(n_categories, seq_input_size=len(alphabet), seq_size_input_size=len(size_alphabet),
                      batch_size=batch_size, hidden_size=n_hidden)
    # define the loss function: negative log-likelihood, since this is multinomial classification
    criterion = nn.NLLLoss()
    train_loader = data.DataLoader(session_loader(df_seq_input[df_seq_input['scenario'] == 1], alphabet, size_alphabet, categories),
                                   batch_size=batch_size, shuffle=True, collate_fn=PadSequence())
    # test loader
    test_loader = data.DataLoader(session_loader(df_seq_input[df_seq_input['scenario'] == 2], alphabet, size_alphabet, categories),
                                  batch_size=batch_size, shuffle=True, collate_fn=PadSequence())
    # optimizer: stochastic gradient descent
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay, momentum=momentum)
    for epoch in range(0, epochs):
        print('epoch={}'.format(epoch))
        # adjust the learning rate: halve it every lr_freq_adj epochs
        lr = base_lr * (0.5 ** (epoch // lr_freq_adj))
        # note: optimizer.state_dict() returns a copy, so mutate param_groups directly
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
        # train for one epoch
        train_epoch(train_loader, model, criterion, optimizer, epoch, normal)
        print("train epoch ended, calculate validation loss")
        val_loss = validate_epoch(test_loader, model, criterion, epoch, normal)
        print("validation loss calculated, choose if best")
        print("Best loss:" + str(best_loss) + ", val loss:" + str(val_loss))
        is_best = val_loss < best_loss
        best_loss = min(val_loss, best_loss)
        losses.append(val_loss)
        print("Best chosen, save checkpoint")
        save_checkpoint({'epoch': epoch + 1, 'state_dict': model.state_dict(), 'best_loss': best_loss}, is_best, normal, dataname)
        print("Checkpoint saved")
    print("Losses:" + str(losses))
    dflosses = pd.DataFrame(losses)
    dflosses.to_csv(dataname + '_losses.csv', index=False)
###################################################################################################
###################################################################################################
###################################################################################################
def pad_sequence(sequences, batch_first=False, padding_value=20):
    # unlike torch.nn.utils.rnn.pad_sequence, the output here is always
    # batch-first (the batch_first flag is ignored), and padded positions
    # are one-hot encoded at index padding_value
    max_size = sequences[0].size()
    trailing_dims = max_size[1:]
    max_len = max([s.size(0) for s in sequences])
    out_dims = (len(sequences), max_len) + trailing_dims
    out_tensor = Variable(sequences[0]).data.new(*out_dims).fill_(0)
    for i, tensor in enumerate(sequences):
        length = tensor.size(0)
        # use index notation to prevent duplicate references to the tensor
        out_tensor[i, :length, ...] = tensor
        if length < max_len:
            out_tensor[i, length:, 0, padding_value] = 1
    return out_tensor
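
# A small demonstration (not part of the original gist) of what pad_sequence
# produces, assuming one-hot sequences of shape (length, 1, alphabet_size) with
# the last alphabet index reserved for padding, as PadSequence below uses it.
def _pad_sequence_example():
    alpha = 5
    a = torch.eye(alpha)[[0, 1, 2]].view(3, 1, alpha)  # session of 3 one-hot flows
    b = torch.eye(alpha)[[3]].view(1, 1, alpha)        # session of 1 one-hot flow
    padded = pad_sequence([a, b], batch_first=True, padding_value=alpha - 1)
    print(padded.shape)      # torch.Size([2, 3, 1, 5]): both sessions padded to length 3
    print(padded[1, 1:, 0])  # padded steps are one-hot at index alpha - 1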
class PadSequence:
    def __call__(self, batch):
        # each element in "batch" is a tuple
        # (category, sequence, size_sequence, label, size_label, SrcAddr, session_id)
        # sort the batch by size-sequence length, in descending order
        sorted_batch = sorted(batch, key=lambda x: x[2].shape[0], reverse=True)
        new_batch_size = len(batch)
        sequences = [x[1] for x in sorted_batch]
        size_sequences = [x[2] for x in sorted_batch]
        padding_value = len(alphabett) - 1
        size_padding_value = len(size_alphabett) - 1
        sequences_padded = pad_sequence(sequences, batch_first=True, padding_value=padding_value)
        size_sequences_padded = pad_sequence(size_sequences, batch_first=True, padding_value=size_padding_value)
        # also store the length of each sequence;
        # this is needed later in order to unpad the sequences
        lengths = torch.LongTensor([len(x) for x in sequences])
        # don't forget to grab the labels of the *sorted* batch
        labelss = [x[3] for x in sorted_batch]
        size_labelss = [x[4] for x in sorted_batch]
        out_dim = max([x.size(0) for x in labelss])
        cats = torch.stack([x[0] for x in sorted_batch]).squeeze(1)
        # target tensors are pre-filled with the padding index
        targets = torch.full((len(labelss), out_dim), padding_value, dtype=torch.long)
        size_targets = torch.full((len(size_labelss), out_dim), size_padding_value, dtype=torch.long)
        for i in range(len(labelss)):
            seq_len = labelss[i].size(0)
            targets[i, 0:seq_len] = labelss[i].view(1, -1)
            size_targets[i, 0:seq_len] = size_labelss[i].view(1, -1)
        SrcAddrs = [x[5] for x in sorted_batch]
        session_ids = [x[6] for x in sorted_batch]
        return (cats, sequences_padded, size_sequences_padded, lengths, targets, size_targets, SrcAddrs, session_ids, new_batch_size)
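
# A hypothetical end-to-end driver (not part of the original gist). It assumes
# df_seq_input is a DataFrame with a 'scenario' column (1 = train, 2 = validation,
# 3 = test, matching the loaders above) and that alphabet/size_alphabet end with
# a reserved padding symbol; the dataname 'ctu' is made up for illustration.
#
#   train(df_seq_input, categories, alphabet, size_alphabet, dataname='ctu')
#   df_scores = predict(df, df_seq_input, categories, alphabet, size_alphabet,
#                       dataname='ctu')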