Skip to content

Instantly share code, notes, and snippets.

Created February 28, 2022 18:35
Show Gist options
  • Save michhar/388d037439da6114d67aa8f793293870 to your computer and use it in GitHub Desktop.
Save michhar/388d037439da6114d67aa8f793293870 to your computer and use it in GitHub Desktop.
Preprocessor for use with LogAnomaly (based on logdeep project)
Contributor: Micheleen Harris
Date: Feb. 20, 2022
Original source:
Purpose: Map event ids to an encoded semantics vector (specifically for loganomaly method)
- Uses the spellpy parser project: (need to pip install)
- Need the stop words Python file from SpaCy in project folder with this file:
- The examples below come from Ubuntu system logs (labelled normal or abnormal by the user)
Get "cc.en.300.vec" as follows (on Linux; note that the unarchived file is ~4.5 GB):
mkdir vec_models
cd vec_models
wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz
gunzip cc.en.300.vec.gz
import re
import json
import io
from tqdm import tqdm
import os
import numpy as np
import pandas as pd
from collections import OrderedDict, Counter
import math
from pprint import pprint
from datetime import datetime
from stop_words import StopWords
from spellpy import spell
class Preprocessor:
    """Turn Spell-parsed log frames into the inputs used by LogAnomaly.

    The methods cover: windowing encoded event ids per minute, writing the
    windows to text files, tokenising event templates, computing a smoothed
    IDF per template word, and building one TF-IDF weighted word-vector
    average ("semantic vector") per event id.

    NOTE(review): the published copy of this class had lost its indentation
    and several interior lines; the lines marked "reconstructed" below were
    rebuilt from the surrounding code and should be confirmed against the
    author's original.
    """

    def __init__(self):
        # StopWords comes from the project-local ``stop_words`` module.
        self.stop_words = StopWords().STOP_WORDS

    def df_transfer(self, df, event_id_map, year=None):
        """Group encoded event ids into one-minute windows.

        Args:
            df: Frame with ``Month``, ``Day``, ``Time`` and ``EventId``
                columns. It is not modified.
            event_id_map: Mapping from raw event id to encoded id. Ids that
                are missing from the mapping are encoded as ``-1``.
            year: Year to prepend to each timestamp, because the log columns
                used here carry none. Defaults to the current UTC year.

        Returns:
            A frame with a ``datetime`` column (window start) and an
            ``EventId`` column holding the list of encoded ids seen in that
            minute. Windows without events hold an empty list.
        """
        if year is None:
            from datetime import timezone
            year = datetime.now(timezone.utc).year

        # Reconstructed: the original call was cut off after the lambda; the
        # column names are taken from the '<Month> <Day> <Time> ...' format.
        timestamps = [
            f'{year}-{month}-{str(day).rstrip()} {time}'
            for month, day, time in zip(df['Month'], df['Day'], df['Time'])
        ]
        index = pd.DatetimeIndex(
            pd.to_datetime(timestamps, errors='coerce'), name='datetime')

        encoded = [event_id_map.get(event_id, -1) for event_id in df['EventId']]
        events = pd.Series(encoded, index=index, name='EventId')
        # Rows whose timestamp could not be parsed (NaT) cannot be assigned
        # to a window, so they are left out.
        events = events[events.index.notna()]
        if events.empty:
            return pd.DataFrame({'datetime': [], 'EventId': []})

        return events.resample('1min').apply(self._custom_resampler).reset_index()

    def _custom_resampler(self, array_like):
        """Collapse the events of one window; override to sample differently."""
        return list(array_like)

    def file_generator(self, filename, df):
        """Write one line of space-separated event ids per non-empty window.

        Args:
            filename: Path of the text file to (over)write.
            df: Object whose ``EventId`` entry iterates over lists of ids.
        """
        with open(filename, 'w') as f:
            for event_id_list in df['EventId']:
                # Reconstructed: the body of this ``if`` was missing; a
                # newline per non-empty window is assumed.
                if len(event_id_list) > 0:
                    line = ''.join(f'{event_id} ' for event_id in event_id_list)
                    f.write(line + '\n')

    def normalize_text(self, text):
        """Split a template into lower-case word tokens.

        Non-word characters and digits are removed, camelCase is split into
        separate words, and tokens found in ``self.stop_words`` are dropped.

        Args:
            text: Raw template string.

        Returns:
            List of remaining tokens, in order of appearance.
        """
        # replace special characters with space and remove digits
        text = re.sub(r'\W+', ' ', text)
        text = re.sub(r'\d', '', text)
        # convert camel case to snake case, then replace _ with space
        text = re.sub(r'(.)([A-Z][a-z]+)', r'\1_\2', text)
        text = re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', text).lower().replace('_', ' ')
        # tokenize, removing stop words
        return [t for t in text.split(' ') if t != '' and t not in self.stop_words]

    def dump2json(self, dump_dict, target_path):
        """Save ``dump_dict`` as indented JSON.

        ``bytes`` values are decoded as UTF-8 and NumPy scalars/arrays are
        converted to plain Python values; any other unsupported type raises
        ``TypeError`` as usual.

        Args:
            dump_dict: JSON-compatible object to write.
            target_path: Path of the file to (over)write.
        """
        class MyEncoder(json.JSONEncoder):
            def default(self, obj):
                if isinstance(obj, bytes):
                    return str(obj, encoding='utf-8')
                if isinstance(obj, np.generic):
                    return obj.item()
                if isinstance(obj, np.ndarray):
                    return obj.tolist()
                return json.JSONEncoder.default(self, obj)

        with open(target_path, 'w', encoding='utf-8') as file:
            file.write(json.dumps(dump_dict, cls=MyEncoder, indent=4))

    def create_word2idf(self, log_train, eventid2template, oov_doc_freq=29):
        """Create a word -> smoothed IDF dict.

        Every event occurrence in every training window counts as one
        document whose words are the tokens of that event's template.
        IDF uses the sklearn-style smoothing ``ln((N + 1) / (df + 1)) + 1``.

        Args:
            log_train: Object whose ``EventId`` entry iterates over lists of
                encoded event ids.
            eventid2template: Mapping from encoded event id to token list.
                Ids without a template (e.g. ``-1``) are skipped.
            oov_doc_freq: Document frequency assumed for the ``'oov'``
                (out-of-vocabulary) entry. The default keeps the original
                hard-coded value; its origin is not documented in the source.

        Returns:
            Dict mapping each word to its IDF, plus an ``'oov'`` entry.
        """
        # Reconstructed: the inner loop body was missing; collecting the
        # template of each event is assumed.
        documents = [
            eventid2template[event]
            for seq in log_train['EventId']
            for event in seq
            if event in eventid2template
        ]
        num_instance = len(documents)

        # Document frequency: a word counts once per document it appears in.
        doc_freq = Counter()
        for tokens in documents:
            for word in dict.fromkeys(tokens):
                doc_freq[word] += 1

        # smooth idf like sklearn
        word2idf = {
            word: math.log((num_instance + 1) / (freq + 1)) + 1
            for word, freq in doc_freq.items()
        }
        # smooth idf when oov
        word2idf['oov'] = math.log((num_instance + 1) / (oov_doc_freq + 1)) + 1
        return word2idf

    def create_semantic_vec(self, eventid2template, fasttext_map, word2idf, dim=300):
        """Build one TF-IDF weighted average word vector per event.

        Args:
            eventid2template: Mapping from event id to token list.
            fasttext_map: Mapping from word to its vector (length ``dim``).
            word2idf: Mapping from word to IDF; must contain ``'oov'``, which
                is used for words it does not list.
            dim: Length of the word vectors.

        Returns:
            Dict mapping each event id to a list of ``dim`` floats.
        """
        event2semantic_vec = dict()
        for event, template in eventid2template.items():
            tem_len = len(template)
            weights = dict(Counter(template))
            for word in weights:
                tf = weights[word] / tem_len
                idf = word2idf.get(word, word2idf['oov'])
                weights[word] = tf * idf

            # Normalise the weights so they sum to one; a zero sum would
            # divide by zero, so the weights are left as they are then.
            value_sum = sum(weights.values())
            if value_sum != 0:
                for word in weights:
                    weights[word] = weights[word] / value_sum

            semantic_vec = np.zeros(dim)
            for word, weight in weights.items():
                # Reconstructed: the original ``try:`` line and handler body
                # were missing; words without a vector are assumed to
                # contribute nothing.
                try:
                    fasttext_weight = np.array(fasttext_map[word])
                except KeyError:
                    # word not in fasttext
                    continue
                semantic_vec += weight * fasttext_weight
            event2semantic_vec[event] = semantic_vec.tolist()
        return event2semantic_vec
class FastTextProcessor:
    """Use fasttext vectors to generate a word -> vector map for templates.

    NOTE(review): the published copy of this class had lost its indentation
    and several interior lines; the lines marked "reconstructed" below were
    rebuilt from the surrounding code and should be confirmed against the
    author's original.
    """

    def __init__(self):
        self.template_set = set()
        self.template_fasttext_map = {}

    def create_template_set(self, result):
        """Collect every word that occurs in any template.

        Args:
            result: Mapping whose values are iterables of words.
        """
        print('Creating template set')
        for key in tqdm(result.keys()):
            # Reconstructed: the loop body was missing; adding each word to
            # ``template_set`` is assumed.
            self.template_set.update(result[key])

    def load_vectors(self, fname, vocab=None):
        """Read a fasttext ``.vec`` text file into a dict.

        Args:
            fname: Path of the vector file. Its first line must hold two
                integers, as the original code required.
            vocab: Optional container of words to keep. When given, lines
                for other words are skipped so the whole file does not have
                to be held in memory.

        Returns:
            Dict mapping each kept word to a list of floats.
        """
        data = {}
        # Reconstructed: the call name and first argument were stripped from
        # the source; ``io.open(fname, ...)`` matches the keyword arguments
        # that survived.
        with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
            # The header values are not used, but parsing them keeps the
            # original failure on a file without the expected header.
            _n, _d = map(int, fin.readline().split())
            print('Loading vectors')
            for line in tqdm(fin):
                tokens = line.rstrip().split(' ')
                if vocab is not None and tokens[0] not in vocab:
                    continue
                # A concrete list rather than a lazy ``map`` object, which
                # could only be iterated once.
                data[tokens[0]] = [float(value) for value in tokens[1:]]
        return data

    def create_map(self, fname=None):
        """Fill and return ``template_fasttext_map`` for the template words.

        Args:
            fname: Path of the vector file. Defaults to
                ``vec_models/cc.en.300.vec``.

        Returns:
            Dict mapping each template word that has a vector to that vector.
            Words without a vector are left out.
        """
        if fname is None:
            fname = os.path.join('vec_models', 'cc.en.300.vec')
        fasttext = self.load_vectors(fname, vocab=self.template_set)
        print('Creating fasttext map')
        for word in tqdm(self.template_set):
            # Reconstructed: the original ``try:`` line and handler body
            # were missing; skipping unknown words is assumed.
            try:
                self.template_fasttext_map[word] = list(fasttext[word])
            except KeyError:
                # fasttext does not have word
                continue
        return self.template_fasttext_map
def main():
    """Parse the syslog files, then build every LogAnomaly input artefact.

    NOTE(review): the published copy of this script had lost its indentation
    and several lines; the parts marked "reconstructed" below were rebuilt
    from the surrounding code and should be confirmed against the author's
    original.
    """
    preprocessor = Preprocessor()

    # Parser #
    input_dir = '../../data/'
    output_dir = './results_spell/'
    results_dir = './results_preprocessor/'
    recreated_parse_logs = False
    # "Content" is like the log message - what we want to parse
    # the following is specific to the syslog, so match to those "columns"
    log_format = '<Month> <Day> <Time> <MachineName> <Content>'
    log_main = 'syslog'
    tau = 0.5
    # Reconstructed: only the first name survived in the source; the other
    # two are taken from the *_structured.csv files read below.
    log_names = ['syslog.1.updated', 'syslog.2.updated', 'abnormal_states.log']

    # Reconstructed: the argument list was cut off; these keyword names
    # follow the spellpy README -- confirm against the installed version.
    parser = spell.LogParser(
        indir=input_dir,
        outdir=output_dir,
        log_format=log_format,
        logmain=log_main,
        tau=tau,
    )
    # if we wish, we can recreate the parsed csv's
    if recreated_parse_logs:
        for log_name in log_names:
            parser.parse(log_name)

    # Transformation #
    # TODO: read from object, not file i/o again
    df_train = pd.read_csv(os.path.join(output_dir, 'syslog.1.updated_structured.csv'))
    df_test_normal = pd.read_csv(os.path.join(output_dir, 'syslog.2.updated_structured.csv'))
    df_test_abnormal = pd.read_csv(os.path.join(output_dir, 'abnormal_states.log_structured.csv'))
    print('Number of classes for training = ', df_train['EventId'].unique().shape)

    # Encoded ids start at 1.
    event_id_map = dict()
    for i, event_id in enumerate(df_train['EventId'].unique(), 1):
        event_id_map[event_id] = i

    # The output files below are opened for writing directly, so the
    # directory has to exist first.
    os.makedirs(results_dir, exist_ok=True)

    # Train Set
    log_train = preprocessor.df_transfer(df_train, event_id_map)
    preprocessor.file_generator(os.path.join(results_dir, 'train'), log_train)
    # Test Normal Set
    log_test_normal = preprocessor.df_transfer(df_test_normal, event_id_map)
    preprocessor.file_generator(os.path.join(results_dir, 'test_normal'), log_test_normal)
    # Test Abnormal Set
    log_test_abnormal = preprocessor.df_transfer(df_test_abnormal, event_id_map)
    preprocessor.file_generator(os.path.join(results_dir, 'test_abnormal'), log_test_abnormal)

    # Event to Template #
    # Look each template up by its own EventId. The original indexed the
    # rows of df_train by the *encoded* id, which picks an unrelated row.
    templates = df_train.drop_duplicates('EventId').set_index('EventId')['EventTemplate']
    eventid2template = {}
    print('Creating event IDs to templates')
    for eid in tqdm(df_train['EventId'].unique()):
        eventid2template[event_id_map[eid]] = preprocessor.normalize_text(templates[eid])
    preprocessor.dump2json(eventid2template, os.path.join(results_dir, 'eventid2template.json'))

    # Fasttext map #
    fasttext_processor = FastTextProcessor()
    # Reconstructed: without this call the template set is empty and the
    # resulting map would be empty as well.
    fasttext_processor.create_template_set(eventid2template)
    template_fasttext_map = fasttext_processor.create_map()
    preprocessor.dump2json(template_fasttext_map, os.path.join(results_dir, 'fasttext_map.json'))

    # Word to IDF #
    word2idf = preprocessor.create_word2idf(log_train, eventid2template)
    preprocessor.dump2json(word2idf, os.path.join(results_dir, 'word2idf.json'))

    # Event to Semantics Vector #
    event2semantic_vec = preprocessor.create_semantic_vec(eventid2template, template_fasttext_map, word2idf)
    # NOTE(review): the source ends at the line above, leaving the vectors
    # unused; the file name here is an assumption -- confirm what the
    # downstream training code expects.
    preprocessor.dump2json(event2semantic_vec, os.path.join(results_dir, 'event2semantic_vec.json'))


if __name__ == "__main__":
    main()
Copy link

sq2309 commented Sep 16, 2022

When generating eventid2template, event_id_map[eid] is derived from 'unique()', so it is no longer a valid row index into the original df_train. Perhaps we could get the 'EventTemplate' from *_templates.csv using the correct 'EventId' instead?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment