# coding: utf-8
__author__ = 'Fabian Voith'
__version__ = '1.0.0'
__email__ = 'admin@fabian-voith.de'

import pandas as pd
# The CSV file (can also be inside a zip file) MUST have the columns "Event Name",
# "Event ID (custom)", and "Unnamed: 9" (which contains the payload).
# Apart from that, the following structure is recommended:
# ['Event Name',
#  'Low Level Category',
#  'Source IP',
#  'Source Port',
#  'Destination IP',
#  'Destination Port',
#  'Username',
#  'Event ID (custom)',
#  'AgentLogFile (custom)',
#  'Unnamed: 9',
#  'Unnamed: 10',
#  'Unnamed: 11']
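# Note: the "Unnamed: N" names are what pandas assigns to columns whose header cell is empty in
# the export; they are not literal values in the CSV header itself. As an optional, purely
# illustrative check, the column names pandas will see can be previewed before a full run, e.g.:
# print(pd.read_csv(filename, nrows=0).columns.tolist())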
filename = r'*Insert your filename to be processed here; can also be zipped, e.g. C:\file.zip*'

processed_ids = []
first_run = True
chunk_counter = 0
chunksize = 10 ** 4  # number of rows per chunk

for chunk in pd.read_csv(filename, chunksize=chunksize, sep=','):
    chunk_counter += 1
    print('Working on chunk ' + str(chunk_counter))
    # Rename columns to the shorter names used below
    chunk.rename(columns={'Event ID (custom)': 'EventID', 'Unnamed: 9': 'Payload',
                          'Event Name': 'EventName'}, inplace=True)

    # Drop unused payload data
    # axis=1 is for columns, axis=0 for rows
    chunk.drop(['Unnamed: 10', 'Unnamed: 11'], axis=1, inplace=True)

    # Keep only one row per EventID and drop duplicates
    chunk = chunk.drop_duplicates('EventID', keep='last')

    if first_run:
        # create a new, empty DataFrame that copies the column structure of the original table
        new_df = chunk.iloc[0:0].copy()
        first_run = False

    # combine chunk with all EventIDs we have so far
    new_df = pd.concat([chunk, new_df])

    if chunk_counter > 1:
        # If we read more than one chunk, removing duplicates from each chunk alone might still
        # leave duplicates, because we merge several chunks into the new DataFrame: each chunk is
        # free of duplicates, but identical Event IDs can occur across chunks, so we remove
        # duplicates from the new DataFrame as well.
        new_df = new_df.drop_duplicates('EventID', keep='last')

new_df = new_df.sort_values(['EventID'], ascending=[True])

print('Writing file...')
new_df.to_excel('event_id_examples.xlsx')
print('Done. Wrote ' + str(len(new_df)) + ' lines.')
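
# Optional sanity check (assumes the openpyxl package is installed, which pandas uses for
# .xlsx I/O): read the file back and confirm there is exactly one example row per EventID.
# check = pd.read_excel('event_id_examples.xlsx')
# assert check['EventID'].is_unique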