Skip to content

Instantly share code, notes, and snippets.

Created November 21, 2020 14:18
What would you like to do?
# coding: utf-8
__author__ = 'Fabian Voith'
__version__ = '1.0.0'
__email__ = ''
import pandas as pd
# The CSV file (can be in zip file) MUST have the columns "Event Name", "Event ID (custom)", and "Unnamed: 9" (containing the Payload).
# Apart from that, the following structure is recommended:
#['Event Name',
# 'Low Level Category',
# 'Source IP',
# 'Source Port',
# 'Destination IP',
# 'Destination Port',
# 'Username',
# 'Event ID (custom)',
# 'AgentLogFile (custom)',
# 'Unnamed: 9',
# 'Unnamed: 10',
# 'Unnamed: 11']
filename = r'*Insert your filename to be processed here; can also be zipped, e.g. C:\*'
processed_ids = []
first_run = True
chunk_counter = 0
chunksize = 10 ** 4 # number of rows per chunk
for chunk in pd.read_csv(filename, chunksize=chunksize, sep=','):
chunk_counter += 1
print('Working on chunk ' + str(chunk_counter))
# Rename columns
chunk.rename(columns={'EventID (custom)':'EventID', 'Unnamed: 9':'Payload',
'Event Name':'EventName'}, inplace=True)
# Drop unused Payload data
# axis=1 is for columns, axis=0 for rows
chunk.drop({'Unnamed: 10','Unnamed: 11'}, axis=1, inplace=True)
# Keep only one EventID and drop duplicates
chunk = chunk.drop_duplicates('EventID', keep='last')
if first_run:
# create new Dataframe, copy structure from original table
new_df = chunk.reindex_like(chunk)
# drop empty cells
first_run = False
# combine chunk with all eventIDs we have so far
new_df = pd.concat([chunk, new_df])
if chunk_counter > 1:
# if we read more than one chunk, just removing duplicates from the chunk might still result in duplicates
# because we are merging several chunks to a new DF. So each chunk is free of duplicates, but among the chunks
# there might be identical Event IDs, so we need to remove duplicates from the new DF as well:
new_df = new_df.drop_duplicates('EventID', keep='last')
new_df = new_df.sort_values(['EventID'], ascending=[True])
print('Writing file...')
print('Done. Wrote ' + str(len(new_df)) + ' lines.')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment