FaVorith/batch-large-file.py

## batch-large-file.py
# coding: utf-8

__author__ = 'Fabian Voith'
__version__ = '1.0.0'
__email__ = 'admin@fabian-voith.de'

import pandas as pd

# The CSV file (can be in zip file) MUST have the columns "Event Name", "Event ID (custom)", and "Unnamed: 9" (containing the Payload).
# Apart from that, the following structure is recommended:
#['Event Name',
# 'Low Level Category',
# 'Source IP',
# 'Source Port',
# 'Destination IP',
# 'Destination Port',
# 'Username',
# 'Event ID (custom)',
# 'AgentLogFile (custom)',
# 'Unnamed: 9',
# 'Unnamed: 10',
# 'Unnamed: 11']

filename = r'*Insert your filename to be processed here; can also be zipped, e.g. C:\file.zip*'

processed_ids = []

first_run = True
chunk_counter = 0

chunksize = 10 ** 4  # number of rows per chunk
for chunk in pd.read_csv(filename, chunksize=chunksize, sep=','):

    chunk_counter += 1
    print('Working on chunk ' + str(chunk_counter))


    # Rename columns
    chunk.rename(columns={'EventID (custom)':'EventID', 'Unnamed: 9':'Payload',
                          'Event Name':'EventName'}, inplace=True)

    # Drop unused Payload data
    # axis=1 is for columns, axis=0 for rows
    chunk.drop({'Unnamed: 10','Unnamed: 11'}, axis=1, inplace=True)

    # Keep only one EventID and drop duplicates
    chunk = chunk.drop_duplicates('EventID', keep='last')


    if first_run:
        # create new Dataframe, copy structure from original table
        new_df = chunk.reindex_like(chunk)
        # drop empty cells
        new_df.dropna(inplace=True)
        first_run = False


    # combine chunk with all eventIDs we have so far
    new_df = pd.concat([chunk, new_df])


if chunk_counter > 1:
    # if we read more than one chunk, just removing duplicates from the chunk might still result in duplicates
    # because we are merging several chunks to a new DF. So each chunk is free of duplicates, but among the chunks
    # there might be identical Event IDs, so we need to remove duplicates from the new DF as well:
    new_df = new_df.drop_duplicates('EventID', keep='last')

new_df = new_df.sort_values(['EventID'], ascending=[True])

print('Writing file...')
new_df.to_excel('event_id_examples.xlsx')

print('Done. Wrote ' + str(len(new_df)) + ' lines.')
	# coding: utf-8

	__author__ = 'Fabian Voith'
	__version__ = '1.0.0'
	__email__ = 'admin@fabian-voith.de'

	import pandas as pd

	# The CSV file (can be in zip file) MUST have the columns "Event Name", "Event ID (custom)", and "Unnamed: 9" (containing the Payload).
	# Apart from that, the following structure is recommended:
	#['Event Name',
	# 'Low Level Category',
	# 'Source IP',
	# 'Source Port',
	# 'Destination IP',
	# 'Destination Port',
	# 'Username',
	# 'Event ID (custom)',
	# 'AgentLogFile (custom)',
	# 'Unnamed: 9',
	# 'Unnamed: 10',
	# 'Unnamed: 11']

	filename = r'Insert your filename to be processed here; can also be zipped, e.g. C:\file.zip'

	processed_ids = []

	first_run = True
	chunk_counter = 0

	chunksize = 10 ** 4 # number of rows per chunk
	for chunk in pd.read_csv(filename, chunksize=chunksize, sep=','):

	chunk_counter += 1
	print('Working on chunk ' + str(chunk_counter))


	# Rename columns
	chunk.rename(columns={'EventID (custom)':'EventID', 'Unnamed: 9':'Payload',
	'Event Name':'EventName'}, inplace=True)

	# Drop unused Payload data
	# axis=1 is for columns, axis=0 for rows
	chunk.drop({'Unnamed: 10','Unnamed: 11'}, axis=1, inplace=True)

	# Keep only one EventID and drop duplicates
	chunk = chunk.drop_duplicates('EventID', keep='last')


	if first_run:
	# create new Dataframe, copy structure from original table
	new_df = chunk.reindex_like(chunk)
	# drop empty cells
	new_df.dropna(inplace=True)
	first_run = False


	# combine chunk with all eventIDs we have so far
	new_df = pd.concat([chunk, new_df])


	if chunk_counter > 1:
	# if we read more than one chunk, just removing duplicates from the chunk might still result in duplicates
	# because we are merging several chunks to a new DF. So each chunk is free of duplicates, but among the chunks
	# there might be identical Event IDs, so we need to remove duplicates from the new DF as well:
	new_df = new_df.drop_duplicates('EventID', keep='last')

	new_df = new_df.sort_values(['EventID'], ascending=[True])

	print('Writing file...')
	new_df.to_excel('event_id_examples.xlsx')

	print('Done. Wrote ' + str(len(new_df)) + ' lines.')