Skip to content

Instantly share code, notes, and snippets.

@dtrizna
Last active September 23, 2022 21:24
Show Gist options
  • Save dtrizna/b0b9ccc488da59fcc7090a21eba93317 to your computer and use it in GitHub Desktop.
Save dtrizna/b0b9ccc488da59fcc7090a21eba93317 to your computer and use it in GitHub Desktop.
Transform Microsoft XML into pandas DataFrame
import pandas as pd
import sys
from lxml import etree
def read_xml(FILENAME):
    """Parse an event-log XML file and return its root element.

    The whole file is read as bytes and handed to lxml in one piece, so the
    document may span any number of lines and the XML declaration (when
    present) is consumed by the parser instead of being skipped by line
    position.

    Args:
        FILENAME: Path of the XML file to read.

    Returns:
        The root element produced by ``etree.fromstring``.

    Raises:
        OSError: If the file cannot be opened or read.
        ValueError: If the parser returned no root element.
    """
    # recover=True asks lxml to keep parsing past malformed markup
    parser = etree.XMLParser(recover=True)
    # Binary mode: lxml rejects str input that carries an encoding
    # declaration, and bytes let the parser pick the encoding itself.
    with open(FILENAME, 'rb') as file:
        data = file.read()
    raw = etree.fromstring(data, parser=parser)
    if raw is None:
        # A recovering parser can come back empty-handed on unusable input;
        # fail here with a clear message rather than later on ``None``.
        raise ValueError(
            'No XML root element could be parsed from {!r}'.format(FILENAME)
        )
    return raw
def events_to_df(eventlist):
    """Flatten a sequence of event elements into one pandas DataFrame.

    Each event becomes one row. For every descendant element of an event:

    * tags containing ``TimeCreated``, ``Execution`` or ``Security``
      contribute their attributes as ``attribute name -> attribute value``;
    * tags containing ``Provider``, ``System`` or ``Correlation`` are skipped;
    * tags containing ``Data`` contribute ``attribute value -> element text``
      for each attribute they carry;
    * any other element contributes ``tag without namespace -> element text``.

    A ``raw`` column holds the pretty-printed XML of the whole event.

    Args:
        eventlist: Iterable of lxml elements, one per event.

    Returns:
        A DataFrame with one row per event (index 0..n-1) and columns sorted
        by name; an empty DataFrame when ``eventlist`` is empty.
    """
    tag = '{http://schemas.microsoft.com/win/2004/08/events/event}'
    rows = []
    for event in eventlist:
        edict = {}
        for element in event.iterdescendants():
            name = element.tag
            if not isinstance(name, str):
                # comments / processing instructions have a non-string tag,
                # which would break the substring checks below
                continue
            if any(x in name for x in ('TimeCreated', 'Execution', 'Security')):
                for attr_name, attr_value in element.items():
                    edict[attr_name] = attr_value
            # filter out empty fields
            elif any(x in name for x in ('Provider', 'System', 'Correlation')):
                pass
            elif 'Data' in name:
                # the attribute *value* is used as the column name
                for _, attr_value in element.items():
                    edict[attr_value] = element.text
            else:
                edict[name.replace(tag, '')] = element.text
        # add raw text event to have ability always access full value of eventlog
        edict['raw'] = etree.tostring(event, pretty_print=True).decode()
        rows.append(edict)
    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # re-copied the whole frame on every iteration.
    df = pd.DataFrame(rows)
    # keep the alphabetical column order that append(..., sort=True) produced
    return df.sort_index(axis=1)
def main(FILENAME):
    """Parse the XML file at ``FILENAME`` and print the first rows of the result.

    Reads the file with ``read_xml``, gathers every ``Event`` element in the
    Windows event namespace, converts them with ``events_to_df`` and prints
    progress messages plus ``DataFrame.head()``.

    Args:
        FILENAME: Path of the XML file to process.
    """
    root = read_xml(FILENAME)
    print('[!] Found XML file! Preprocessing...')
    # every tag carries this namespace prefix
    namespace = '{http://schemas.microsoft.com/win/2004/08/events/event}'
    # collect all events in document order
    events = list(root.iter(namespace + 'Event'))
    # transform to dataframe
    logdf = events_to_df(events)
    print('[+] File parsed!')
    print(logdf.head())
if __name__ == '__main__':
    # The script needs the path of the XML file as its first argument;
    # without it, exit with a usage hint instead of a bare IndexError.
    if len(sys.argv) < 2:
        sys.exit('Usage: {} <events.xml>'.format(sys.argv[0]))
    main(sys.argv[1])
@seymanurmutlu
Copy link

Could you please share an example log file? I am having trouble running this code on my sample data: it only generates one row in the DataFrame. I am trying to understand why it's not working for all of the data.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment