Last active
September 23, 2022 21:24
-
-
Save dtrizna/b0b9ccc488da59fcc7090a21eba93317 to your computer and use it in GitHub Desktop.
Transform Microsoft XML into pandas DataFrame
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import sys | |
from lxml import etree | |
def read_xml(FILENAME):
    """Parse an exported event-log XML file and return its root element.

    The whole file is read as raw bytes and parsed in one piece.  Earlier
    revisions opened the file in text mode and parsed only its second line
    (to skip the XML declaration), which silently dropped every event that
    did not sit on that single line and failed on files without a separate
    declaration line.  Handing bytes to lxml lets the parser consume the
    declaration itself and pick the encoding from the declaration / BOM
    instead of the platform default.

    Args:
        FILENAME: Path to the XML file.

    Returns:
        The root element produced by ``etree.fromstring``.  Because the
        parser runs with ``recover=True``, malformed input yields a
        best-effort tree rather than an exception.
    """
    # recover=True: keep going on malformed markup instead of raising.
    parser = etree.XMLParser(recover=True)
    # Binary mode: no locale-dependent decoding, and lxml accepts a leading
    # XML declaration when it is given bytes.
    with open(FILENAME, 'rb') as file:
        data = file.read()
    return etree.fromstring(data, parser=parser)
def events_to_df(eventlist):
    """Flatten an iterable of lxml ``Event`` elements into a DataFrame.

    One row is produced per event.  For every descendant element:

    * tags containing ``TimeCreated``, ``Execution`` or ``Security``
      contribute their XML attributes as columns;
    * tags containing ``Provider``, ``System`` or ``Correlation`` are
      skipped;
    * tags containing ``Data`` contribute one column per attribute, named
      after the attribute *value* and holding the element text;
    * any other element becomes a column named after its tag (namespace
      prefix stripped) holding the element text.

    A ``raw`` column keeps the pretty-printed XML of the whole event.

    Args:
        eventlist: Iterable of lxml elements, one per event.

    Returns:
        A ``pandas.DataFrame`` indexed ``0..n-1`` with alphabetically sorted
        columns; an empty DataFrame when ``eventlist`` yields nothing.
    """
    tag = '{http://schemas.microsoft.com/win/2004/08/events/event}'
    rows = []
    for event in eventlist:
        edict = {}
        for element in event.iterdescendants():
            name = element.tag
            # Comments / processing instructions have a non-string tag;
            # substring tests on it would raise TypeError.
            if not isinstance(name, str):
                continue
            if any(x in name for x in ('TimeCreated', 'Execution', 'Security')):
                # these elements carry their payload in attributes
                edict.update(element.items())
            elif any(x in name for x in ('Provider', 'System', 'Correlation')):
                # filter out empty fields
                continue
            elif 'Data' in name:
                # column name comes from the attribute value, cell from the text
                for _, attr_value in element.items():
                    edict[attr_value] = element.text
            else:
                edict[name.replace(tag, '')] = element.text
        # add raw text event to have ability always access full value of eventlog
        edict['raw'] = etree.tostring(event, pretty_print=True).decode()
        rows.append(edict)

    if not rows:
        return pd.DataFrame()
    # Build the frame once (DataFrame.append was removed in pandas 2.0 and
    # was quadratic); sort columns to keep the layout append(sort=True) gave.
    return pd.DataFrame(rows).sort_index(axis=1)
def main(FILENAME):
    """Load the XML file, convert its events to a DataFrame and print a preview.

    Args:
        FILENAME: Path to the XML file, passed on to ``read_xml``.
    """
    root = read_xml(FILENAME)
    print('[!] Found XML file! Preprocessing...')

    # every tag carries this namespace prefix, so match on the qualified name
    namespace = '{http://schemas.microsoft.com/win/2004/08/events/event}'
    events = list(root.iter(namespace + 'Event'))

    # transform to dataframe
    logdf = events_to_df(events)
    print('[+] File parsed!')
    print(logdf.head())
if __name__ == '__main__':
    # Script entry point: expects the XML file path as the first argument.
    # Exit with a usage hint rather than an IndexError traceback when the
    # argument is missing.
    if len(sys.argv) < 2:
        sys.exit('Usage: {} <events.xml>'.format(sys.argv[0]))
    main(sys.argv[1])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Could you please share an example log file? I am having trouble running this code on my sample data: it generates only one row in the DataFrame. I am trying to understand why it is not working for all of the data.