noahwilliamsson/slack-parse-html-logs.py

## slack-parse-html-logs.py
#!/usr/bin/env python
#
# Parse Slack's HTML formatted audit access logs (https://theteam.slack.com/admin/logs)
# ..because structured audit logs is apparently an enterprise feature
#
#   -- noah@hack.se
#
#
# 1. Open the browser's developer console
# 2. Open https://your-unique-subdomain.slack.com/admin/logs?page=1 and note how many pages are available (e.g. 2175)
# 3. Select "Copy as cURL" on the HTML page
# 4. Run 'for i in $(seq 2175); do sleep 1; curl ...?page=$i ... ; done' to download HTML logs
# 5. Parse the data with the below command
#
# Usage:
#   ./slack-parse-html-logs.py [-json] <1.html> [2.html, ..]
#
# Examples:
#   ls -1 *.html |sort -n| xargs python slack-parse-html-logs.py > log.txt
#   ls -1 *.html |sort -n| xargs python slack-parse-html-logs.py -json > log.json
#
from __future__ import print_function
import sys
import re
import json
from dateutil.parser import parse
from xml.sax.saxutils import unescape


def parse_entry(e, day):
    fail = None
    obj = {
        'name': None,
        'user': None,
        'ts': parse(day),
        'ip': None,
        'netid': None,
        'ua': None,
        'client': None,
    }
    m = re.findall('class="bold">(.*?)<', e)
    if m:
        obj['name'] = m[0]
    m = re.findall('href="/team/(U\w+)"', e)
    if m:
        obj['user'] = m[0]

    m = re.findall('<br>([0-9:APM ]+)', e)
    if m:
        # This will be the browser's local time
        obj['ts'] = str(parse(' '.join([day, m[0]])))
    else:
        fail = 'timestamp'

    m = re.findall('IP:? ([0-9a-fA-F:.]+)\s*(\((.*?)\))?\.(\s+There.*)?\s*<', e)
    # [('30.0.0.30', '(United States; DoD)', 'United States; DoD', '  There was one additional login from this device, at 7:46 PM. ')]
    if m:
        obj['ip'] = m[0][0]
        obj['netid'] = m[0][2].strip() if m[0][2] else None
    else:
        fail = 'ip address'

    m = re.findall('<abbr title="([^"]*)".*?>.*?([\w\s()]+)?</abbr>(<a href="([^"]+)">([^<]+))?', e)
    #[('com.tinyspeck.chatlyio/19.3.2 (iPhone; iOS 11.4.1; Scale/2.00)', ' iOS App (iPhone)', '', '', '')]
    #[('node-fetch/1.0 (+https://github.com/bitinn/node-fetch)', '', '<a href="/apps/A2RPP3NFR">Jira Cloud', '/apps/A2RPP3NFR', 'Jira Cloud')]
    if m:
        obj['client'] = m[0][1].strip() if m[0][1] else m[0][4]
        obj['ua'] = unescape(m[0][0]) if m[0][0] else obj['client']
    else:
        fail = 'useragent'

    if fail:
        print(json.dumps(obj), file=sys.stderr)
        print('PARSE ERROR ({})'.format(fail), e, file=sys.stderr)
    return obj

def process_file(filename):
    with open(filename, 'r') as f:
        data = f.read()

    sections = re.findall('<section class="tab_pane selected" data-tab="logs">(.*?)</section>', data, re.MULTILINE|re.DOTALL)
    divs = re.findall('<div[^>]+>(.*?)</div>', sections[0])
    for data in divs:
        m = re.findall('<p class="bold no_bottom_margin">(.*?)</p>', data)
        if m:
            day = m[0]
            continue

        entries = re.findall('<p style="padding-left: 2.25rem;" class="position_relative">(.*?)</p>', data)
        for e in entries:
            obj = parse_entry(e, day)
            if json_output:
                print(json.dumps(obj))
            else:
                print('{}\t{}\t{}\t{} ({})\t{}'.format(obj['ts'], obj['client'], obj['name'], obj['ip'], obj['netid'], obj['ua']))


if __name__ == '__main__':
    json_output = False
    files = sys.argv[1:]
    if files and '-json' in files[0]:
        json_output = True
        files = files[1:]

    for fn in files:
        process_file(fn)
	#!/usr/bin/env python
	#
	# Parse Slack's HTML formatted audit access logs (https://theteam.slack.com/admin/logs)
	# ..because structured audit logs is apparently an enterprise feature
	#
	# -- noah@hack.se
	#
	#
	# 1. Open the browser's developer console
	# 2. Open https://your-unique-subdomain.slack.com/admin/logs?page=1 and note how many pages are available (e.g. 2175)
	# 3. Select "Copy as cURL" on the HTML page
	# 4. Run 'for i in $(seq 2175); do sleep 1; curl ...?page=$i ... ; done' to download HTML logs
	# 5. Parse the data with the below command
	#
	# Usage:
	# ./slack-parse-html-logs.py [-json] <1.html> [2.html, ..]
	#
	# Examples:
	# ls -1 *.html \|sort -n\| xargs python slack-parse-html-logs.py > log.txt
	# ls -1 *.html \|sort -n\| xargs python slack-parse-html-logs.py -json > log.json
	#
	from __future__ import print_function
	import sys
	import re
	import json
	from dateutil.parser import parse
	from xml.sax.saxutils import unescape


	def parse_entry(e, day):
	fail = None
	obj = {
	'name': None,
	'user': None,
	'ts': parse(day),
	'ip': None,
	'netid': None,
	'ua': None,
	'client': None,
	}
	m = re.findall('class="bold">(.*?)<', e)
	if m:
	obj['name'] = m[0]
	m = re.findall('href="/team/(U\w+)"', e)
	if m:
	obj['user'] = m[0]

	m = re.findall('<br>([0-9:APM ]+)', e)
	if m:
	# This will be the browser's local time
	obj['ts'] = str(parse(' '.join([day, m[0]])))
	else:
	fail = 'timestamp'

	m = re.findall('IP:? ([0-9a-fA-F:.]+)\s(\((.?)\))?\.(\s+There.)?\s<', e)
	# [('30.0.0.30', '(United States; DoD)', 'United States; DoD', ' There was one additional login from this device, at 7:46 PM. ')]
	if m:
	obj['ip'] = m[0][0]
	obj['netid'] = m[0][2].strip() if m[0][2] else None
	else:
	fail = 'ip address'

	m = re.findall('<abbr title="([^"])".?>.*?([\w\s()]+)?</abbr>(<a href="([^"]+)">([^<]+))?', e)
	#[('com.tinyspeck.chatlyio/19.3.2 (iPhone; iOS 11.4.1; Scale/2.00)', ' iOS App (iPhone)', '', '', '')]
	#[('node-fetch/1.0 (+https://github.com/bitinn/node-fetch)', '', '<a href="/apps/A2RPP3NFR">Jira Cloud', '/apps/A2RPP3NFR', 'Jira Cloud')]
	if m:
	obj['client'] = m[0][1].strip() if m[0][1] else m[0][4]
	obj['ua'] = unescape(m[0][0]) if m[0][0] else obj['client']
	else:
	fail = 'useragent'

	if fail:
	print(json.dumps(obj), file=sys.stderr)
	print('PARSE ERROR ({})'.format(fail), e, file=sys.stderr)
	return obj

	def process_file(filename):
	with open(filename, 'r') as f:
	data = f.read()

	sections = re.findall('<section class="tab_pane selected" data-tab="logs">(.*?)</section>', data, re.MULTILINE\|re.DOTALL)
	divs = re.findall('<div[^>]+>(.*?)</div>', sections[0])
	for data in divs:
	m = re.findall('<p class="bold no_bottom_margin">(.*?)</p>', data)
	if m:
	day = m[0]
	continue

	entries = re.findall('<p style="padding-left: 2.25rem;" class="position_relative">(.*?)</p>', data)
	for e in entries:
	obj = parse_entry(e, day)
	if json_output:
	print(json.dumps(obj))
	else:
	print('{}\t{}\t{}\t{} ({})\t{}'.format(obj['ts'], obj['client'], obj['name'], obj['ip'], obj['netid'], obj['ua']))


	if __name__ == '__main__':
	json_output = False
	files = sys.argv[1:]
	if files and '-json' in files[0]:
	json_output = True
	files = files[1:]

	for fn in files:
	process_file(fn)