Skip to content

Instantly share code, notes, and snippets.

@Tatsh
Created June 16, 2013 23:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Tatsh/5793797 to your computer and use it in GitHub Desktop.
Save Tatsh/5793797 to your computer and use it in GitHub Desktop.
Parses X-Chat 2 and HexChat logs and converts to usable JSON.
#!/usr/bin/env python
import argparse
import datetime
import glob
import json as JSON
import os
import re
import sys
from dateutil import parser as date_parser, tz
# Usage: ./parse-xchat-logs.py -t '-0500' DIRECTORY_TO_LOG_FILES
# X-Chat 2/HexChat log parser
# From Preferences > Logging (without quotes):
# Log filename: '%n/%Y%m%d-%c.log'
# Log timestamp format: '%b %d %H:%M:%S '
# This also supports the file name format '%c-%y%m%d.log', where the 2-digit year is taken to be 20xx (example: ##asm-071203.log)
# Matches the base name of a channel log file in either supported layout:
#   1. '20YYMMDD-#channel...' -> groups 'year1' (the two digits after '20')
#      and 'channel1' (from '#' up to the first '.').
#   2. '#channel-YY...'       -> groups 'channel2' (from '#' up to the first
#      '-') and 'year2' (the two digits after that '-').
# Only one pair of groups is populated per match; the other pair is None.
FILENAME_REGEX = r'^(?:20(?P<year1>\d{2})\d{2}[^\-]+\-(?P<channel1>\#[^\.]+))|(?:(?P<channel2>\#[^\-]+)\-(?P<year2>\d{2}))'
# Matches a regular chat line: a '%b %d %H:%M:%S'-style timestamp ('date'),
# the speaker between angle brackets ('nick'), and the rest of the line
# ('message').
USER_MESSAGE_REGEX = r'(?P<date>^[A-Z][a-z]{2}\s+\d+\s+(?:\d{2}\:?){3})\s+<(?P<nick>[^>]+)>\s+(?P<message>.*)'
# Matches a line whose timestamp is followed by '*' and a tab: 'message'
# captures the first space-delimited token after the tab and an unnamed group
# captures the remainder of the line.
CHANNEL_MESSAGE_REGEX = r'(?P<date>^[A-Z][a-z]{2}\s+\d+\s+(?:\d{2}\:?){3})\s+\*\t(?P<message>[^\ ]+)\s(.*)'
# Attach encode/decode aliases to the imported json module object so that
# JSON.encode / JSON.decode can be used in place of dumps / loads.
JSON.encode = JSON.dumps
JSON.decode = JSON.loads
def datetime_to_utc(date):
    """Return *date* expressed in UTC, with its tzinfo removed.

    Args:
        date: A ``datetime.datetime`` instance, aware or naive.

    Returns:
        For an aware datetime, a new naive datetime holding the same instant
        in UTC. A naive datetime (``tzinfo`` unset) is returned unchanged,
        since there is no offset to convert from.
    """
    if not date.tzinfo:
        return date
    # tz.tzutc() is a fixed UTC tzinfo, so nothing has to be resolved by name
    # on each call (this function runs once per parsed log line).
    return date.astimezone(tz.tzutc()).replace(tzinfo=None)
def _list_log_files(path):
    """Return the '*.log' files in *path* followed by its '*.txt' files, each group sorted by name."""
    log_files = sorted(glob.glob(os.path.join(path, '*.log')))
    log_files.extend(sorted(glob.glob(os.path.join(path, '*.txt'))))
    return log_files


def _parse_log_filename(base_filename):
    """Return ``(channel, year)`` taken from a log file name, or None if it is not a channel log."""
    match = re.match(FILENAME_REGEX, base_filename)
    if match is None:
        return None
    groups = match.groupdict()
    # The '#channel-YY...' layout fills channel2/year2; the
    # '20YYMMDD-#channel' layout fills channel1/year1.
    if groups['channel2'] is not None:
        return groups['channel2'], int('20' + groups['year2'])
    return groups['channel1'], int('20' + groups['year1'])


def _print_log_messages(filename, channel, year, network_name, collection_name, timezone):
    """Print one JSON document to stdout for every user message found in *filename*.

    Args:
        filename: Path of the log file to read.
        channel: Channel name parsed from the file name.
        year: Four-digit year parsed from the file name; the log's own
            timestamps carry no year.
        network_name: Network label stored in each record and in its '_id'.
        collection_name: Collection label stored in each record.
        timezone: UTC offset string (e.g. '-0500') the log timestamps are in.
    """
    base_filename = os.path.basename(filename)
    # These depend only on the file, so build them once rather than per line.
    log_filename = collection_name + '/' + network_name + '/' + base_filename
    id_prefix = '%s-%s' % (network_name, base_filename.replace('.log', '').replace('.txt', ''))
    with open(filename) as f:
        # Line numbers count every physical line, including unmatched ones.
        for line_no, line in enumerate(f, 1):
            match = re.match(USER_MESSAGE_REGEX, line.strip())
            if not match:
                continue
            fields = match.groupdict()
            # Append the UTC offset and the year so the timestamp parses to
            # an aware datetime, then normalise it to naive UTC.
            created = datetime_to_utc(
                date_parser.parse('%s %s %s' % (fields['date'], timezone, year)))
            urls_found = [
                word for word in fields['message'].split()
                if 'http://' in word or 'https://' in word
            ]
            data = {
                'irc_message': {
                    '_id': '%s-%d' % (id_prefix, line_no),
                    'channel': channel,
                    'date_created': created.isoformat() + 'Z',
                    'is_slash_me': False,
                    'nick': fields['nick'],
                    'message': fields['message'],
                    'line_number': line_no,
                    'network': network_name,
                    'urls': urls_found,
                    'collection': collection_name,
                    'log_file': log_filename,
                },
            }
            print(JSON.dumps(data, sort_keys=True, indent=4, separators=(',', ': ')))


def main():
    """Parse the command line and convert every channel log in the given directory.

    Exits with status 1 when no file name in the directory looks like a
    channel log.
    """
    parser = argparse.ArgumentParser(description='Parser for X-Chat logs in default file name and log format')
    parser.add_argument('path', help='Path to directory of logs')
    parser.add_argument('-n', '--network-name', help='Network name')
    parser.add_argument('-t', '--timezone', help='Timezone', default='-0500')
    args = parser.parse_args()

    path = os.path.abspath(args.path)
    # The network name defaults to the log directory's own name; the
    # collection is the name of the directory that contains it.
    network_name = args.network_name or os.path.basename(path)
    collection_name = os.path.basename(os.path.dirname(path))

    matched_logs = 0
    for filename in _list_log_files(path):
        parsed = _parse_log_filename(os.path.basename(filename))
        if parsed is None:  # not a channel
            continue
        matched_logs += 1
        channel, year = parsed
        _print_log_messages(filename, channel, year, network_name, collection_name, args.timezone)

    if matched_logs == 0:
        print('No logs matched in %s' % (path))
        sys.exit(1)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment