Tatsh/parse-xchat-logs.py

## parse-xchat-logs.py
#!/usr/bin/env python

import argparse
import datetime
import glob
import json as JSON
import os
import re
import sys
from dateutil import parser as date_parser, tz

# Usage: ./parse-xchat-logs.py -t '-0500' DIRECTORY_TO_LOG_FILES
# X-Chat 2/HexChat log parser
# From Preferences > Logging (without quotes):
#   Log filename: '%n/%Y%m%d-%c.log'
#   Log timestamp format: '%b %d %H:%M:%S '

# This also supports the file name format: '%c-%y%m%d.log' (2-digit year suffix (after '20'), example: ##asm-071203.log)

# Log file name pattern with two alternatives:
#   '20YYMM..-#channel...' -> groups year1 (two-digit year) and channel1
#   '#channel-YY...'       -> groups channel2 and year2 (two-digit year)
FILENAME_REGEX = r'^(?:20(?P<year1>\d{2})\d{2}[^\-]+\-(?P<channel1>\#[^\.]+))|(?:(?P<channel2>\#[^\-]+)\-(?P<year2>\d{2}))'
# A line spoken by a user: 'Mon DD HH:MM:SS', then '<nick>', then the message text.
USER_MESSAGE_REGEX = r'(?P<date>^[A-Z][a-z]{2}\s+\d+\s+(?:\d{2}\:?){3})\s+<(?P<nick>[^>]+)>\s+(?P<message>.*)'
# A '*' line: timestamp, a literal '*' followed by a tab, then the first word
# (captured as `message`) and the remainder of the line (unnamed group).
CHANNEL_MESSAGE_REGEX = r'(?P<date>^[A-Z][a-z]{2}\s+\d+\s+(?:\d{2}\:?){3})\s+\*\t(?P<message>[^\ ]+)\s(.*)'

# Attach encode/decode aliases to the json module object itself, so
# JSON.encode(...) is json.dumps(...) and JSON.decode(...) is json.loads(...).
JSON.encode = JSON.dumps
JSON.decode = JSON.loads


def datetime_to_utc(date):
    """Convert *date* to UTC and strip its tzinfo.

    A datetime without tzinfo is handed back unchanged; a timezone-aware one
    is shifted to UTC and returned as a naive datetime.
    """
    if not date.tzinfo:
        return date

    utc_zone = tz.gettz('UTC')
    shifted = date.astimezone(utc_zone)
    return shifted.replace(tzinfo=None)

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Parser for X-Chat logs in default file name and log format')
    parser.add_argument('path', help='Path to directory of logs')
    parser.add_argument('-n', '--network-name', help='Network name')
    parser.add_argument('-t', '--timezone', help='Timezone', default='-0500')
    args = parser.parse_args()

    path = os.path.abspath(args.path)
    network_name = os.path.basename(path)

    if args.network_name:
        network_name = args.network_name

    log_files = glob.glob(path + '/*.log')
    txt_files = glob.glob(path + '/*.txt')

    log_files.sort()
    txt_files.sort()
    log_files.extend(txt_files)

    matched_logs = 0

    for filename in log_files:
        base_filename = os.path.basename(filename)
        collection_name = os.path.basename(os.path.abspath(filename + '/../..'))
        match = re.match(FILENAME_REGEX, base_filename)

        if match is None:  # not a channel
            continue

        matched_logs += 1

        match = match.groupdict()

        if match['channel2'] is not None:
            channel = match['channel2']
            year = int('20' + match['year2'])
        else:
            channel = match['channel1']
            year = int('20' + match['year1'])

        with open(filename) as f:
            line_no = 1
            log_filename = collection_name + '/' + network_name + '/' + base_filename

            for line in f.readlines():
                line = line.strip()
                message_id = '%s-%s-%d' % (network_name, base_filename.replace('.log', '').replace('.txt', ''), line_no)

                match = re.match(USER_MESSAGE_REGEX, line)

                if match:
                    dict = match.groupdict()
                    urls_found = []
                    datetime = datetime_to_utc(date_parser.parse('%s %s %s' % (dict['date'], args.timezone, year)))

                    for word in dict['message'].split():
                        if 'http://' in word or 'https://' in word:
                            urls_found.append(word)

                    data = {
                        'irc_message': {
                            '_id': message_id,
                            'channel': channel,
                            'date_created': datetime.isoformat() + 'Z',
                            'is_slash_me': False,
                            'nick': dict['nick'],
                            'message': dict['message'],
                            'line_number': line_no,
                            'network': network_name,
                            'urls': urls_found,
                            'collection': collection_name,
                            'log_file': log_filename,
                        },
                    }
                    js = JSON.encode(data, sort_keys=True, indent=4, separators=(',', ': '))
                    print(js)

                line_no += 1

    if matched_logs == 0:
        print('No logs matched in %s' % (path))
        sys.exit(1)
	#!/usr/bin/env python

	import argparse
	import datetime
	import glob
	import json as JSON
	import os
	import re
	import sys
	from dateutil import parser as date_parser, tz

	# Usage: ./parse-xchat-logs.py -t '-0500' DIRECTORY_TO_LOG_FILES
	# X-Chat 2/HexChat log parser
	# From Preferences > Logging (without quotes):
	# Log filename: '%n/%Y%m%d-%c.log'
	# Log timestamp format: '%b %d %H:%M:%S '

	# This also supports the file name format: '%c-%y%m%d.log' (2-digit year suffix (after '20'), example: ##asm-071203.log)

	# Log file name pattern with two alternatives:
	#   '20YYMM..-#channel...' -> groups year1 (two-digit year) and channel1
	#   '#channel-YY...'       -> groups channel2 and year2 (two-digit year)
	# The two layouts are joined by a regex alternation ('|'); an escaped '\|'
	# would demand a literal pipe and the pattern could never match a file name.
	FILENAME_REGEX = r'^(?:20(?P<year1>\d{2})\d{2}[^\-]+\-(?P<channel1>\#[^\.]+))|(?:(?P<channel2>\#[^\-]+)\-(?P<year2>\d{2}))'
	# A line spoken by a user: 'Mon DD HH:MM:SS', then '<nick>', then the message text.
	USER_MESSAGE_REGEX = r'(?P<date>^[A-Z][a-z]{2}\s+\d+\s+(?:\d{2}\:?){3})\s+<(?P<nick>[^>]+)>\s+(?P<message>.*)'
	# A '*' line: timestamp, a literal '*' followed by a tab, then the first word
	# (captured as `message`) and the remainder of the line (unnamed group).
	CHANNEL_MESSAGE_REGEX = r'(?P<date>^[A-Z][a-z]{2}\s+\d+\s+(?:\d{2}\:?){3})\s+\*\t(?P<message>[^\ ]+)\s(.*)'

	# Attach encode/decode aliases to the json module object itself.
	JSON.encode = JSON.dumps
	JSON.decode = JSON.loads


	def datetime_to_utc(date):
	    """Return *date* in UTC without tzinfo.

	    A datetime that has no tzinfo is returned unchanged; a timezone-aware
	    one is converted to UTC and its tzinfo is then removed.
	    """
	    if not date.tzinfo:
	        return date
	    return date.astimezone(tz.gettz('UTC')).replace(tzinfo=None)

	if __name__ == '__main__':
	parser = argparse.ArgumentParser(description='Parser for X-Chat logs in default file name and log format')
	parser.add_argument('path', help='Path to directory of logs')
	parser.add_argument('-n', '--network-name', help='Network name')
	parser.add_argument('-t', '--timezone', help='Timezone', default='-0500')
	args = parser.parse_args()

	path = os.path.abspath(args.path)
	network_name = os.path.basename(path)

	if args.network_name:
	network_name = args.network_name

	log_files = glob.glob(path + '/*.log')
	txt_files = glob.glob(path + '/*.txt')

	log_files.sort()
	txt_files.sort()
	log_files.extend(txt_files)

	matched_logs = 0

	for filename in log_files:
	base_filename = os.path.basename(filename)
	collection_name = os.path.basename(os.path.abspath(filename + '/../..'))
	match = re.match(FILENAME_REGEX, base_filename)

	if match is None: # not a channel
	continue

	matched_logs += 1

	match = match.groupdict()

	if match['channel2'] is not None:
	channel = match['channel2']
	year = int('20' + match['year2'])
	else:
	channel = match['channel1']
	year = int('20' + match['year1'])

	with open(filename) as f:
	line_no = 1
	log_filename = collection_name + '/' + network_name + '/' + base_filename

	for line in f.readlines():
	line = line.strip()
	message_id = '%s-%s-%d' % (network_name, base_filename.replace('.log', '').replace('.txt', ''), line_no)

	match = re.match(USER_MESSAGE_REGEX, line)

	if match:
	dict = match.groupdict()
	urls_found = []
	datetime = datetime_to_utc(date_parser.parse('%s %s %s' % (dict['date'], args.timezone, year)))

	for word in dict['message'].split():
	if 'http://' in word or 'https://' in word:
	urls_found.append(word)

	data = {
	'irc_message': {
	'_id': message_id,
	'channel': channel,
	'date_created': datetime.isoformat() + 'Z',
	'is_slash_me': False,
	'nick': dict['nick'],
	'message': dict['message'],
	'line_number': line_no,
	'network': network_name,
	'urls': urls_found,
	'collection': collection_name,
	'log_file': log_filename,
	},
	}
	js = JSON.encode(data, sort_keys=True, indent=4, separators=(',', ': '))
	print(js)

	line_no += 1

	if matched_logs == 0:
	print('No logs matched in %s' % (path))
	sys.exit(1)