JodiTheTigger/gdbBacktraceToJson.py

## gdbBacktraceToJson.py
#!/usr/bin/python2
#
# gdbBacktraceToJson.py.  Parses gdb backtraces into json.
# Copyright (C) 2014 Richard Maxwell <jodi.the.tigger@gmail.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>
#
# Description:
# gdbBacktraceToJson.py parses the output of the command "thread apply all bt full" and turns it into a json array.
# Useful for automating the analysis of coredump files generated when an application crashes. Use the tokenised json
# to search a database of crashes for similar crashes or make a nice web interface for viewing back traces. You could
# make a backtrace diff tool. It's much easier to use and write tools using a standard data format.

# Usage:
# python2 gdbBacktraceToJson.py <backtrace file>
# It will parse the file and output the backtrace as a json array to std out.
# you can get the backtrace file from a core dump file by running gdb in the following way:
# gdb [app with debug symbols] [core file] --eval-command "thread apply all bt full" --eval-command "quit" > mybacktrace.txt

import sys
import os
import re
import json
import string
import datetime

def parseLocals(lines):
    """Parse the local-variable lines of one gdb "bt full" frame into a dict.

    Each line of the form ``name = value`` becomes ``result[name] = value``
    (surrounding whitespace and a trailing comma are stripped).  A variable
    whose value opens a brace group that is closed on a later line, e.g.::

        s = {
          a = 1,
          b = 2
        }

    is stored as a nested dict built by parsing the enclosed lines
    recursively.  A brace group that opens and closes on the same line
    (``p = {a = 1, b = 2}``) is kept verbatim as a string value.

    Lines that are not assignments (for example "No locals.") are ignored.
    Braces are counted textually, so a brace inside a quoted string is not
    told apart from a structural one; if no closing line can be found the
    value is kept as a plain string and an error is reported.

    lines -- list of strings, one per output line.

    Returns a dict mapping variable names to strings or nested dicts.
    Errors are written to stderr so that they cannot end up inside the JSON
    document the script prints on stdout.
    """
    result = {}

    index = 0
    while index < len(lines):
        line = lines[index]

        # The name is matched non-greedily so that only the first " = " on
        # the line separates it from the value; values such as
        # "{a = 1, b = 2}" contain further " = " sequences.
        simpleVars = re.match(r'\s*(.*?)\s=\s(.*)\s*', line)

        if '{' in line and not simpleVars:
            # A brace line without a "name = " prefix (for example the
            # "}, {" separator between array elements) cannot be attributed
            # to a variable; stop parsing this nesting level.
            return result

        if simpleVars:
            name = simpleVars.group(1)
            value = simpleVars.group(2).strip().strip(',')

            # Net number of brace groups this line leaves open.
            depth = line.count('{') - line.count('}')
            closingIndex = -1

            if depth > 0:
                # find the line on which the group opened here is closed.
                scanIndex = index + 1
                while scanIndex < len(lines):
                    depth += lines[scanIndex].count('{') - lines[scanIndex].count('}')
                    if depth < 1:
                        closingIndex = scanIndex
                        break
                    scanIndex += 1

                if closingIndex == -1:
                    sys.stderr.write("*ERROR* Coreline: parseLocals: Can't find closing brace.\n")

            if closingIndex == -1:
                # plain value, single-line brace group, or unterminated group.
                result[name] = value
            else:
                # deal with nested braces using recursion.
                result[name] = parseLocals(lines[index + 1:closingIndex])
                index = closingIndex

        index = index + 1

    return result

def coreLinesToObject(coreLine):
    """Convert the text of one backtrace frame into a dict.

    coreLine -- the frame header line ("#N ...") followed by any further
                lines belonging to the frame, joined with newlines.

    The header format is:
        #frame [0x12345678 in] <function> (<arguments>) [from|at] [library|file]

    Returns a dict with the keys
        'frame'     -- frame number, as a string
        'address'   -- hexadecimal address string, or None when absent
        'function'  -- function name
        'source'    -- text after "at"/"from", or None when absent
        'arguments' -- dict of name -> value parsed from the argument list
        'locals'    -- result of parseLocals() on the lines following the
                       header; only present when coreLine contains at least
                       two line breaks
    An empty dict is returned (and an error written to stderr) when the
    header cannot be parsed.  stderr is used so that the message cannot end
    up inside the JSON document the script prints on stdout.
    """
    coreObject = {}

    # (?:....) means don't capture that group.
    # The argument list is matched non-greedily: it may span wrapped lines,
    # but it must stop at the first ")" that is followed by " at ", " from "
    # or an end of line, otherwise a later line ending in ")" (for example a
    # flag enum printed as "(A | B)") would be taken as part of the list.
    matchResult = re.match(
        r'\#(\d+)\s+(?:(0x(?:[0-9A-F])*) in |)(\S+) (\((?:.|\n|\r)*?\))(?: (?:at|from) (.*)|$)',
        coreLine, re.I | re.M)

    if not matchResult:
        # really should complain.
        sys.stderr.write("*ERROR* Coreline mismatch:  %s\n" % (coreLine,))
        return coreObject

    # matches are:
    # 1: frame
    # 2: address or no match
    # 3: function name
    # 4: argument list (including braces)
    # 5: source / library
    coreObject['frame'] = matchResult.group(1)
    coreObject['address'] = matchResult.group(2)
    coreObject['function'] = matchResult.group(3)
    coreObject['source'] = matchResult.group(5)
    coreObject['arguments'] = {}

    # right, parse in the argument list
    # arguments can have the @ symbol in them 'this@entry=0x12345678'
    # values may be negative ("fd=-1") or have a decimal part ("ratio=1.5")
    argSearch = re.findall(r'([\w@]+)=(-?\w+(?:\.\w+)?|<optimized out>)',
                           matchResult.group(4), re.I | re.M)
    for (argKey, argValue) in argSearch:
        coreObject['arguments'][argKey] = argValue

    # bt full stuff will come here. Stack variables and source files too.
    arguments = coreLine.split('\n')[1:]
    if len(arguments) > 1:
        # Number of continuation lines the header match itself consumed
        # (a wrapped argument list); everything after them follows the header.
        localsStart = coreLine.count('\n', 0, matchResult.end())

        if coreObject['source'] is None and localsStart < len(arguments):
            # The "at file:line" part may have been wrapped onto its own line.
            # The lookahead keeps a local named "at" or "from" ("from = 1")
            # from being mistaken for it.
            sourceMatch = re.match(r'\s+(?:at|from) (?!= )(.*)', arguments[localsStart])

            if sourceMatch:
                coreObject['source'] = sourceMatch.group(1).rstrip()
                localsStart += 1

        # parse the locals: only the header/source lines are skipped, so the
        # first local is kept when the source was already on the header line.
        coreObject['locals'] = parseLocals(arguments[localsStart:])

    return coreObject

def _flushFrame(thread, frameText):
    """Parse the accumulated text of one frame and append it to thread's stack trace.

    Does nothing when frameText is empty.
    """
    if frameText:
        thread['stackTrace'].append(coreLinesToObject(frameText))


def textToList(filePath, fileText):
    """Turn the lines of a gdb "thread apply all bt full" dump into a dict.

    filePath -- path of the backtrace file; stored verbatim as 'filePath' and,
                reduced to its base name without the last extension, as
                'fileName'.
    fileText -- iterable of lines, each including its line terminator (as
                returned by file.readlines()).

    Returns a dict with the keys
        'filePath', 'fileName'
        'jsonCreationTimeUtc' -- current UTC time, "%Y-%m-%d %H:%M:%S"
        'commandLine' -- text quoted in the first "Core was generated by" line
                         (only present when such a line exists)
        'coreReason'  -- the first "Program terminated with" line
                         (only present when such a line exists)
        'threads'     -- list with one dict per "Thread" line, each holding
                         'stackTrace' (list of coreLinesToObject() results),
                         'threadId' (the LWP number) and 'threadNumber';
                         when the thread line cannot be parsed 'threadId' is
                         the raw line text and 'threadNumber' is absent.
    Errors are written to stderr so that they cannot end up inside the JSON
    document the script prints on stdout.
    """
    core = {}
    core['filePath'] = filePath
    core['fileName'] = os.path.splitext(os.path.basename(filePath))[0]
    core['threads'] = []
    core['jsonCreationTimeUtc'] = datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")

    thread = None          # dict of the thread currently being filled in
    multipleLines = ""     # text of the frame currently being accumulated

    for line in fileText:
        # Search for core dump global meta
        # (command line and termination reason)
        # Core was generated by `.....'.
        # Program terminated with ...
        # ---------------------------------------
        if 'commandLine' not in core and line.startswith("Core was generated by"):
            commandMatch = re.match(r"Core was generated by `(.*)'\.", line)
            if commandMatch:
                core['commandLine'] = commandMatch.group(1)
            else:
                # Unexpected quoting: fall back to the fixed offsets that
                # keep what is between the quotes on a standard line.
                core['commandLine'] = line[23:-3]

        if 'coreReason' not in core and line.startswith("Program terminated with"):
            core['coreReason'] = line.rstrip('\r\n')

        # Parse core dumps per thread.
        # ---------------------------------------
        if line.startswith("Thread"):
            # right, make sure we purge the last frame of the previous thread.
            _flushFrame(thread, multipleLines)
            multipleLines = ""

            thread = {'stackTrace': []}
            core['threads'].append(thread)

            # drop the line ending and the trailing colon
            threadId = line.rstrip('\r\n').rstrip(':')

            # Accepts both "Thread 1 (LWP 42)" and
            # "Thread 1 (Thread 0x7f... (LWP 42))".
            threadResult = re.match(
                r'Thread\s+(\d+)\s+\((?:Thread\s+0x[0-9a-f]+\s+\()?LWP\s+(\d+)\)',
                threadId, re.I | re.M)
            if threadResult:
                thread['threadId'] = threadResult.group(2)
                thread['threadNumber'] = threadResult.group(1)
            else:
                sys.stderr.write("*ERROR* ThreadId mismatch:  %s\n" % (threadId,))
                thread['threadId'] = threadId

        elif thread is not None:
            if line.startswith('#'):
                # a new frame starts: the previous one is complete.
                _flushFrame(thread, multipleLines)
                multipleLines = line
            elif len(line) == 0:
                _flushFrame(thread, multipleLines)
                multipleLines = ""
            elif multipleLines:
                # continuation of the current frame (wrapped header, locals)
                multipleLines += line

    # The input ends without a terminator for the last frame, so it has to be
    # flushed here or the final frame of the final thread would be lost.
    _flushFrame(thread, multipleLines)

    return core


# the filename is the name of the textual output of gdb's "thread apply all bt"
def process(argList):
    """Read a gdb backtrace file and print it as JSON on stdout.

    argList -- command-line style list; argList[1] is the path of a file
               holding the textual output of gdb's "thread apply all bt full".

    Exits with a usage message on stderr when no file name is given.
    """
    if len(argList) < 2:
        sys.exit("Usage: python2 gdbBacktraceToJson.py <backtrace file>")

    fileName = argList[1]

    # "with" closes the file even when reading fails.
    with open(fileName, 'r') as coreDump:
        lines = coreDump.readlines()

    coreDumpObject = textToList(fileName, lines)

    # right, dump the json
    print(json.dumps(coreDumpObject, sort_keys=True, indent=4))

# decode the first passed filename
# Only run when executed as a script, so that importing this module (for
# example to reuse the parsing functions) does not try to read sys.argv.
if __name__ == "__main__":
    process(sys.argv)
	#!/usr/bin/python2
	#
	# gdbBacktraceToJson.py. Parses gdb backtraces into json.
	# Copyright (C) 2014 Richard Maxwell <jodi.the.tigger@gmail.com>
	#
	# This program is free software: you can redistribute it and/or modify
	# it under the terms of the GNU General Public License as published by
	# the Free Software Foundation, either version 3 of the License, or
	# (at your option) any later version.
	#
	# This program is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	# GNU General Public License for more details.
	#
	# You should have received a copy of the GNU General Public License
	# along with this program. If not, see <http://www.gnu.org/licenses/>
	#
	# Description:
	# gdbBacktraceToJson.py parses the output of the command "thread apply all bt full" and turns it into a json array.
	# Useful for automating the analysis of coredump files generated when an application crashes. Use the tokenised json
	# to search a database of crashes for similar crashes or make a nice web interface for viewing back traces. You could
	# make a backtrace diff tool. It's much easier to use and write tools using a standard data format.

	# Usage:
	# python2 gdbBacktraceToJson.py <backtrace file>
	# It will parse the file and output the backtrace as a json array to std out.
	# you can get the backtrace file from a core dump file by running gdb in the following way:
	# gdb [app with debug symbols] [core file] --eval-command "thread apply all bt full" --eval-command "quit" > mybacktrace.txt

	import sys
	import os
	import re
	import json
	import string
	import datetime

	def parseLocals(lines):
	    """Parse the local-variable lines of one gdb "bt full" frame into a dict.

	    Each line of the form ``name = value`` becomes ``result[name] = value``
	    (surrounding whitespace and a trailing comma are stripped).  A variable
	    whose value opens a brace group that is closed on a later line, e.g.::

	        s = {
	          a = 1,
	          b = 2
	        }

	    is stored as a nested dict built by parsing the enclosed lines
	    recursively.  A brace group that opens and closes on the same line
	    (``p = {a = 1, b = 2}``) is kept verbatim as a string value.

	    Lines that are not assignments (for example "No locals.") are ignored.
	    Braces are counted textually, so a brace inside a quoted string is not
	    told apart from a structural one; if no closing line can be found the
	    value is kept as a plain string and an error is reported.

	    lines -- list of strings, one per output line.

	    Returns a dict mapping variable names to strings or nested dicts.
	    Errors are written to stderr so that they cannot end up inside the JSON
	    document the script prints on stdout.
	    """
	    result = {}

	    index = 0
	    while index < len(lines):
	        line = lines[index]

	        # The name is matched non-greedily so that only the first " = " on
	        # the line separates it from the value; values such as
	        # "{a = 1, b = 2}" contain further " = " sequences.
	        simpleVars = re.match(r'\s*(.*?)\s=\s(.*)\s*', line)

	        if '{' in line and not simpleVars:
	            # A brace line without a "name = " prefix (for example the
	            # "}, {" separator between array elements) cannot be attributed
	            # to a variable; stop parsing this nesting level.
	            return result

	        if simpleVars:
	            name = simpleVars.group(1)
	            value = simpleVars.group(2).strip().strip(',')

	            # Net number of brace groups this line leaves open.
	            depth = line.count('{') - line.count('}')
	            closingIndex = -1

	            if depth > 0:
	                # find the line on which the group opened here is closed.
	                scanIndex = index + 1
	                while scanIndex < len(lines):
	                    depth += lines[scanIndex].count('{') - lines[scanIndex].count('}')
	                    if depth < 1:
	                        closingIndex = scanIndex
	                        break
	                    scanIndex += 1

	                if closingIndex == -1:
	                    sys.stderr.write("*ERROR* Coreline: parseLocals: Can't find closing brace.\n")

	            if closingIndex == -1:
	                # plain value, single-line brace group, or unterminated group.
	                result[name] = value
	            else:
	                # deal with nested braces using recursion.
	                result[name] = parseLocals(lines[index + 1:closingIndex])
	                index = closingIndex

	        index = index + 1

	    return result

	def coreLinesToObject(coreLine):
	    """Convert the text of one backtrace frame into a dict.

	    coreLine -- the frame header line ("#N ...") followed by any further
	                lines belonging to the frame, joined with newlines.

	    The header format is:
	        #frame [0x12345678 in] <function> (<arguments>) [from|at] [library|file]

	    Returns a dict with the keys
	        'frame'     -- frame number, as a string
	        'address'   -- hexadecimal address string, or None when absent
	        'function'  -- function name
	        'source'    -- text after "at"/"from", or None when absent
	        'arguments' -- dict of name -> value parsed from the argument list
	        'locals'    -- result of parseLocals() on the lines following the
	                       header; only present when coreLine contains at least
	                       two line breaks
	    An empty dict is returned (and an error written to stderr) when the
	    header cannot be parsed.  stderr is used so that the message cannot end
	    up inside the JSON document the script prints on stdout.
	    """
	    coreObject = {}

	    # (?:....) means don't capture that group.
	    # The argument list is matched non-greedily: it may span wrapped lines,
	    # but it must stop at the first ")" that is followed by " at ", " from "
	    # or an end of line, otherwise a later line ending in ")" (for example a
	    # flag enum printed as "(A | B)") would be taken as part of the list.
	    matchResult = re.match(
	        r'\#(\d+)\s+(?:(0x(?:[0-9A-F])*) in |)(\S+) (\((?:.|\n|\r)*?\))(?: (?:at|from) (.*)|$)',
	        coreLine, re.I | re.M)

	    if not matchResult:
	        # really should complain.
	        sys.stderr.write("*ERROR* Coreline mismatch:  %s\n" % (coreLine,))
	        return coreObject

	    # matches are:
	    # 1: frame
	    # 2: address or no match
	    # 3: function name
	    # 4: argument list (including braces)
	    # 5: source / library
	    coreObject['frame'] = matchResult.group(1)
	    coreObject['address'] = matchResult.group(2)
	    coreObject['function'] = matchResult.group(3)
	    coreObject['source'] = matchResult.group(5)
	    coreObject['arguments'] = {}

	    # right, parse in the argument list
	    # arguments can have the @ symbol in them 'this@entry=0x12345678'
	    # values may be negative ("fd=-1") or have a decimal part ("ratio=1.5")
	    argSearch = re.findall(r'([\w@]+)=(-?\w+(?:\.\w+)?|<optimized out>)',
	                           matchResult.group(4), re.I | re.M)
	    for (argKey, argValue) in argSearch:
	        coreObject['arguments'][argKey] = argValue

	    # bt full stuff will come here. Stack variables and source files too.
	    arguments = coreLine.split('\n')[1:]
	    if len(arguments) > 1:
	        # Number of continuation lines the header match itself consumed
	        # (a wrapped argument list); everything after them follows the header.
	        localsStart = coreLine.count('\n', 0, matchResult.end())

	        if coreObject['source'] is None and localsStart < len(arguments):
	            # The "at file:line" part may have been wrapped onto its own line.
	            # The lookahead keeps a local named "at" or "from" ("from = 1")
	            # from being mistaken for it.
	            sourceMatch = re.match(r'\s+(?:at|from) (?!= )(.*)', arguments[localsStart])

	            if sourceMatch:
	                coreObject['source'] = sourceMatch.group(1).rstrip()
	                localsStart += 1

	        # parse the locals: only the header/source lines are skipped, so the
	        # first local is kept when the source was already on the header line.
	        coreObject['locals'] = parseLocals(arguments[localsStart:])

	    return coreObject

	def _flushFrame(thread, frameText):
	    """Parse the accumulated text of one frame and append it to thread's stack trace.

	    Does nothing when frameText is empty.
	    """
	    if frameText:
	        thread['stackTrace'].append(coreLinesToObject(frameText))


	def textToList(filePath, fileText):
	    """Turn the lines of a gdb "thread apply all bt full" dump into a dict.

	    filePath -- path of the backtrace file; stored verbatim as 'filePath' and,
	                reduced to its base name without the last extension, as
	                'fileName'.
	    fileText -- iterable of lines, each including its line terminator (as
	                returned by file.readlines()).

	    Returns a dict with the keys
	        'filePath', 'fileName'
	        'jsonCreationTimeUtc' -- current UTC time, "%Y-%m-%d %H:%M:%S"
	        'commandLine' -- text quoted in the first "Core was generated by" line
	                         (only present when such a line exists)
	        'coreReason'  -- the first "Program terminated with" line
	                         (only present when such a line exists)
	        'threads'     -- list with one dict per "Thread" line, each holding
	                         'stackTrace' (list of coreLinesToObject() results),
	                         'threadId' (the LWP number) and 'threadNumber';
	                         when the thread line cannot be parsed 'threadId' is
	                         the raw line text and 'threadNumber' is absent.
	    Errors are written to stderr so that they cannot end up inside the JSON
	    document the script prints on stdout.
	    """
	    core = {}
	    core['filePath'] = filePath
	    core['fileName'] = os.path.splitext(os.path.basename(filePath))[0]
	    core['threads'] = []
	    core['jsonCreationTimeUtc'] = datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")

	    thread = None          # dict of the thread currently being filled in
	    multipleLines = ""     # text of the frame currently being accumulated

	    for line in fileText:
	        # Search for core dump global meta
	        # (command line and termination reason)
	        # Core was generated by `.....'.
	        # Program terminated with ...
	        # ---------------------------------------
	        if 'commandLine' not in core and line.startswith("Core was generated by"):
	            commandMatch = re.match(r"Core was generated by `(.*)'\.", line)
	            if commandMatch:
	                core['commandLine'] = commandMatch.group(1)
	            else:
	                # Unexpected quoting: fall back to the fixed offsets that
	                # keep what is between the quotes on a standard line.
	                core['commandLine'] = line[23:-3]

	        if 'coreReason' not in core and line.startswith("Program terminated with"):
	            core['coreReason'] = line.rstrip('\r\n')

	        # Parse core dumps per thread.
	        # ---------------------------------------
	        if line.startswith("Thread"):
	            # right, make sure we purge the last frame of the previous thread.
	            _flushFrame(thread, multipleLines)
	            multipleLines = ""

	            thread = {'stackTrace': []}
	            core['threads'].append(thread)

	            # drop the line ending and the trailing colon
	            threadId = line.rstrip('\r\n').rstrip(':')

	            # Accepts both "Thread 1 (LWP 42)" and
	            # "Thread 1 (Thread 0x7f... (LWP 42))".
	            threadResult = re.match(
	                r'Thread\s+(\d+)\s+\((?:Thread\s+0x[0-9a-f]+\s+\()?LWP\s+(\d+)\)',
	                threadId, re.I | re.M)
	            if threadResult:
	                thread['threadId'] = threadResult.group(2)
	                thread['threadNumber'] = threadResult.group(1)
	            else:
	                sys.stderr.write("*ERROR* ThreadId mismatch:  %s\n" % (threadId,))
	                thread['threadId'] = threadId

	        elif thread is not None:
	            if line.startswith('#'):
	                # a new frame starts: the previous one is complete.
	                _flushFrame(thread, multipleLines)
	                multipleLines = line
	            elif len(line) == 0:
	                _flushFrame(thread, multipleLines)
	                multipleLines = ""
	            elif multipleLines:
	                # continuation of the current frame (wrapped header, locals)
	                multipleLines += line

	    # The input ends without a terminator for the last frame, so it has to be
	    # flushed here or the final frame of the final thread would be lost.
	    _flushFrame(thread, multipleLines)

	    return core


	# the filename is the name of the textual output of gdb's "thread apply all bt"
	def process(argList):
	    """Read a gdb backtrace file and print it as JSON on stdout.

	    argList -- command-line style list; argList[1] is the path of a file
	               holding the textual output of gdb's "thread apply all bt full".

	    Exits with a usage message on stderr when no file name is given.
	    """
	    if len(argList) < 2:
	        sys.exit("Usage: python2 gdbBacktraceToJson.py <backtrace file>")

	    fileName = argList[1]

	    # "with" closes the file even when reading fails.
	    with open(fileName, 'r') as coreDump:
	        lines = coreDump.readlines()

	    coreDumpObject = textToList(fileName, lines)

	    # right, dump the json
	    print(json.dumps(coreDumpObject, sort_keys=True, indent=4))

	# decode the first passed filename
	# Only run when executed as a script, so that importing this module (for
	# example to reuse the parsing functions) does not try to read sys.argv.
	if __name__ == "__main__":
	    process(sys.argv)