marpie/parseMIPSpdf.py

## parseMIPSpdf.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
""" parseMIPSpdf.py

    Uses PyPDF2 to parse the MIPS Instruction documentation and creates
    a Ghidra compatible idx.

    PDF Sources:

        https://s3-eu-west-1.amazonaws.com/downloads-mips/documents/MD00087-2B-MIPS64BIS-AFP-6.06.pdf
        https://s3-eu-west-1.amazonaws.com/downloads-mips/documents/MD00594-2B-microMIPS64-AFP-6.05.pdf
        https://s3-eu-west-1.amazonaws.com/downloads-mips/documents/MIPS_Architecture_microMIPS32_InstructionSet_AFP_P_MD00582_06.04.pdf

    Command Line:

        parseMIPSpdf.py MD00087-2B-MIPS64BIS-AFP-6.06.pdf 44 517 > pages_MD00087.txt
        parseMIPSpdf.py MD00594-2B-microMIPS64-AFP-6.05.pdf 69 452 > pages_MD00594.txt
        parseMIPSpdf.py MIPS_Architecture_microMIPS32_InstructionSet_AFP_P_MD00582_06.04.pdf 65 368 > pages_MD00582.txt

    Hints:

        After running the script, inspect stderr. Pay special attention to
        the following messages:

            - WARNING:root:Content scraping failed for page: XX
                This means that PyPDF2 returned no content object, so the
                page needs to be inspected by the user and added to the
                output manually. Most likely the page is embedded as an
                image - this is the case for several pages in MD00087 and
                MD00594.

    Author: marpie (marpie@a12d404.net)

    Last Update:  20190511
    Created:      20160511

"""
from __future__ import print_function

import sys
import typing
import logging

try:
    import PyPDF2
except ModuleNotFoundError:
    sys.stderr.write("PyPDF2 missing! Install via: pip3 install PyPDF2\n")
    sys.exit(1)

# Version Information
__version__ = "0.0.1"
__program__ = "parseMIPSpdf v{}".format(__version__)
__author__ = "marpie"
__email__ = "marpie+parseMIPSpdf@a12d404.net"
__license__ = "BSD License"
__copyright__ = "Copyright 2019, a12d404.net"
# One of: "Prototype", "Development", "Testing", "Production"
__status__ = "Prototype"

# Severity threshold handed to logging.basicConfig() by main().
LOG_LEVEL = logging.WARNING

# Source: MD00087-2B-MIPS64BIS-AFP-6.06.pdf
# Condition mnemonics that prepareCondFunctions() substitutes for the
# ".cond." / "<cond>" / ".condn." placeholders.  One source-table row per
# line; the order and the repeated entries ('lt', 'le', 'sne') are kept
# exactly as they were transcribed.
COND_TABLE = [
    # Table 3.9 Comparing CMP.condn.fmt, IEEE 754-2008, C.cond.fmt, and MSA FP compares
    'f', 'af', 't', 'at',
    'un', 'or',
    'eq', 'neq', 'une',
    'ueq', 'ogl', 'ne',
    'olt', 'lt', 'uge',
    'ult', 'oge',
    'ole', 'le', 'ugt',
    'ule', 'ogt',

    # Table 3.2 FPU Comparisons With Special Operand Exceptions for QNaNs
    'sf', 'saf', 'st', 'sat',
    'ngle', 'sun', 'gle', 'sor',
    'seq', 'sne', 'sune',
    'ngl', 'sueq', 'gl', 'sne',
    'lt', 'slt', 'nlt', 'suge',
    'nge', 'sult', 'ge', 'soge',
    'le', 'sle', 'nle', 'sugt',
    'ngt', 'sule', 'gt', 'sogt',
]

#SCRIPT_PATH = os.path.dirname( os.path.realpath( __file__ ) )


def eprint(*args, **kwargs):
    """Print to standard error.

    All positional and keyword arguments are forwarded unchanged to the
    built-in print(); only the target stream is fixed to sys.stderr.
    """
    errorStream = sys.stderr
    print(*args, file=errorStream, **kwargs)


def extractTextLines(pdf: PyPDF2.PdfFileReader, page: PyPDF2.pdf.PageObject) -> typing.List[str]:
    """
    Collect the strings shown by the text-drawing operators of a page.

    The operations of the content stream are visited in stream order and
    the handled operators contribute list entries as follows:

    - ``Tj``: the first operand.
    - ``T*``: an empty string.
    - ``'``:  an empty string, then the first operand.
    - ``"``:  the third operand.
    - ``TJ``: all strings of the operand array joined into one entry.

    Only TextStringObject operands are used.  ByteStringObjects are strings
    where the byte->string encoding was unknown, so adding them to the
    result would be gibberish.

    Do not rely on the order of the returned text; it simply follows the
    content stream.

    :param pdf: reader handed to ContentStream when the page contents still
        have to be wrapped.
    :param page: page whose contents are scanned.
    :return: list of extracted strings, or None when page.getContents()
        returns a falsy value.

    Source & (c): <PyPDF2.pdf>
    """
    logging.info("Func: extractTextLines()")
    stream = page.getContents()
    if not stream:
        return None
    if not isinstance(stream, PyPDF2.pdf.ContentStream):
        stream = PyPDF2.pdf.ContentStream(stream, pdf)

    lines = []
    for operands, operator in stream.operations:
        if operator == b"T*":
            lines.append("")
            continue
        if operator == b"TJ":
            # One entry per TJ array, even when it holds no usable string.
            lines.append("".join(
                piece for piece in operands[0]
                if isinstance(piece, PyPDF2.generic.TextStringObject)
            ))
            continue

        # The remaining operators each carry a single candidate string.
        if operator == b"Tj":
            candidate = operands[0]
        elif operator == b"'":
            lines.append("")
            candidate = operands[0]
        elif operator == b'"':
            candidate = operands[2]
        else:
            continue
        if isinstance(candidate, PyPDF2.generic.TextStringObject):
            lines.append(candidate)
    return lines


def parseFunctionTitle(title: str) -> typing.List[str]:
    """Split a format-section title into the lower-cased names it lists.

    Recognised title shapes:

    - brace alternation, e.g. "b{le,ge,gt,lt,eq,ne}zalc": one name per
      alternative, with the text before and after the braces attached;
    - comma separated, e.g. "crc32b, crc32h, crc32w, crc32d";
    - space separated, e.g. "aui daui dahi dati";
    - anything else is returned as a single name.

    The returned names can still contain ".fmt" or other aliases.

    :param title: title text taken from the page.
    :return: list of lower-cased names; empty when ``title`` is empty.
    """
    logging.info("Func: parseFunctionTitle(title: '%s')", title)
    if not title:
        return []

    # Only treat the title as a brace alternation when a '}' follows the
    # first '{'.  A stray '}' in front of the '{' used to break the tuple
    # unpacking of the split results with a ValueError.
    openIdx = title.find('{')
    closeIdx = title.find('}', openIdx + 1) if openIdx >= 0 else -1
    if closeIdx >= 0:
        # Sample: "b{le,ge,gt,lt,eq,ne}zalc"
        prefix = title[:openIdx]
        suffix = title[closeIdx + 1:]
        return [
            "{}{}{}".format(prefix, part.strip(), suffix).lower()
            for part in title[openIdx + 1:closeIdx].split(',')
        ]
    if "," in title:
        # Sample: "crc32b, crc32h, crc32w, crc32d"
        return [part.strip().lower()
                for part in title.split(',') if part.strip()]
    if " " in title:
        # Sample: "aui daui dahi dati"
        return [part.lower() for part in title.split(' ') if part]
    return [title.lower()]


def extractFmtFunctions(function, parsed):
    """Expand a ".fmt" placeholder name into the concrete names on the page.

    The last three characters of ``function`` are dropped to form a prefix.
    Every entry of ``parsed`` that starts with this prefix (compared in
    lower case) contributes the text in front of its first space.

    :param function: name whose last three characters are removed; the
        caller passes names ending in ".fmt".
    :param parsed: iterable of candidate text lines.
    :return: list of matching lower-cased names, in input order.
    """
    logging.info("Func: extractFmtFunctions()")
    prefix = function[:-3].lower()
    logging.debug("functionBase={}".format(repr(prefix)))
    matches = []
    for candidate in (entry.lower() for entry in parsed):
        if not candidate.startswith(prefix):
            continue
        logging.debug("line -> {}".format(repr(candidate)))
        matches.append(candidate.partition(" ")[0])
    return matches


def prepareCondFunctions(function: str) -> typing.List[str]:
    """Return one variant of ``function`` per COND_TABLE entry.

    Each of the placeholders ".cond.", "<cond>" and ".condn." is replaced by
    the condition wrapped in dots.  The result follows COND_TABLE order and
    has exactly one element per table entry.
    """
    logging.info("Func: prepareCondFunctions()")
    placeholders = (".cond.", "<cond>", ".condn.")
    variants = []
    for cond in COND_TABLE:
        dotted = ".{}.".format(cond)
        expanded = function
        for placeholder in placeholders:
            expanded = expanded.replace(placeholder, dotted)
        variants.append(expanded)
    return variants

def isConditionalFunction(func: str) -> bool:
    """Tell whether ``func`` contains a condition placeholder.

    The placeholders are ".cond.", "<cond>" and ".condn.".
    """
    return any(marker in func for marker in (".cond.", "<cond>", ".condn."))

def parseFunctionsUsingFormatSection(pageNo: int, lines: typing.List[str]) -> typing.List[str]:
    """Derive the function names of a page from its "Format:" section.

    Every line is lower-cased and stripped, empty lines are ignored, and the
    rest is scanned with a small state machine:

      0 = searching for a line containing "format:"
      1 = the next line is the function title
      2 = collecting entry lines until a line containing ":" (new section)

    Title names ending in ".fmt" are resolved against the collected entry
    lines; names containing a condition placeholder are expanded to every
    condition.

    :param pageNo: page number; not used by the parsing itself.
    :param lines: text lines extracted from the page.
    :return: list of function names; empty when no title was found.
    """
    state = 0
    preParsedFunctions = []
    rawLines = []
    for line in lines:
        line = line.lower().strip()
        if not line:
            continue
        if state == 0:
            if "format:" in line:
                logging.debug("state=1")
                state = 1
        elif state == 1:
            logging.debug("state before 2")
            preParsedFunctions = parseFunctionTitle(line)
            if not preParsedFunctions:
                logging.error("No functions found in format section!")
            logging.debug("state=2")
            state = 2
            logging.debug("preParsedFunctions(len=%d)=%r",
                          len(preParsedFunctions), preParsedFunctions)
        elif state == 2:
            if ":" in line:
                # we reached a new section, bail!
                logging.debug("new section!")
                break
            rawLines.append(line)
    logging.debug("rawLines(len=%d)=%r", len(rawLines), rawLines)

    if not preParsedFunctions:
        logging.debug("quick-exit--1")
        return []

    firstTitle = preParsedFunctions[0]
    firstName = firstTitle.split(" ")[0]
    if len(rawLines) < 2 and len(preParsedFunctions) > 1:
        logging.debug("quick-exit--2")
        if isConditionalFunction(firstName):
            return prepareCondFunctions(firstName)
        return [firstName]
    # A single title with no entry line at all used to raise IndexError on
    # rawLines[0]; it is now handled like a title that the first entry line
    # does not repeat.
    if ".fmt" not in firstTitle and (not rawLines or firstTitle not in rawLines[0]):
        logging.debug("quick-exit--3")
        return [firstName]

    # resolve .fmt
    resolved = []
    for function in preParsedFunctions:
        logging.debug("function - {}".format(function))
        if function.endswith(".fmt"):
            resolved += extractFmtFunctions(function, rawLines)
        else:
            resolved.append(function)

    # resolve .cond.
    functions = []
    for function in resolved:
        if isConditionalFunction(function):
            functions += prepareCondFunctions(function)
        else:
            functions.append(function)
    return functions


def parsePage(pdfReader: PyPDF2.PdfFileReader, pageNo: int) -> typing.Tuple[bool, typing.Optional[typing.List[typing.Tuple[str, int]]]]:
    """Extract the function names documented on one page.

    :param pdfReader: reader of the opened PDF.
    :param pageNo: 1-based page number; getPage() is called with pageNo-1.
    :return: ``(True, [(name, pageNo), ...])`` when at least one function
        name was found, otherwise ``(False, None)``.
    """
    logging.info("Func: parsePage(pageNo: {})".format(pageNo))
    pageObj = pdfReader.getPage(pageNo-1)
    try:
        parsed = extractTextLines(pdfReader, pageObj)
    except Exception:
        # Best effort: a page that cannot be decoded is reported as failed
        # instead of aborting the run.  Unlike the former bare "except:",
        # KeyboardInterrupt/SystemExit still propagate and the cause is
        # logged instead of being dropped silently.
        logging.warning("Text extraction raised for page: %s", pageNo, exc_info=True)
        return False, None
    if parsed is None:
        logging.warning("Content scraping failed for page: {}".format(pageNo))
    if (not parsed) or len(parsed) < 2:
        return False, None
    functions = parseFunctionsUsingFormatSection(pageNo, parsed)
    if not functions:
        return False, None
    return True, [(functionName.lower(), pageNo) for functionName in functions]


def parsePdf(filename: str, startPage: int, endPage: int) -> bool:
    """Parse the instruction pages and print the index to stdout.

    Every page from ``startPage`` to ``endPage`` (both inclusive) is handed
    to parsePage().  For each function name that was not printed for an
    earlier page a "name,page" line is written; pages for which parsePage()
    reports failure are announced on stderr and skipped.

    :param filename: path of the PDF file to read.
    :param startPage: first page number to parse.
    :param endPage: last page number to parse.
    :return: True once the whole page range has been processed.  The
        function used to fall off its end and return None, which made the
        script exit with status 1 even after a successful run.
    """
    logging.info("Func: parsePdf()")
    seenFunctions = set()
    # The with-statement closes the file; no extra close() is required.
    with open(filename, 'rb') as pdfFileObj:
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        # iterate instruction pages
        for pageNo in range(startPage, endPage + 1):
            res, out = parsePage(pdfReader, pageNo)
            if not res:
                eprint("[E] Parsing failed for page {}!".format(pageNo))
                continue
            for func, funcPage in out:
                if func in seenFunctions:
                    continue
                seenFunctions.add(func)
                print('{},{}'.format(func, funcPage))
    return True


def main(argv: list):
    """Command-line entry point.

    :param argv: argument vector: program name, PDF file, start page and
        end page.
    :return: False when the arguments are unusable, otherwise the result of
        parsePdf().
    """
    logging.basicConfig(level=LOG_LEVEL)
    if len(argv) != 4:
        eprint("{} [pdf-file] [start-page] [end-page]".format(argv[0]))
        return False

    filename = argv[1]
    try:
        startPage = int(argv[2])
        endPage = int(argv[3])
    except ValueError:
        # Report bad input like any other usage error instead of a traceback.
        eprint("[E] start-page and end-page must be integers!")
        eprint("{} [pdf-file] [start-page] [end-page]".format(argv[0]))
        return False
    # Page numbers are 1-based (parsePage() subtracts one before calling
    # getPage), so values below 1 cannot address a real page and an
    # inverted range would select nothing.
    if startPage < 1 or endPage < startPage:
        eprint("[E] Page range must satisfy 1 <= start-page <= end-page!")
        return False

    return parsePdf(filename, startPage, endPage)


if __name__ == "__main__":
    import sys
    # The process exit status is the negated result of main().
    exitStatus = not main(sys.argv)
    sys.exit(exitStatus)
	#!/usr/bin/env python
	# -*- coding: utf-8 -*-
	""" parseMIPSpdf.py

	Uses PyPDF2 to parse the MIPS Instruction documentation and creates
	a Ghidra compatible idx.

	PDF Sources:

	https://s3-eu-west-1.amazonaws.com/downloads-mips/documents/MD00087-2B-MIPS64BIS-AFP-6.06.pdf
	https://s3-eu-west-1.amazonaws.com/downloads-mips/documents/MD00594-2B-microMIPS64-AFP-6.05.pdf
	https://s3-eu-west-1.amazonaws.com/downloads-mips/documents/MIPS_Architecture_microMIPS32_InstructionSet_AFP_P_MD00582_06.04.pdf

	Command Line:

	parseMIPSpdf.py MD00087-2B-MIPS64BIS-AFP-6.06.pdf 44 517 > pages_MD00087.txt
	parseMIPSpdf.py MD00594-2B-microMIPS64-AFP-6.05.pdf 69 452 > pages_MD00594.txt
	parseMIPSpdf.py MIPS_Architecture_microMIPS32_InstructionSet_AFP_P_MD00582_06.04.pdf 65 368 > pages_MD00582.txt

	Hints:

	After running the script, inspect StdErr. Special care should be taken for
	the following errors:

	- WARNING:root:Content scraping failed for page: XX
	This means, that PyPDF2 returned no content object and the page
	needs to be inspected by the user and manually added to the output.
	Most likely the page is embedded as an image - this is the case for
	several pages in MD00087 and MD00594.

	Author: marpie (marpie@a12d404.net)

	Last Update: 20190511
	Created: 20160511

	"""
	from __future__ import print_function

	import sys
	import typing
	import logging

	try:
	import PyPDF2
	except ModuleNotFoundError:
	sys.stderr.write("PyPDF2 missing! Install via: pip3 install PyPDF2\n")
	sys.exit(1)

	# Version Information
	__version__ = "0.0.1"
	__program__ = "parseMIPSpdf v{}".format(__version__)
	__author__ = "marpie"
	__email__ = "marpie+parseMIPSpdf@a12d404.net"
	__license__ = "BSD License"
	__copyright__ = "Copyright 2019, a12d404.net"
	# One of: "Prototype", "Development", "Testing", "Production"
	__status__ = "Prototype"

	# Severity threshold handed to logging.basicConfig() by main().
	LOG_LEVEL = logging.WARNING

	# Source: MD00087-2B-MIPS64BIS-AFP-6.06.pdf
	# Condition mnemonics that prepareCondFunctions() substitutes for the
	# ".cond." / "<cond>" / ".condn." placeholders.  One source-table row per
	# line; the order and the repeated entries ('lt', 'le', 'sne') are kept
	# exactly as they were transcribed.
	COND_TABLE = [
	    # Table 3.9 Comparing CMP.condn.fmt, IEEE 754-2008, C.cond.fmt, and MSA FP compares
	    'f', 'af', 't', 'at',
	    'un', 'or',
	    'eq', 'neq', 'une',
	    'ueq', 'ogl', 'ne',
	    'olt', 'lt', 'uge',
	    'ult', 'oge',
	    'ole', 'le', 'ugt',
	    'ule', 'ogt',

	    # Table 3.2 FPU Comparisons With Special Operand Exceptions for QNaNs
	    'sf', 'saf', 'st', 'sat',
	    'ngle', 'sun', 'gle', 'sor',
	    'seq', 'sne', 'sune',
	    'ngl', 'sueq', 'gl', 'sne',
	    'lt', 'slt', 'nlt', 'suge',
	    'nge', 'sult', 'ge', 'soge',
	    'le', 'sle', 'nle', 'sugt',
	    'ngt', 'sule', 'gt', 'sogt',
	]

	#SCRIPT_PATH = os.path.dirname( os.path.realpath( __file__ ) )


	def eprint(*args, **kwargs):
	    """Print to standard error.

	    All positional and keyword arguments are forwarded to the built-in
	    print(); only the target stream is fixed to sys.stderr.

	    The star markers of the signature had been lost ("args, *kwargs"),
	    which made a call without arguments and any keyword argument such as
	    ``sep`` fail; the body had also lost its indentation.
	    """
	    print(*args, file=sys.stderr, **kwargs)


	def extractTextLines(pdf: PyPDF2.PdfFileReader, page: PyPDF2.pdf.PageObject) -> typing.List[str]:
	    """
	    Collect the strings shown by the text-drawing operators of a page.

	    The operations of the content stream are visited in stream order and
	    the handled operators contribute list entries as follows:

	    - ``Tj``: the first operand.
	    - ``T*``: an empty string.
	    - ``'``:  an empty string, then the first operand.
	    - ``"``:  the third operand.
	    - ``TJ``: all strings of the operand array joined into one entry.

	    Only TextStringObject operands are used.  ByteStringObjects are strings
	    where the byte->string encoding was unknown, so adding them to the
	    result would be gibberish.

	    :param pdf: reader handed to ContentStream when the page contents still
	        have to be wrapped.
	    :param page: page whose contents are scanned.
	    :return: list of extracted strings, or None when page.getContents()
	        returns a falsy value.

	    Source & (c): <PyPDF2.pdf>
	    """
	    logging.info("Func: extractTextLines()")
	    text = []
	    content = page.getContents()
	    if not content:
	        return None
	    if not isinstance(content, PyPDF2.pdf.ContentStream):
	        content = PyPDF2.pdf.ContentStream(content, pdf)
	    for operands, operator in content.operations:
	        if operator == b"Tj":
	            _text = operands[0]
	            if isinstance(_text, PyPDF2.generic.TextStringObject):
	                text.append(_text)
	        elif operator == b"T*":
	            text.append("")
	        elif operator == b"'":
	            text.append("")
	            _text = operands[0]
	            if isinstance(_text, PyPDF2.generic.TextStringObject):
	                text.append(_text)
	        elif operator == b'"':
	            _text = operands[2]
	            if isinstance(_text, PyPDF2.generic.TextStringObject):
	                text.append(_text)
	        elif operator == b"TJ":
	            # One entry per TJ array, even when it holds no usable string.
	            text.append("".join(
	                piece for piece in operands[0]
	                if isinstance(piece, PyPDF2.generic.TextStringObject)
	            ))
	    return text


	def parseFunctionTitle(title: str) -> typing.List[str]:
	    """Split a format-section title into the lower-cased names it lists.

	    Recognised title shapes:

	    - brace alternation, e.g. "b{le,ge,gt,lt,eq,ne}zalc": one name per
	      alternative, with the text before and after the braces attached;
	    - comma separated, e.g. "crc32b, crc32h, crc32w, crc32d";
	    - space separated, e.g. "aui daui dahi dati";
	    - anything else is returned as a single name.

	    The returned names can still contain ".fmt" or other aliases.

	    :param title: title text taken from the page.
	    :return: list of lower-cased names; empty when ``title`` is empty.
	    """
	    logging.info("Func: parseFunctionTitle(title: '%s')", title)
	    if not title:
	        return []

	    # Only treat the title as a brace alternation when a '}' follows the
	    # first '{'; a stray '}' in front of the '{' would otherwise break the
	    # split-based unpacking with a ValueError.
	    openIdx = title.find('{')
	    closeIdx = title.find('}', openIdx + 1) if openIdx >= 0 else -1
	    if closeIdx >= 0:
	        # Sample: "b{le,ge,gt,lt,eq,ne}zalc"
	        prefix = title[:openIdx]
	        suffix = title[closeIdx + 1:]
	        return [
	            "{}{}{}".format(prefix, part.strip(), suffix).lower()
	            for part in title[openIdx + 1:closeIdx].split(',')
	        ]
	    if "," in title:
	        # Sample: "crc32b, crc32h, crc32w, crc32d"
	        return [part.strip().lower()
	                for part in title.split(',') if part.strip()]
	    if " " in title:
	        # Sample: "aui daui dahi dati"
	        return [part.lower() for part in title.split(' ') if part]
	    return [title.lower()]


	def extractFmtFunctions(function, parsed):
	    """Expand a ".fmt" placeholder name into the concrete names on the page.

	    The last three characters of ``function`` are dropped to form a prefix.
	    Every entry of ``parsed`` that starts with this prefix (compared in
	    lower case) contributes the text in front of its first space.

	    :param function: name whose last three characters are removed; the
	        caller passes names ending in ".fmt".
	    :param parsed: iterable of candidate text lines.
	    :return: list of matching lower-cased names, in input order.
	    """
	    logging.info("Func: extractFmtFunctions()")
	    functionBase = function[:-3].lower()
	    logging.debug("functionBase={}".format(repr(functionBase)))
	    functions = []
	    for line in parsed:
	        line = line.lower()
	        if line.startswith(functionBase):
	            logging.debug("line -> {}".format(repr(line)))
	            functions.append(line.split(" ", 1)[0])
	    return functions


	def prepareCondFunctions(function: str) -> typing.List[str]:
	    """Return one variant of ``function`` per COND_TABLE entry.

	    Each of the placeholders ".cond.", "<cond>" and ".condn." is replaced by
	    the condition wrapped in dots.  The result follows COND_TABLE order and
	    has exactly one element per table entry.
	    """
	    logging.info("Func: prepareCondFunctions()")
	    functions = []
	    for cond in COND_TABLE:
	        dotted = ".{}.".format(cond)
	        func = function \
	            .replace(".cond.", dotted) \
	            .replace("<cond>", dotted) \
	            .replace(".condn.", dotted)
	        functions.append(func)
	    return functions

	def isConditionalFunction(func: str) -> bool:
	    """Tell whether ``func`` contains a condition placeholder.

	    The placeholders are ".cond.", "<cond>" and ".condn.".
	    """
	    return (".cond." in func) or ("<cond>" in func) or (".condn." in func)

	def parseFunctionsUsingFormatSection(pageNo: int, lines: typing.List[str]) -> typing.List[str]:
	    """Derive the function names of a page from its "Format:" section.

	    Every line is lower-cased and stripped, empty lines are ignored, and
	    the rest is scanned with a small state machine:

	      0 = searching for a line containing "format:"
	      1 = the next line is the function title
	      2 = collecting entry lines until a line containing ":" (new section)

	    Title names ending in ".fmt" are resolved against the collected entry
	    lines; names containing a condition placeholder are expanded to every
	    condition.

	    :param pageNo: page number; not used by the parsing itself.
	    :param lines: text lines extracted from the page.
	    :return: list of function names; empty when no title was found.
	    """
	    state = 0
	    preParsedFunctions = []
	    rawLines = []
	    for line in lines:
	        line = line.lower().strip()
	        if len(line) < 1:
	            continue
	        if state == 0:
	            if "format:" in line:
	                logging.debug("state=1")
	                state = 1
	                continue
	        elif state == 1:
	            logging.debug("state before 2")
	            preParsedFunctions = parseFunctionTitle(line)
	            if len(preParsedFunctions) < 1:
	                logging.error("No functions found in format section!")
	            logging.debug("state=2")
	            state = 2
	            logging.debug("preParsedFunctions(len={})={}".format(
	                len(preParsedFunctions),
	                repr(preParsedFunctions)
	            ))
	        elif state == 2:
	            if ":" in line:
	                # we reached a new section, bail!
	                logging.debug("new section!")
	                break
	            rawLines.append(line)
	    logging.debug("rawLines(len={})={}".format(
	        len(rawLines),
	        repr(rawLines)
	    ))
	    if len(preParsedFunctions) < 1:
	        logging.debug("quick-exit--1")
	        return []
	    if (len(rawLines) < 2) and len(preParsedFunctions) > 1:
	        logging.debug("quick-exit--2")
	        function = preParsedFunctions[0].split(" ")[0]
	        if isConditionalFunction(function):
	            return prepareCondFunctions(function)
	        return [function]
	    # Checking for an empty rawLines first avoids an IndexError on
	    # rawLines[0] when a single title is followed by no entry line.
	    if (".fmt" not in preParsedFunctions[0]) and (
	            not rawLines or preParsedFunctions[0] not in rawLines[0]):
	        logging.debug("quick-exit--3")
	        return [preParsedFunctions[0].split(" ")[0]]
	    functions = []
	    # resolve .fmt
	    for function in preParsedFunctions:
	        logging.debug("function - {}".format(function))
	        if function.endswith(".fmt"):
	            functions += extractFmtFunctions(function, rawLines)
	        else:
	            functions.append(function)
	    functionsPreParsed = functions
	    functions = []
	    # resolve .cond.
	    for function in functionsPreParsed:
	        if isConditionalFunction(function):
	            functions += prepareCondFunctions(function)
	        else:
	            functions.append(function)
	    return functions


	def parsePage(pdfReader: PyPDF2.PdfFileReader, pageNo: int) -> typing.Tuple[bool, typing.Optional[typing.List[typing.Tuple[str, int]]]]:
	    """Extract the function names documented on one page.

	    :param pdfReader: reader of the opened PDF.
	    :param pageNo: 1-based page number; getPage() is called with pageNo-1.
	    :return: ``(True, [(name, pageNo), ...])`` when at least one function
	        name was found, otherwise ``(False, None)``.
	    """
	    logging.info("Func: parsePage(pageNo: {})".format(pageNo))
	    pageObj = pdfReader.getPage(pageNo-1)
	    try:
	        parsed = extractTextLines(pdfReader, pageObj)
	    except Exception:
	        # Best effort: report the page as failed instead of aborting, but
	        # log the cause and let KeyboardInterrupt/SystemExit propagate.
	        logging.warning("Text extraction raised for page: %s", pageNo, exc_info=True)
	        return False, None
	    if parsed is None:
	        logging.warning("Content scraping failed for page: {}".format(pageNo))
	    if (not parsed) or len(parsed) < 2:
	        return False, None
	    functions = parseFunctionsUsingFormatSection(pageNo, parsed)
	    if len(functions) < 1:
	        return False, None
	    return True, [(functionName.lower(), pageNo,) for functionName in functions]


	def parsePdf(filename: str, startPage: int, endPage: int) -> bool:
	    """Parse the instruction pages and print the index to stdout.

	    Every page from ``startPage`` to ``endPage`` (both inclusive) is handed
	    to parsePage().  For each function name that was not printed for an
	    earlier page a "name,page" line is written; pages for which
	    parsePage() reports failure are announced on stderr and skipped.

	    :param filename: path of the PDF file to read.
	    :param startPage: first page number to parse.
	    :param endPage: last page number to parse.
	    :return: True once the whole page range has been processed (the
	        declared bool result was never actually returned before).
	    """
	    logging.info("Func: parsePdf()")
	    seenFunctions = set()
	    # The with-statement closes the file; no extra close() is required.
	    with open(filename, 'rb') as pdfFileObj:
	        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
	        # iterate instruction pages
	        for pageNo in range(startPage, endPage + 1):
	            res, out = parsePage(pdfReader, pageNo)
	            if not res:
	                eprint("[E] Parsing failed for page {}!".format(pageNo))
	                continue
	            for func, funcPage in out:
	                if func in seenFunctions:
	                    continue
	                seenFunctions.add(func)
	                print('{},{}'.format(func, funcPage))
	    return True


	def main(argv: list):
	    """Command-line entry point.

	    :param argv: argument vector: program name, PDF file, start page and
	        end page.
	    :return: False when the arguments are unusable, otherwise the result
	        of parsePdf().
	    """
	    logging.basicConfig(level=LOG_LEVEL)
	    if len(argv) != 4:
	        eprint("{} [pdf-file] [start-page] [end-page]".format(argv[0]))
	        return False

	    filename = argv[1]
	    try:
	        startPage = int(argv[2])
	        endPage = int(argv[3])
	    except ValueError:
	        # Report bad input like any other usage error, not as a traceback.
	        eprint("[E] start-page and end-page must be integers!")
	        eprint("{} [pdf-file] [start-page] [end-page]".format(argv[0]))
	        return False
	    # Page numbers are 1-based (parsePage() subtracts one before calling
	    # getPage), so values below 1 cannot address a real page and an
	    # inverted range would select nothing.
	    if startPage < 1 or endPage < startPage:
	        eprint("[E] Page range must satisfy 1 <= start-page <= end-page!")
	        return False

	    return parsePdf(filename, startPage, endPage)


	if __name__ == "__main__":
	    import sys
	    # The process exit status is the negated result of main().
	    sys.exit(not main(sys.argv))