Skip to content

Instantly share code, notes, and snippets.

@marpie
Created May 11, 2019 09:57
Show Gist options
  • Save marpie/a5973afe0cbc16be351e262cf22ad72f to your computer and use it in GitHub Desktop.
Save marpie/a5973afe0cbc16be351e262cf22ad72f to your computer and use it in GitHub Desktop.
Uses PyPDF2 to parse the MIPS Architecture manuals and creates Ghidra compatible idx entries.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
""" parseMIPSpdf.py
Uses PyPDF2 to parse the MIPS Instruction documentation and creates
a Ghidra compatible idx.
PDF Sources:
https://s3-eu-west-1.amazonaws.com/downloads-mips/documents/MD00087-2B-MIPS64BIS-AFP-6.06.pdf
https://s3-eu-west-1.amazonaws.com/downloads-mips/documents/MD00594-2B-microMIPS64-AFP-6.05.pdf
https://s3-eu-west-1.amazonaws.com/downloads-mips/documents/MIPS_Architecture_microMIPS32_InstructionSet_AFP_P_MD00582_06.04.pdf
Command Line:
parseMIPSpdf.py MD00087-2B-MIPS64BIS-AFP-6.06.pdf 44 517 > pages_MD00087.txt
parseMIPSpdf.py MD00594-2B-microMIPS64-AFP-6.05.pdf 69 452 > pages_MD00594.txt
parseMIPSpdf.py MIPS_Architecture_microMIPS32_InstructionSet_AFP_P_MD00582_06.04.pdf 65 368 > pages_MD00582.txt
Hints:
After running the script, inspect StdErr. Special care should be taken for
the following errors:
- WARNING:root:Content scraping failed for page: XX
This means that PyPDF2 returned no content object and the page
needs to be inspected by the user and manually added to the output.
Most likely the page is embedded as an image - this is the case for
several pages in MD00087 and MD00594.
Author: marpie (marpie@a12d404.net)
Last Update: 20190511
Created: 20160511
"""
from __future__ import print_function
import sys
import typing
import logging
try:
import PyPDF2
except ModuleNotFoundError:
sys.stderr.write("PyPDF2 missing! Install via: pip3 install PyPDF2\n")
sys.exit(1)
# Version Information
# Module metadata; __program__ is assembled from __version__ below.
__version__ = "0.0.1"
__program__ = "parseMIPSpdf v" + __version__
__author__ = "marpie"
__email__ = "marpie+parseMIPSpdf@a12d404.net"
__license__ = "BSD License"
__copyright__ = "Copyright 2019, a12d404.net"
# ("Prototype", "Development", "Testing", "Production")
__status__ = "Prototype"
# Logging threshold passed to logging.basicConfig() by main().
LOG_LEVEL = logging.WARNING
# Source: MD00087-2B-MIPS64BIS-AFP-6.06.pdf
# Condition mnemonics substituted for the ".cond." / "<cond>" / ".condn."
# placeholders by prepareCondFunctions(). Every mnemonic is listed exactly
# once (in order of first appearance) so that the expansion does not emit
# the same instruction name twice.
COND_TABLE = [
    # Table 3.9 Comparing CMP.condn.fmt, IEEE 754-2008, C.cond.fmt, and MSA FP compares
    'f',
    'af',
    't',
    'at',
    'un',
    'or',
    'eq',
    'neq',
    'une',
    'ueq',
    'ogl',
    'ne',
    'olt',
    'lt',
    'uge',
    'ult',
    'oge',
    'ole',
    'le',
    'ugt',
    'ule',
    'ogt',
    # Table 3.2 FPU Comparisons With Special Operand Exceptions for QNaNs
    # ('lt' and 'le' belong to this table as well but are already listed
    # above; 'sne' is listed once.)
    'sf',
    'saf',
    'st',
    'sat',
    'ngle',
    'sun',
    'gle',
    'sor',
    'seq',
    'sne',
    'sune',
    'ngl',
    'sueq',
    'gl',
    'slt',
    'nlt',
    'suge',
    'nge',
    'sult',
    'ge',
    'soge',
    'sle',
    'nle',
    'sugt',
    'ngt',
    'sule',
    'gt',
    'sogt',
]
#SCRIPT_PATH = os.path.dirname( os.path.realpath( __file__ ) )
def eprint(*args, **kwargs):
    """Write a message to standard error.

    Positional and keyword arguments are forwarded unchanged to print();
    only the destination stream is fixed to sys.stderr.
    """
    errorStream = sys.stderr
    print(*args, file=errorStream, **kwargs)
def extractTextLines(pdf: PyPDF2.PdfFileReader, page: PyPDF2.pdf.PageObject) -> typing.List[str]:
    """
    Collect the text shown by a page's text-drawing operators.

    The content stream is walked in order and one list entry is produced
    per handled operator (Tj, T*, ', " and TJ). The order of the entries
    therefore follows the content stream and depends on how the PDF was
    generated.

    :param pdf: reader used to build a ContentStream when the page contents
        are not one already.
    :param page: the page whose contents are scanned.
    :return: list of text fragments, or None when the page has no contents.

    Source & (c): <PyPDF2.pdf>
    """
    logging.info("Func: extractTextLines()")
    content = page.getContents()
    if not content:
        return None
    if not isinstance(content, PyPDF2.pdf.ContentStream):
        content = PyPDF2.pdf.ContentStream(content, pdf)

    # Only TextStringObjects are collected. ByteStringObjects are strings
    # whose byte->string encoding was unknown, so adding them would only
    # produce gibberish.
    textType = PyPDF2.generic.TextStringObject
    collected = []
    for operands, operator in content.operations:
        if operator == b"Tj":
            candidate = operands[0]
            if isinstance(candidate, textType):
                collected.append(candidate)
        elif operator == b"T*":
            collected.append("")
        elif operator == b"'":
            # An empty entry is recorded before the operand is inspected.
            collected.append("")
            candidate = operands[0]
            if isinstance(candidate, textType):
                collected.append(candidate)
        elif operator == b'"':
            candidate = operands[2]
            if isinstance(candidate, textType):
                collected.append(candidate)
        elif operator == b"TJ":
            # The array mixes strings with other operands; keep the text
            # pieces and merge them into a single entry.
            collected.append("".join(
                piece for piece in operands[0]
                if isinstance(piece, textType)
            ))
    return collected
def parseFunctionTitle(title: str) -> typing.List[str]:
    """Split a format-section title into the function names it describes.

    Supported shapes (results are always lower-cased):
      - "b{le,ge,gt,lt,eq,ne}zalc" -> one name per alternative in braces
      - "crc32b, crc32h, crc32w"   -> comma separated names
      - "aui daui dahi dati"       -> space separated names
      - anything else              -> the title itself

    The returned names can still contain ".fmt" or other aliases.

    Brace expansion is only applied when a '}' follows the first '{'; a
    title with stray or reversed braces is handled by the remaining rules
    instead of raising.

    :param title: the raw title line.
    :return: list of lower-cased names; empty for an empty title.
    """
    logging.info("Func: parseFunctions(title: '{}')".format(title))
    if not title:
        return []

    openIdx = title.find('{')
    closeIdx = title.find('}', openIdx + 1) if openIdx != -1 else -1
    if closeIdx != -1:
        # Sample: "b{le,ge,gt,lt,eq,ne}zalc"
        first = title[:openIdx]
        alternatives = title[openIdx + 1:closeIdx]
        last = title[closeIdx + 1:]
        return [
            "{}{}{}".format(first, part.strip(), last).lower()
            for part in alternatives.split(',')
        ]

    if "," in title:
        # Sample: "crc32b, crc32h, crc32w, crc32d"
        return [part.strip().lower()
                for part in title.split(',') if part.strip()]

    if " " in title:
        # Sample: "aui daui dahi dati"
        return [part.lower() for part in title.split(' ') if part]

    return [title.lower()]
def extractFmtFunctions(function, parsed):
    """Return the concrete variants of a ".fmt" function found in *parsed*.

    The last three characters of *function* are dropped to form a prefix;
    for every line in *parsed* that (lower-cased) starts with that prefix,
    the text up to the first space is collected.
    """
    logging.info("Func: extractFmtFunctions()")
    prefix = function[:-3].lower()
    logging.debug("functionBase={}".format(repr(prefix)))
    matches = []
    for rawLine in parsed:
        lowered = rawLine.lower()
        if not lowered.startswith(prefix):
            continue
        logging.debug("line -> {}".format(repr(lowered)))
        mnemonic, _, _ = lowered.partition(" ")
        matches.append(mnemonic)
    return matches
def prepareCondFunctions(function: str) -> typing.List[str]:
    """Expand the condition placeholder of *function* for every COND_TABLE entry.

    For each condition, the placeholders ".cond.", "<cond>" and ".condn."
    are replaced (in that order) by ".<condition>.". One name is returned
    per COND_TABLE entry, in table order.
    """
    logging.info("Func: prepareCondFunctions()")
    placeholders = (".cond.", "<cond>", ".condn.")
    expanded = []
    for cond in COND_TABLE:
        substitute = ".{}.".format(cond)
        name = function
        for placeholder in placeholders:
            name = name.replace(placeholder, substitute)
        expanded.append(name)
    return expanded
def isConditionalFunction(func: str) -> bool:
    """Tell whether *func* contains one of the condition placeholders."""
    placeholders = (".cond.", "<cond>", ".condn.")
    return any(placeholder in func for placeholder in placeholders)
def parseFunctionsUsingFormatSection(pageNo: int, lines: typing.List[str]) -> typing.List[str]:
    """Derive the function names documented on a page from its format section.

    The text lines are lower-cased, stripped and scanned with a small state
    machine: the first non-empty line after one containing "format:" is
    taken as the title and handed to parseFunctionTitle(); the following
    lines are collected until a line containing ":" (the next section)
    is seen. ".fmt" titles are then resolved against the collected lines
    and condition placeholders are expanded.

    :param pageNo: page number of the page being parsed (currently unused).
    :param lines: text lines extracted from the page.
    :return: list of function names; empty when no title was found.
    """
    # State-Machine:
    # 0 = searching format-section
    # 1 = function title
    # 2 = function entries for ".fmt" and ".cond" pages
    state = 0
    preParsedFunctions = []
    rawLines = []
    for line in lines:
        line = line.lower().strip()
        if not line:
            continue
        if state == 0:
            if "format:" in line:
                logging.debug("state=1")
                state = 1
                continue
        elif state == 1:
            logging.debug("state before 2")
            preParsedFunctions = parseFunctionTitle(line)
            if not preParsedFunctions:
                logging.error("No functions found in format section!")
            logging.debug("state=2")
            state = 2
            logging.debug("preParsedFunctions(len={})={}".format(
                len(preParsedFunctions),
                repr(preParsedFunctions)
            ))
        elif state == 2:
            if ":" in line:
                # we reached a new section, bail!
                logging.debug("new section!")
                break
            rawLines.append(line)
    logging.debug("rawLines(len={})={}".format(
        len(rawLines),
        repr(rawLines)
    ))

    if not preParsedFunctions:
        logging.debug("quick-exit--1")
        return []

    firstFunction = preParsedFunctions[0]
    if (len(rawLines) < 2) and len(preParsedFunctions) > 1:
        logging.debug("quick-exit--2")
        function = firstFunction.split(" ")[0]
        if isConditionalFunction(function):
            return prepareCondFunctions(function)
        return [function]
    # rawLines may be empty here (single title directly followed by a new
    # section); guard the index so such pages do not raise IndexError.
    if (".fmt" not in firstFunction) and (
            not rawLines or firstFunction not in rawLines[0]):
        logging.debug("quick-exit--3")
        return [firstFunction.split(" ")[0]]

    # resolve .fmt
    fmtResolved = []
    for function in preParsedFunctions:
        logging.debug("function - {}".format(function))
        if function.endswith(".fmt"):
            fmtResolved += extractFmtFunctions(function, rawLines)
        else:
            fmtResolved.append(function)

    # resolve .cond.
    functions = []
    for function in fmtResolved:
        if isConditionalFunction(function):
            functions += prepareCondFunctions(function)
        else:
            functions.append(function)
    return functions
def parsePage(pdfReader: PyPDF2.PdfFileReader, pageNo: int) -> typing.Tuple[bool, typing.Optional[typing.List[typing.Tuple[str, int]]]]:
    """Parse one page and return the functions documented on it.

    :param pdfReader: reader of the opened PDF.
    :param pageNo: 1-based page number (converted to a 0-based index for
        getPage()).
    :return: (True, [(functionName, pageNo), ...]) on success, otherwise
        (False, None) - when text extraction fails or yields fewer than two
        lines, or when no function could be derived from the text.
    """
    logging.info("Func: parsePage(pageNo: {})".format(pageNo))
    pageObj = pdfReader.getPage(pageNo-1)
    try:
        parsed = extractTextLines(pdfReader, pageObj)
    except Exception:
        # Extraction stays best-effort, but the failure is reported instead
        # of being dropped silently; KeyboardInterrupt/SystemExit are no
        # longer intercepted.
        logging.exception("Text extraction raised for page: {}".format(pageNo))
        return False, None

    if parsed is None:
        logging.warning("Content scraping failed for page: {}".format(pageNo))
    if (not parsed) or len(parsed) < 2:
        return False, None

    functions = parseFunctionsUsingFormatSection(pageNo, parsed)
    if not functions:
        return False, None
    return True, [(functionName.lower(), pageNo) for functionName in functions]
def parsePdf(filename: str, startPage: int, endPage: int) -> bool:
    """Parse the instruction pages of a PDF and print the index to stdout.

    Every page from *startPage* to *endPage* (both inclusive, 1-based) is
    handed to parsePage(). Each function name is printed once, as
    "name,page", for the first page it is found on. Pages that could not be
    parsed are reported on stderr and skipped.

    :param filename: path of the PDF file.
    :param startPage: first page to parse.
    :param endPage: last page to parse.
    :return: True once all pages have been processed, so the caller can
        turn the result into a success exit status.
    """
    logging.info("Func: parsePdf()")
    alreadyParsedFunctions = set()
    # The with-statement closes the file, also when parsing raises.
    with open(filename, 'rb') as pdfFileObj:
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        # iterate instruction pages
        for pageNo in range(startPage, endPage+1):
            res, out = parsePage(pdfReader, pageNo)
            if not res:
                eprint("[E] Parsing failed for page {}!".format(pageNo))
                continue
            for func, funcPageNo in out:
                # Skip functions already emitted for an earlier page.
                if func in alreadyParsedFunctions:
                    continue
                alreadyParsedFunctions.add(func)
                print('{},{}'.format(func, funcPageNo))
    return True
def main(argv: list):
    """Command line entry point.

    Expects ``argv`` to be ``[program, pdf-file, start-page, end-page]``.
    Configures logging with LOG_LEVEL, validates the arguments and runs
    parsePdf().

    :param argv: the command line, including the program name.
    :return: False when the arguments are unusable (a message is written to
        stderr), otherwise the result of parsePdf().
    """
    logging.basicConfig(level=LOG_LEVEL)
    usage = "{} [pdf-file] [start-page] [end-page]".format(
        argv[0] if argv else "parseMIPSpdf.py")
    if len(argv) != 4:
        eprint(usage)
        return False

    filename = argv[1]
    try:
        startPage = int(argv[2])
        endPage = int(argv[3])
    except ValueError:
        # Report bad page numbers like any other usage error instead of
        # ending with a traceback.
        eprint("[E] start-page and end-page must be integers!")
        eprint(usage)
        return False

    return parsePdf(filename, startPage, endPage)
if __name__ == "__main__":
    import sys
    # The negated result of main() becomes the exit status: a truthy
    # result exits with 0, a falsy one with 1.
    succeeded = main(sys.argv)
    sys.exit(not succeeded)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment