Skip to content

Instantly share code, notes, and snippets.

@paceaux paceaux/rtfToJson.py
Last active Feb 8, 2019

Embed
What would you like to do?
RTF to JSON parser
import copy
import fnmatch
import getopt
import io
import json
import os
import shutil
import sys
from os.path import join
from HTMLParser import HTMLParser

from bs4 import BeautifulSoup
from pyth.plugins.rtf15.reader import Rtf15Reader
from pyth.plugins.xhtml.writer import XHTMLWriter
# constants

# Template for one converted record. getDictFromHtml copies it and fills in
# 'chronoNumber', 'chronoContent' and 'metadata'; extractMetadata starts from
# the nested 'metadata' mapping.
DEFAULT_DICTIONARY = {
    'chronoNumber': '',
    'chronoContent': ' ',
    'metadata': {
        'restrictedChrono': False,
        'office': '',
        'primaryAccountingManual': '',
        'alternateAccountingManuals': [],
        'industries': [],
        'assignedTo': '',
        'dateCleared': '',
        'clearingPartner': '',
        'individual': '',
        'priorConsultations': [],
    },
}

# Default CSS selector used to locate the paragraph that holds the metadata.
POSITION_OF_DATA = 'div > p:nth-of-type(1) + p'

# Default name of the JSON file written by the script.
OUTPUT_NAME = 'output.json'

# Default for the debug switch (when on, intermediate HTML files are written).
DEBUG = False
class FileRules(object):
    """ Used for easily passing rules around for how to read and write files

    Inherits from ``object`` explicitly: on Python 2 a bare ``class X:`` is an
    old-style class, for which the ``filePattern`` property setter below would
    be bypassed (no validation, no lowercasing).

    Attributes:
        path: string, pathname of files (defaults to the current directory)
        filePattern: pattern to use to find files; always stored lowercased
    """

    def __init__(self, filePattern = "*.rtf"):
        """ Inits FileRules with a filePattern"""
        self.path = os.getcwd()
        self.filePattern = filePattern

    @property
    def filePattern(self):
        """ The lowercased file-name pattern."""
        return self._filePattern

    @filePattern.setter
    def filePattern(self, new_filePattern):
        """ Stores the pattern lowercased.

        Raises:
            TypeError: if the new value is not a string. TypeError is an
                Exception subclass, so callers catching Exception still work.
        """
        if not isinstance(new_filePattern, str):
            raise TypeError("invalid value for filePattern")
        self._filePattern = new_filePattern.lower()

    def PathAndType(self):
        """ Returns "/", path and filePattern combined with os.path.join.

        When path is absolute, os.path.join discards the leading "/".
        """
        return join("/", self.path, self.filePattern)
def fileList(fileRules):
    """ Returns the names of the files in a directory that match a pattern

    Matching is case-insensitive: both the directory entry and the pattern
    are lowercased before comparison, so "a.rtf", "B.RTF" and "c.Rtf" all
    match "*.rtf". (Trying only the all-lower and all-upper pattern would
    miss mixed-case names.)

    Args:
        fileRules. A FileRules object (anything with ``path`` and
            ``filePattern`` attributes)
    Returns:
        list of bare file names (not joined with the path), in the order
        os.listdir reports them
    """
    pattern = fileRules.filePattern.lower()
    return [
        entry
        for entry in os.listdir(fileRules.path)
        if fnmatch.fnmatchcase(entry.lower(), pattern)
    ]
def getDecodedText(file):
    """ Reads a file as utf-8 text

    The file is opened through a context manager so the handle is always
    closed, and decoding is done once by io.open rather than line by line.

    Args:
        file: string of the path and filename to a file
    Returns: unicode text; the file's lines (each still ending in its
        newline) joined with a single space
    Raises:
        IOError/OSError: if the file cannot be opened
        UnicodeDecodeError: if the content is not valid utf-8
    """
    # newline='\n' splits on "\n" only and leaves any "\r" untouched.
    with io.open(file, encoding='utf-8', newline='\n') as source:
        return ' '.join(source.readlines())
def getHtmlFromRtf(text):
    """ Converts RTF to XHTML using pyth

    Args:
        text: the RTF source handed to Rtf15Reader.read. Note that the caller
            in this file (getRtfAsJson) passes an open file object, not a
            string.
    Returns:
        whatever reading the XHTMLWriter output yields (the pretty-printed
        XHTML)
    """
    document = Rtf15Reader.read(text)
    rendered = XHTMLWriter.write(document, pretty=True)
    return rendered.read()
def getCleanedHtml(html):
    """ Removes undesirable markup from html

    Three passes are made over the parsed document:
      1. a <p> whose first <u> child holds all-uppercase text is replaced
         with an <h3> carrying that text;
      2. <u> tags nested inside "strong em" are unwrapped;
      3. <p> tags with no text are removed.

    Args:
        html: the HTML markup to clean (it is parsed with BeautifulSoup)
    Returns:
        string: the modified document serialized with str()
    """
    soup = BeautifulSoup(html, 'html.parser')

    # replace p > u with an h3
    for title in soup.select('p > u:nth-of-type(1)'):
        text = title.string
        # .string is None when the <u> does not wrap exactly one text node
        # (e.g. it contains several children); such tags are not titles.
        if text is None or text != text.upper():
            continue
        h3 = soup.new_tag('h3')
        h3.string = text
        title.find_parent('p').replace_with(h3)

    # remove u
    for underline in soup.select('strong em u'):
        underline.unwrap()

    # remove empty p
    for p in soup.find_all('p'):
        if not p.text:
            p.decompose()

    return str(soup)
def getTextFile(fileName, text = ''):
    """ Creates (or overwrites) a file with text in it

    The file is written inside a ``with`` block so the handle is closed even
    if the write fails.

    Args:
        fileName: Name of the file (e.g. foo.html)
        text: Text that should be in the file
    Returns:
        the file object, already closed
    """
    with open(fileName, 'w') as outputFile:
        outputFile.write(text)
    return outputFile
# Ordered (label text, metadata key, is-list) rules. Order matters: the first
# label found in a node's string wins, mirroring an if/elif chain.
_METADATA_LABELS = (
    ("OFFICE", 'office', False),
    ("PRIMARY ACCMAN", 'primaryAccountingManual', False),
    ("ALTERNATE ACCMAN", 'alternateAccountingManuals', True),
    ("INDUSTRY IDENTIFIER", 'industries', True),
    ("ASSIGNED TO", 'assignedTo', False),
    ("DATE CLEARED", 'dateCleared', False),
    ("CLEARING PARTNER", 'clearingPartner', False),
    ("INDIVIDUAL", 'individual', False),
    # NOTE(review): the default for 'priorConsultations' is a list, but the
    # value has always been stored as a plain string; kept as-is so the
    # output shape does not change. Confirm the intended type.
    ("PRIOR CONSULTATIONS", 'priorConsultations', False),
)


def extractMetadata(soup, dataPosition):
    """ Builds the metadata dictionary from the metadata paragraph

    Walks the child nodes of the first element matched by dataPosition. When
    a node's string contains one of the known labels, the node that follows
    it is taken as that label's value (stringified and stripped).

    Args:
        soup: A Beautiful soup object
        dataPosition: string. CSS selector for the element holding the data
    Returns:
        dict with the same keys as DEFAULT_DICTIONARY['metadata']
    Raises:
        IndexError: if dataPosition matches nothing in soup
    """
    metadataContents = soup.select(dataPosition)[0].contents
    # Deep copy: a shallow dict() copy would share the default list objects,
    # so appended values would leak into DEFAULT_DICTIONARY and into every
    # later call.
    metadata = copy.deepcopy(DEFAULT_DICTIONARY['metadata'])
    lastIndex = len(metadataContents) - 1
    for idx, metadataTag in enumerate(metadataContents):
        label = metadataTag.string
        # A label in last position has no following node to read a value from.
        if label is None or idx == lastIndex:
            continue
        for marker, key, isList in _METADATA_LABELS:
            if marker not in label:
                continue
            value = str(metadataContents[idx + 1]).strip()
            if isList:
                metadata[key].append(value)
            else:
                metadata[key] = value
            break
    return metadata
def extractContent(soup):
    """ Extracts only the content part from the soup object

    Modifies soup in place: the first <p> found is removed and the first
    <div> found is unwrapped, then the result is prettified.

    Args:
        soup: A Beautiful soup object
    Returns:
        a prettified string of HTML
    """
    soup.find('p').decompose()
    soup.find('div').unwrap()
    return str(soup.prettify(formatter='html'))
def getDictFromHtml(file, defaultDictionary = DEFAULT_DICTIONARY, dataPosition = POSITION_OF_DATA):
    """ Produces a dictionary describing one document

    Args:
        file: string. The HTML markup to be parsed (it is handed straight to
            BeautifulSoup)
        defaultDictionary: Dictionary. presents the data model this becomes
        dataPosition: string. CSS selector that's BeautifulSoup compliant for where to find data
    Returns:
        a shallow copy of defaultDictionary with 'metadata', 'chronoNumber'
        and 'chronoContent' filled in
    """
    # The markup is parsed twice because extractContent modifies the tree it
    # is given; the first tree stays intact for the selector lookups.
    parsed = BeautifulSoup(file, 'html.parser')
    scratch = BeautifulSoup(file, 'html.parser')

    record = dict(defaultDictionary)
    metadata = extractMetadata(parsed, dataPosition)
    content = extractContent(scratch)

    record['metadata'] = metadata
    dataNode = parsed.select(dataPosition)[0]
    record['chronoNumber'] = dataNode.contents[1].strip()
    record['chronoContent'] = content
    return record
def getAsJson(dictionary):
    """ Serializes a value to a JSON string

    Args:
        dictionary: the object to serialize (a dict, or a list of dicts)
    Returns:
        string of JSON produced by json.dumps with default settings
    """
    serialized = json.dumps(dictionary)
    return serialized
def getRtfAsJson(file, debug = False):
    """ Converts one RTF file into the dictionary that will be serialized

    The decoded RTF text is written to a scratch file named 'preHtml' in the
    current directory, converted to HTML, cleaned, and parsed into a dict.
    The scratch file is always closed and removed afterwards.

    Args:
        file: string, name of the file
        debug: when true, also writes "<file>.html" and "<file>.cleaned.html"
            containing the raw and the cleaned HTML
    Returns:
        dict
    """
    decodedText = getDecodedText(file)

    preHtmlFile = open('preHtml', 'w+')
    try:
        preHtmlFile.write(decodedText)
        # Rewind so the reader starts at the beginning of what was written.
        preHtmlFile.seek(0)
        html = getHtmlFromRtf(preHtmlFile)
    finally:
        preHtmlFile.close()
        try:
            os.remove('preHtml')
        except OSError:
            # Best-effort cleanup; a leftover scratch file is harmless.
            pass

    cleanedHtml = getCleanedHtml(html)
    if debug:
        # For Debugging
        getTextFile(file + '.html', str(html))
        getTextFile(file + '.cleaned.html', str(cleanedHtml))
    return getDictFromHtml(cleanedHtml)
def outputFile(outputName, listOfFiles, debug = DEBUG):
    """ Converts the given files and writes the result as JSON

    A single input file is written as one JSON object; several input files
    are written as a JSON array of objects.

    Args:
        outputName. String. Name of the file to be created
        listOfFiles: List. The files that need to be converted
        debug: passed through to getRtfAsJson
    Raises:
        ValueError: if listOfFiles is empty (previously an opaque
            "list index out of range" IndexError)
    """
    if not listOfFiles:
        raise ValueError("no input files to convert")
    converted = [getRtfAsJson(name, debug) for name in listOfFiles]
    payload = converted if len(converted) > 1 else converted[0]
    getTextFile(outputName, getAsJson(payload))
def main(argv):
    """ Command-line entry point

    Options:
        -p, --path    directory to search (default: current directory)
        -f, --file    file-name pattern (default: "*.rtf")
        -o, --output  name of the JSON file to write
        -d, --debug   also write intermediate HTML files

    Args:
        argv: list of command-line arguments, without the program name
    """
    fileRules = FileRules()
    outputName = OUTPUT_NAME
    debug = DEBUG
    try:
        # "debug" must be listed here, otherwise --debug is rejected by getopt
        # before the loop below ever sees it.
        opts, _ = getopt.getopt(argv, "p:f:o:d", ["path=", "file=", "output=", "debug"])
    except getopt.GetoptError:
        print("error with the arguments")
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-p", "--path"):
            fileRules.path = arg
        elif opt in ("-f", "--file"):
            fileRules.filePattern = arg
        elif opt in ("-o", "--output"):
            outputName = arg
        elif opt in ("-d", "--debug"):
            debug = True
    # fileList yields names found in fileRules.path; join them with that
    # directory so they can be opened when --path is not the working directory.
    listOfFiles = [join(fileRules.path, name) for name in fileList(fileRules)]
    try:
        outputFile(outputName, listOfFiles, debug)
    except Exception as e:
        print("Error With outputting the file")
        print(e)


if __name__ == "__main__":
    main(sys.argv[1:])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.