# paceaux/rtfToJson.py

## rtfToJson.py
import os, shutil, sys, getopt, fnmatch, json
from os.path import join
from HTMLParser import HTMLParser
from pyth.plugins.rtf15.reader import Rtf15Reader
from pyth.plugins.xhtml.writer import XHTMLWriter
from bs4 import BeautifulSoup

# constants

# Template for one converted document; getDictFromHtml copies it and fills it in.
DEFAULT_DICTIONARY = {
    'chronoNumber': '',
    'chronoContent': ' ',
    'metadata': {
        'restrictedChrono': False,
        'office': '',
        'primaryAccountingManual': '',
        'alternateAccountingManuals': [],
        'industries': [],
        'assignedTo': '',
        'dateCleared': '',
        'clearingPartner': '',
        'individual': '',
        'priorConsultations': [],
    },
}

# CSS selector (BeautifulSoup syntax) of the paragraph the metadata is read from.
POSITION_OF_DATA = 'div > p:nth-of-type(1) + p'

# Name of the JSON file that is written when no other name is given.
OUTPUT_NAME = 'output.json'

# Default of the debug switch; when on, the intermediate HTML files are written too.
DEBUG = False

class FileRules(object):
    """ Used for easily passing rules around for how to read and write files

    Inherits from object on purpose: on Python 2 a property setter is only
    honoured for new-style classes.

    Attributes:
        path: string, directory that is searched (starts as the current
            working directory)
        filePattern: string, fnmatch-style pattern used to find files; it is
            stored and returned in lower case
    """

    def __init__(self, filePattern = "*.rtf"):
        """ Inits FileRules with a filePattern"""
        self.path = os.getcwd()
        self.filePattern = filePattern

    @property
    def filePattern(self):
        """ The pattern used to find files, in lower case"""
        return self._filePattern.lower()

    @filePattern.setter
    def filePattern(self, new_filePattern):
        # type(u"") is unicode on Python 2 and str on Python 3, so text of
        # either kind is accepted on both versions.
        if not isinstance(new_filePattern, (str, type(u""))):
            raise Exception("invalid value for filePattern")
        self._filePattern = new_filePattern.lower()

    def PathAndType(self):
        """ Returns path and filePattern joined into one search pattern

        os.path.join drops the leading "/" whenever path is absolute.
        """
        return join("/", self.path, self.filePattern)

def fileList(fileRules):
    """ Returns the names of the files in fileRules.path that match the pattern

    Names are compared without regard to case, so "a.rtf", "A.RTF" and
    "A.Rtf" all match the pattern "*.rtf".

    Args:
        fileRules. A FileRules object

    Returns:
        list of the matching names exactly as os.listdir reports them (they
        are not joined with fileRules.path)
    """
    # Lower-case both sides and compare case-sensitively; this covers mixed
    # case names on every platform.
    pattern = fileRules.filePattern.lower()
    return [
        entry
        for entry in os.listdir(fileRules.path)
        if fnmatch.fnmatchcase(entry.lower(), pattern)
    ]

def getDecodedText(file):
    """  Reads a UTF-8 encoded file and returns its decoded text

    The lines keep their line endings and are joined with a single space, so
    every line after the first starts with one extra space.

    Args:
        file: string of the path and filename to a file

    Returns: the decoded text of the file

    Raises:
        UnicodeDecodeError: the file is not valid UTF-8
    """
    # Read bytes and decode them explicitly: this behaves the same on
    # Python 2 and 3, and the with block closes the handle again.
    with open(file, 'rb') as handle:
        decodedLines = [line.decode('utf-8') for line in handle]

    return ' '.join(decodedLines)

def getHtmlFromRtf(text):
    """ Turns RTF content into XHTML

    Args:
        text: the RTF source; it is handed unchanged to Rtf15Reader.read
            (getRtfAsJson passes an open file object)

    Returns:
        the result of reading the XHTMLWriter output, i.e. the XHTML markup
    """
    rtfDocument = Rtf15Reader.read(text)
    xhtmlStream = XHTMLWriter.write(rtfDocument, pretty=True)

    return xhtmlStream.read()

def getCleanedHtml(html):
    """ Removes undesirable markup from html

    Three clean-ups are applied, in this order:
      1. a p whose first u child holds only upper-case text becomes an h3
      2. u tags nested in strong > em are unwrapped
      3. p tags without any text are removed

    Args:
        html: the markup to clean (parsed with BeautifulSoup's 'html.parser')

    returns:
        the cleaned markup as a string
    """
    soup = BeautifulSoup(html, 'html.parser')

    # replace p > u with an h3
    for title in soup.select('p > u:nth-of-type(1)'):
        text = title.string
        # .string is None when the u does not hold exactly one piece of text
        # (empty, or several children); such a u cannot be a title.
        if text is None or text != text.upper():
            continue
        h3 = soup.new_tag('h3')
        h3.string = text
        title.find_parent('p').replace_with(h3)

    # remove u
    for underline in soup.select('strong em u'):
        underline.unwrap()

    # remove empty p
    for p in soup.find_all('p'):
        if not p.text:
            p.decompose()

    return str(soup)

def getTextFile(fileName, text = ''):
    """ creates a file with text in it

    An existing file of the same name is overwritten.

    Args:
        fileName: Name of the file (e.g. foo.html)
        text: Text that should be be in the file

    Returns:
        the file object that was written; it is already closed
    """
    # the with block closes the file even when the write fails
    with open(fileName, 'w') as outputFile:
        outputFile.write(text)

    return outputFile

def extractMetadata(soup, dataPosition):
    """ Reads the metadata fields out of the metadata element

    The children of the first element matched by dataPosition are scanned.
    When the text of a child contains one of the known labels, the child
    that follows it is stored (as a stripped string) under the matching key.

    Args:
        soup: a BeautifulSoup object
        dataPosition: string. CSS selector of the element holding the metadata

    Returns:
        a new dict with the keys of DEFAULT_DICTIONARY['metadata']

    Raises:
        IndexError: dataPosition matches nothing
    """
    # (label, key, value is collected in a list); the first label found in
    # the text wins, so the order of the rows matters.
    # NOTE(review): priorConsultations defaults to a list but is assigned a
    # plain string here -- confirm which shape consumers expect.
    fieldRules = (
        ("OFFICE", 'office', False),
        ("PRIMARY ACCMAN", 'primaryAccountingManual', False),
        ("ALTERNATE ACCMAN", 'alternateAccountingManuals', True),
        ("INDUSTRY IDENTIFIER", 'industries', True),
        ("ASSIGNED TO", 'assignedTo', False),
        ("DATE CLEARED", 'dateCleared', False),
        ("CLEARING PARTNER", 'clearingPartner', False),
        ("INDIVIDUAL", 'individual', False),
        ("PRIOR CONSULTATIONS", 'priorConsultations', False),
    )

    metadataContents = soup.select(dataPosition)[0].contents

    # Copy the list defaults as well; a shallow dict copy would share them,
    # so appended values would leak into the defaults and into later calls.
    metadata = dict(
        (key, list(value) if isinstance(value, list) else value)
        for key, value in DEFAULT_DICTIONARY['metadata'].items()
    )

    lastIndex = len(metadataContents) - 1
    for idx, metadataTag in enumerate(metadataContents):
        label = metadataTag.string
        # a label in last position has no value node after it
        if label is None or idx == lastIndex:
            continue
        for marker, key, isList in fieldRules:
            if marker in label:
                value = str(metadataContents[idx + 1]).strip()
                if isList:
                    metadata[key].append(value)
                else:
                    metadata[key] = value
                break

    return metadata

def extractContent(soup):
    """ Reduces the soup to its content part and renders it

    The first p of the soup is removed and the first div is unwrapped; the
    soup that is passed in is changed by this.

    Args:
        soup: A Beautiful soup object
    Returns:
        a prettified string of HTML
    """
    firstParagraph = soup.find('p')
    firstParagraph.decompose()

    wrapperDiv = soup.find('div')
    wrapperDiv.unwrap()

    return str(soup.prettify(formatter='html'))

def getDictFromHtml(file, defaultDictionary = DEFAULT_DICTIONARY, dataPosition = POSITION_OF_DATA):
    """ Builds the dictionary for one document from its HTML

    Args:
        file: the HTML markup to be parsed (getRtfAsJson passes the cleaned HTML)
        defaultDictionary: Dictionary. presents the data model this becomes
        dataPosition: string. CSS selector that's BeautifulSoup compliant for where to find data
    Returns:
        a copy of defaultDictionary with 'metadata', 'chronoNumber' and
        'chronoContent' filled in
    """
    # extractContent changes the soup it receives, so the markup is parsed
    # twice: once for the metadata, once for the content.
    metadataSoup = BeautifulSoup(file, 'html.parser')
    contentSoup = BeautifulSoup(file, 'html.parser')

    result = dict(defaultDictionary)
    result['metadata'] = extractMetadata(metadataSoup, dataPosition)
    content = extractContent(contentSoup)
    # the chrono number is the second child of the metadata element
    dataElement = metadataSoup.select(dataPosition)[0]
    result['chronoNumber'] = dataElement.contents[1].strip()
    result['chronoContent'] = content

    return result

def getAsJson(dictionary):
    """ Serializes dictionary (anything json.dumps accepts) to a JSON string"""
    serialized = json.dumps(dictionary)
    return serialized

def getRtfAsJson(file, debug = False):
    """ Converts one RTF file into the dictionary that is later dumped as JSON

    The decoded RTF is staged in a scratch file named 'preHtml' in the
    current working directory, because getHtmlFromRtf is handed a file
    object. The scratch file is closed and removed again before returning.

    Args:
        file: string, name of the file
        debug: when true, the raw and the cleaned HTML are also written to
            <file>.html and <file>.cleaned.html
    Returns:
        dict
    """
    decodedText = getDecodedText(file)

    preHtmlFile = open('preHtml', 'w+')
    try:
        preHtmlFile.write(decodedText)
        # rewind so that reading starts at the first character, not at the end
        preHtmlFile.seek(0)
        html = getHtmlFromRtf(preHtmlFile)
    finally:
        preHtmlFile.close()
        try:
            os.remove('preHtml')
        except OSError:
            # best-effort clean-up; a left-over scratch file is harmless
            pass

    cleanedHtml = getCleanedHtml(html)

    if debug:
        # For Debugging
        getTextFile(file + '.html', str(html))
        getTextFile(file + '.cleaned.html', str(cleanedHtml))

    return getDictFromHtml(cleanedHtml)

def outputFile(outputName, listOfFiles, debug = DEBUG):
    """ Converts the files and writes the result to outputName as JSON

    A single file is written as one JSON object, several files are written
    as a JSON array of objects.

    Args:
        outputName. String. Name of the file to be created
        listOfFiles: List. The files that need to be converted
        debug: passed on to getRtfAsJson for every file

    Raises:
        IndexError: listOfFiles is empty
    """
    if not listOfFiles:
        # IndexError rather than ValueError: it is the exception an empty
        # list has always produced here, now with a usable message.
        raise IndexError("no files to convert")

    if len(listOfFiles) > 1:
        converted = [getRtfAsJson(name, debug) for name in listOfFiles]
    else:
        converted = getRtfAsJson(listOfFiles[0], debug)

    getTextFile(outputName, getAsJson(converted))

def main(argv):
    """ Command line entry point

    Options:
        -p, --path    directory that is searched (default: current working directory)
        -f, --file    pattern of the files to convert (default: FileRules' default)
        -o, --output  name of the JSON file to write (default: OUTPUT_NAME)
        -d, --debug   also write the intermediate HTML files

    Args:
        argv: list of command line arguments without the program name
    """
    fileRules = FileRules()
    outputName = OUTPUT_NAME
    debug = DEBUG
    try:
        # "debug" must be registered here, otherwise --debug is rejected
        opts, args = getopt.getopt(argv, "p:f:o:d", ["path=", "file=", "output=", "debug"])
    except getopt.GetoptError:
        print("error with the arguments")
        sys.exit(2)

    for opt, arg in opts:
        if opt in ("-p", "--path"):
            fileRules.path = arg
        elif opt in ("-f", "--file"):
            fileRules.filePattern = arg
        elif opt in ("-o", "--output"):
            outputName = arg
        elif opt in ("-d", "--debug"):
            debug = True

    # fileList yields bare names; join them with the path so that files in a
    # directory other than the current one can be opened.
    listOfFiles = [join(fileRules.path, name) for name in fileList(fileRules)]

    if not listOfFiles:
        print("No files matching " + fileRules.filePattern + " in " + fileRules.path)
        return

    try:
        outputFile(outputName, listOfFiles, debug)
    except Exception as e:
        # top-level boundary: report the problem instead of a traceback
        print("Error With outputting the file")
        print(e)


# Run the converter only when this file is executed as a script; the
# command line arguments are passed on without the program name.
if __name__ == "__main__":
    main(sys.argv[1:])
	import os, shutil, sys, getopt, fnmatch, json
	from os.path import join
	from HTMLParser import HTMLParser
	from pyth.plugins.rtf15.reader import Rtf15Reader
	from pyth.plugins.xhtml.writer import XHTMLWriter
	from bs4 import BeautifulSoup

	# constants

	# Template for one converted document; getDictFromHtml copies it and fills it in.
	DEFAULT_DICTIONARY = {
	    'chronoNumber': '',
	    'chronoContent': ' ',
	    'metadata': {
	        'restrictedChrono': False,
	        'office': '',
	        'primaryAccountingManual': '',
	        'alternateAccountingManuals': [],
	        'industries': [],
	        'assignedTo': '',
	        'dateCleared': '',
	        'clearingPartner': '',
	        'individual': '',
	        'priorConsultations': [],
	    },
	}

	# CSS selector (BeautifulSoup syntax) of the paragraph the metadata is read from.
	POSITION_OF_DATA = 'div > p:nth-of-type(1) + p'

	# Name of the JSON file that is written when no other name is given.
	OUTPUT_NAME = 'output.json'

	# Default of the debug switch; when on, the intermediate HTML files are written too.
	DEBUG = False

	class FileRules(object):
	    """ Used for easily passing rules around for how to read and write files

	    Inherits from object on purpose: on Python 2 a property setter is only
	    honoured for new-style classes.

	    Attributes:
	        path: string, directory that is searched (starts as the current
	            working directory)
	        filePattern: string, fnmatch-style pattern used to find files; it is
	            stored and returned in lower case
	    """

	    def __init__(self, filePattern = "*.rtf"):
	        """ Inits FileRules with a filePattern"""
	        self.path = os.getcwd()
	        self.filePattern = filePattern

	    @property
	    def filePattern(self):
	        """ The pattern used to find files, in lower case"""
	        return self._filePattern.lower()

	    @filePattern.setter
	    def filePattern(self, new_filePattern):
	        # type(u"") is unicode on Python 2 and str on Python 3, so text of
	        # either kind is accepted on both versions.
	        if not isinstance(new_filePattern, (str, type(u""))):
	            raise Exception("invalid value for filePattern")
	        self._filePattern = new_filePattern.lower()

	    def PathAndType(self):
	        """ Returns path and filePattern joined into one search pattern

	        os.path.join drops the leading "/" whenever path is absolute.
	        """
	        return join("/", self.path, self.filePattern)

	def fileList(fileRules):
	    """ Returns the names of the files in fileRules.path that match the pattern

	    Names are compared without regard to case, so "a.rtf", "A.RTF" and
	    "A.Rtf" all match the pattern "*.rtf".

	    Args:
	        fileRules. A FileRules object

	    Returns:
	        list of the matching names exactly as os.listdir reports them (they
	        are not joined with fileRules.path)
	    """
	    # Lower-case both sides and compare case-sensitively; this covers mixed
	    # case names on every platform.
	    pattern = fileRules.filePattern.lower()
	    return [
	        entry
	        for entry in os.listdir(fileRules.path)
	        if fnmatch.fnmatchcase(entry.lower(), pattern)
	    ]

	def getDecodedText(file):
	    """ Reads a UTF-8 encoded file and returns its decoded text

	    The lines keep their line endings and are joined with a single space, so
	    every line after the first starts with one extra space.

	    Args:
	        file: string of the path and filename to a file

	    Returns: the decoded text of the file

	    Raises:
	        UnicodeDecodeError: the file is not valid UTF-8
	    """
	    # Read bytes and decode them explicitly: this behaves the same on
	    # Python 2 and 3, and the with block closes the handle again.
	    with open(file, 'rb') as handle:
	        decodedLines = [line.decode('utf-8') for line in handle]

	    return ' '.join(decodedLines)

	def getHtmlFromRtf(text):
	    """ Turns RTF content into XHTML

	    Args:
	        text: the RTF source; it is handed unchanged to Rtf15Reader.read
	            (getRtfAsJson passes an open file object)

	    Returns:
	        the result of reading the XHTMLWriter output, i.e. the XHTML markup
	    """
	    rtfDocument = Rtf15Reader.read(text)
	    xhtmlStream = XHTMLWriter.write(rtfDocument, pretty=True)

	    return xhtmlStream.read()

	def getCleanedHtml(html):
	    """ Removes undesirable markup from html

	    Three clean-ups are applied, in this order:
	      1. a p whose first u child holds only upper-case text becomes an h3
	      2. u tags nested in strong > em are unwrapped
	      3. p tags without any text are removed

	    Args:
	        html: the markup to clean (parsed with BeautifulSoup's 'html.parser')

	    returns:
	        the cleaned markup as a string
	    """
	    soup = BeautifulSoup(html, 'html.parser')

	    # replace p > u with an h3
	    for title in soup.select('p > u:nth-of-type(1)'):
	        text = title.string
	        # .string is None when the u does not hold exactly one piece of text
	        # (empty, or several children); such a u cannot be a title.
	        if text is None or text != text.upper():
	            continue
	        h3 = soup.new_tag('h3')
	        h3.string = text
	        title.find_parent('p').replace_with(h3)

	    # remove u
	    for underline in soup.select('strong em u'):
	        underline.unwrap()

	    # remove empty p
	    for p in soup.find_all('p'):
	        if not p.text:
	            p.decompose()

	    return str(soup)

	def getTextFile(fileName, text = ''):
	    """ creates a file with text in it

	    An existing file of the same name is overwritten.

	    Args:
	        fileName: Name of the file (e.g. foo.html)
	        text: Text that should be be in the file

	    Returns:
	        the file object that was written; it is already closed
	    """
	    # the with block closes the file even when the write fails
	    with open(fileName, 'w') as outputFile:
	        outputFile.write(text)

	    return outputFile

	def extractMetadata(soup, dataPosition):
	    """ Reads the metadata fields out of the metadata element

	    The children of the first element matched by dataPosition are scanned.
	    When the text of a child contains one of the known labels, the child
	    that follows it is stored (as a stripped string) under the matching key.

	    Args:
	        soup: a BeautifulSoup object
	        dataPosition: string. CSS selector of the element holding the metadata

	    Returns:
	        a new dict with the keys of DEFAULT_DICTIONARY['metadata']

	    Raises:
	        IndexError: dataPosition matches nothing
	    """
	    # (label, key, value is collected in a list); the first label found in
	    # the text wins, so the order of the rows matters.
	    # NOTE(review): priorConsultations defaults to a list but is assigned a
	    # plain string here -- confirm which shape consumers expect.
	    fieldRules = (
	        ("OFFICE", 'office', False),
	        ("PRIMARY ACCMAN", 'primaryAccountingManual', False),
	        ("ALTERNATE ACCMAN", 'alternateAccountingManuals', True),
	        ("INDUSTRY IDENTIFIER", 'industries', True),
	        ("ASSIGNED TO", 'assignedTo', False),
	        ("DATE CLEARED", 'dateCleared', False),
	        ("CLEARING PARTNER", 'clearingPartner', False),
	        ("INDIVIDUAL", 'individual', False),
	        ("PRIOR CONSULTATIONS", 'priorConsultations', False),
	    )

	    metadataContents = soup.select(dataPosition)[0].contents

	    # Copy the list defaults as well; a shallow dict copy would share them,
	    # so appended values would leak into the defaults and into later calls.
	    metadata = dict(
	        (key, list(value) if isinstance(value, list) else value)
	        for key, value in DEFAULT_DICTIONARY['metadata'].items()
	    )

	    lastIndex = len(metadataContents) - 1
	    for idx, metadataTag in enumerate(metadataContents):
	        label = metadataTag.string
	        # a label in last position has no value node after it
	        if label is None or idx == lastIndex:
	            continue
	        for marker, key, isList in fieldRules:
	            if marker in label:
	                value = str(metadataContents[idx + 1]).strip()
	                if isList:
	                    metadata[key].append(value)
	                else:
	                    metadata[key] = value
	                break

	    return metadata

	def extractContent(soup):
	    """ Reduces the soup to its content part and renders it

	    The first p of the soup is removed and the first div is unwrapped; the
	    soup that is passed in is changed by this.

	    Args:
	        soup: A Beautiful soup object
	    Returns:
	        a prettified string of HTML
	    """
	    firstParagraph = soup.find('p')
	    firstParagraph.decompose()

	    wrapperDiv = soup.find('div')
	    wrapperDiv.unwrap()

	    return str(soup.prettify(formatter='html'))

	def getDictFromHtml(file, defaultDictionary = DEFAULT_DICTIONARY, dataPosition = POSITION_OF_DATA):
	    """ Builds the dictionary for one document from its HTML

	    Args:
	        file: the HTML markup to be parsed (getRtfAsJson passes the cleaned HTML)
	        defaultDictionary: Dictionary. presents the data model this becomes
	        dataPosition: string. CSS selector that's BeautifulSoup compliant for where to find data
	    Returns:
	        a copy of defaultDictionary with 'metadata', 'chronoNumber' and
	        'chronoContent' filled in
	    """
	    # extractContent changes the soup it receives, so the markup is parsed
	    # twice: once for the metadata, once for the content.
	    metadataSoup = BeautifulSoup(file, 'html.parser')
	    contentSoup = BeautifulSoup(file, 'html.parser')

	    result = dict(defaultDictionary)
	    result['metadata'] = extractMetadata(metadataSoup, dataPosition)
	    content = extractContent(contentSoup)
	    # the chrono number is the second child of the metadata element
	    dataElement = metadataSoup.select(dataPosition)[0]
	    result['chronoNumber'] = dataElement.contents[1].strip()
	    result['chronoContent'] = content

	    return result

	def getAsJson(dictionary):
	    """ Serializes dictionary (anything json.dumps accepts) to a JSON string"""
	    serialized = json.dumps(dictionary)
	    return serialized

	def getRtfAsJson(file, debug = False):
	    """ Converts one RTF file into the dictionary that is later dumped as JSON

	    The decoded RTF is staged in a scratch file named 'preHtml' in the
	    current working directory, because getHtmlFromRtf is handed a file
	    object. The scratch file is closed and removed again before returning.

	    Args:
	        file: string, name of the file
	        debug: when true, the raw and the cleaned HTML are also written to
	            <file>.html and <file>.cleaned.html
	    Returns:
	        dict
	    """
	    decodedText = getDecodedText(file)

	    preHtmlFile = open('preHtml', 'w+')
	    try:
	        preHtmlFile.write(decodedText)
	        # rewind so that reading starts at the first character, not at the end
	        preHtmlFile.seek(0)
	        html = getHtmlFromRtf(preHtmlFile)
	    finally:
	        preHtmlFile.close()
	        try:
	            os.remove('preHtml')
	        except OSError:
	            # best-effort clean-up; a left-over scratch file is harmless
	            pass

	    cleanedHtml = getCleanedHtml(html)

	    if debug:
	        # For Debugging
	        getTextFile(file + '.html', str(html))
	        getTextFile(file + '.cleaned.html', str(cleanedHtml))

	    return getDictFromHtml(cleanedHtml)

	def outputFile(outputName, listOfFiles, debug = DEBUG):
	    """ Converts the files and writes the result to outputName as JSON

	    A single file is written as one JSON object, several files are written
	    as a JSON array of objects.

	    Args:
	        outputName. String. Name of the file to be created
	        listOfFiles: List. The files that need to be converted
	        debug: passed on to getRtfAsJson for every file

	    Raises:
	        IndexError: listOfFiles is empty
	    """
	    if not listOfFiles:
	        # IndexError rather than ValueError: it is the exception an empty
	        # list has always produced here, now with a usable message.
	        raise IndexError("no files to convert")

	    if len(listOfFiles) > 1:
	        converted = [getRtfAsJson(name, debug) for name in listOfFiles]
	    else:
	        converted = getRtfAsJson(listOfFiles[0], debug)

	    getTextFile(outputName, getAsJson(converted))

	def main(argv):
	    """ Command line entry point

	    Options:
	        -p, --path    directory that is searched (default: current working directory)
	        -f, --file    pattern of the files to convert (default: FileRules' default)
	        -o, --output  name of the JSON file to write (default: OUTPUT_NAME)
	        -d, --debug   also write the intermediate HTML files

	    Args:
	        argv: list of command line arguments without the program name
	    """
	    fileRules = FileRules()
	    outputName = OUTPUT_NAME
	    debug = DEBUG
	    try:
	        # "debug" must be registered here, otherwise --debug is rejected
	        opts, args = getopt.getopt(argv, "p:f:o:d", ["path=", "file=", "output=", "debug"])
	    except getopt.GetoptError:
	        print("error with the arguments")
	        sys.exit(2)

	    for opt, arg in opts:
	        if opt in ("-p", "--path"):
	            fileRules.path = arg
	        elif opt in ("-f", "--file"):
	            fileRules.filePattern = arg
	        elif opt in ("-o", "--output"):
	            outputName = arg
	        elif opt in ("-d", "--debug"):
	            debug = True

	    # fileList yields bare names; join them with the path so that files in a
	    # directory other than the current one can be opened.
	    listOfFiles = [join(fileRules.path, name) for name in fileList(fileRules)]

	    if not listOfFiles:
	        print("No files matching " + fileRules.filePattern + " in " + fileRules.path)
	        return

	    try:
	        outputFile(outputName, listOfFiles, debug)
	    except Exception as e:
	        # top-level boundary: report the problem instead of a traceback
	        print("Error With outputting the file")
	        print(e)


	# Run the converter only when this file is executed as a script; the
	# command line arguments are passed on without the program name.
	if __name__ == "__main__":
	    main(sys.argv[1:])