msakuta/extract_rtf.py

## extract_rtf.py
# -*- coding: utf-8 -*-

"""
Extract text in RTF Files. Refactored to use with Python 3.x
Source:
    http://stackoverflow.com/a/188877

Code created by Markus Jarderot: http://mizardx.blogspot.com

Modified to use with Python 2.7
"""

import sys
import re
import codecs

def striprtf(text):
   pattern = re.compile(r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", re.I)
   # control words which specify a "destionation".
   destinations = frozenset((
      'aftncn','aftnsep','aftnsepc','annotation','atnauthor','atndate','atnicn','atnid',
      'atnparent','atnref','atntime','atrfend','atrfstart','author','background',
      'bkmkend','bkmkstart','blipuid','buptim','category','colorschememapping',
      'colortbl','comment','company','creatim','datafield','datastore','defchp','defpap',
      'do','doccomm','docvar','dptxbxtext','ebcend','ebcstart','factoidname','falt',
      'fchars','ffdeftext','ffentrymcr','ffexitmcr','ffformat','ffhelptext','ffl',
      'ffname','ffstattext','field','file','filetbl','fldinst','fldrslt','fldtype',
      'fname','fontemb','fontfile','fonttbl','footer','footerf','footerl','footerr',
      'footnote','formfield','ftncn','ftnsep','ftnsepc','g','generator','gridtbl',
      'header','headerf','headerl','headerr','hl','hlfr','hlinkbase','hlloc','hlsrc',
      'hsv','htmltag','info','keycode','keywords','latentstyles','lchars','levelnumbers',
      'leveltext','lfolevel','linkval','list','listlevel','listname','listoverride',
      'listoverridetable','listpicture','liststylename','listtable','listtext',
      'lsdlockedexcept','macc','maccPr','mailmerge','maln','malnScr','manager','margPr',
      'mbar','mbarPr','mbaseJc','mbegChr','mborderBox','mborderBoxPr','mbox','mboxPr',
      'mchr','mcount','mctrlPr','md','mdeg','mdegHide','mden','mdiff','mdPr','me',
      'mendChr','meqArr','meqArrPr','mf','mfName','mfPr','mfunc','mfuncPr','mgroupChr',
      'mgroupChrPr','mgrow','mhideBot','mhideLeft','mhideRight','mhideTop','mhtmltag',
      'mlim','mlimloc','mlimlow','mlimlowPr','mlimupp','mlimuppPr','mm','mmaddfieldname',
      'mmath','mmathPict','mmathPr','mmaxdist','mmc','mmcJc','mmconnectstr',
      'mmconnectstrdata','mmcPr','mmcs','mmdatasource','mmheadersource','mmmailsubject',
      'mmodso','mmodsofilter','mmodsofldmpdata','mmodsomappedname','mmodsoname',
      'mmodsorecipdata','mmodsosort','mmodsosrc','mmodsotable','mmodsoudl',
      'mmodsoudldata','mmodsouniquetag','mmPr','mmquery','mmr','mnary','mnaryPr',
      'mnoBreak','mnum','mobjDist','moMath','moMathPara','moMathParaPr','mopEmu',
      'mphant','mphantPr','mplcHide','mpos','mr','mrad','mradPr','mrPr','msepChr',
      'mshow','mshp','msPre','msPrePr','msSub','msSubPr','msSubSup','msSubSupPr','msSup',
      'msSupPr','mstrikeBLTR','mstrikeH','mstrikeTLBR','mstrikeV','msub','msubHide',
      'msup','msupHide','mtransp','mtype','mvertJc','mvfmf','mvfml','mvtof','mvtol',
      'mzeroAsc','mzeroDesc','mzeroWid','nesttableprops','nextfile','nonesttables',
      'objalias','objclass','objdata','object','objname','objsect','objtime','oldcprops',
      'oldpprops','oldsprops','oldtprops','oleclsid','operator','panose','password',
      'passwordhash','pgp','pgptbl','picprop','pict','pn','pnseclvl','pntext','pntxta',
      'pntxtb','printim','private','propname','protend','protstart','protusertbl','pxe',
      'result','revtbl','revtim','rsidtbl','rxe','shp','shpgrp','shpinst',
      'shppict','shprslt','shptxt','sn','sp','staticval','stylesheet','subject','sv',
      'svb','tc','template','themedata','title','txe','ud','upr','userprops',
      'wgrffmtfilter','windowcaption','writereservation','writereservhash','xe','xform',
      'xmlattrname','xmlattrvalue','xmlclose','xmlname','xmlnstbl',
      'xmlopen',
   ))
   # Translation of some special characters.
   specialchars = {
      'par': '\n',
      'sect': '\n\n',
      'page': '\n\n',
      'line': '\n',
      'tab': '\t',
      'emdash': '\u2014',
      'endash': '\u2013',
      'emspace': '\u2003',
      'enspace': '\u2002',
      'qmspace': '\u2005',
      'bullet': '\u2022',
      'lquote': '\u2018',
      'rquote': '\u2019',
      'ldblquote': '\201C',
      'rdblquote': '\u201D',
   }
   stack = []
   ignorable = False       # Whether this group (and all inside it) are "ignorable".
   ucskip = 1              # Number of ASCII characters to skip after a unicode character.
   curskip = 0             # Number of ASCII characters left to skip
   out = []                # Output buffer.
   for match in pattern.finditer(text.decode()):
      word,arg,hex,char,brace,tchar = match.groups()
      if brace:
         curskip = 0
         if brace == '{':
            # Push state
            stack.append((ucskip,ignorable))
         elif brace == '}':
            # Pop state
            ucskip,ignorable = stack.pop()
      elif char: # \x (not a letter)
         curskip = 0
         if char == '~':
            if not ignorable:
                out.append(unicode('\xA0', encoding='latin1'))
         elif char in '{}\\':
            if not ignorable:
               out.append(char)
         elif char == '*':
            ignorable = True
      elif word: # \foo
         curskip = 0
         if word in destinations:
            ignorable = True
         elif ignorable:
            pass
         elif word in specialchars:
            out.append(specialchars[word])
         elif word == 'uc':
            ucskip = int(arg)
         elif word == 'u':
            c = int(arg)
            if c < 0: c += 0x10000
            if c > 127: out.append(unichr(c)) #NOQA
            else: out.append(chr(c))
            curskip = ucskip
      elif hex: # \'xx
         if curskip > 0:
            curskip -= 1
         elif not ignorable:
            c = int(hex,16)
            if c > 127: out.append(unichr(c)) #NOQA
            else: out.append(chr(c))
      elif tchar:
         if curskip > 0:
            curskip -= 1
         elif not ignorable:
            out.append(tchar)
   return unicode('').join(out)

if len(sys.argv) < 2:
	print('insufficient args')
	exit()

sys.stdout = codecs.getwriter('utf_8')(sys.stdout)
with open(sys.argv[1]) as f:
	print(striprtf(f.read()))
	# -- coding: utf-8 --

	"""
	Extract text in RTF Files. Refactored to use with Python 3.x
	Source:
	http://stackoverflow.com/a/188877

	Code created by Markus Jarderot: http://mizardx.blogspot.com

	Modified to use with Python 2.7
	"""

	import sys
	import re
	import codecs

	def striprtf(text):
	pattern = re.compile(r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?\|\\'([0-9a-f]{2})\|\\([^a-z])\|([{}])\|[\r\n]+\|(.)", re.I)
	# control words which specify a "destionation".
	destinations = frozenset((
	'aftncn','aftnsep','aftnsepc','annotation','atnauthor','atndate','atnicn','atnid',
	'atnparent','atnref','atntime','atrfend','atrfstart','author','background',
	'bkmkend','bkmkstart','blipuid','buptim','category','colorschememapping',
	'colortbl','comment','company','creatim','datafield','datastore','defchp','defpap',
	'do','doccomm','docvar','dptxbxtext','ebcend','ebcstart','factoidname','falt',
	'fchars','ffdeftext','ffentrymcr','ffexitmcr','ffformat','ffhelptext','ffl',
	'ffname','ffstattext','field','file','filetbl','fldinst','fldrslt','fldtype',
	'fname','fontemb','fontfile','fonttbl','footer','footerf','footerl','footerr',
	'footnote','formfield','ftncn','ftnsep','ftnsepc','g','generator','gridtbl',
	'header','headerf','headerl','headerr','hl','hlfr','hlinkbase','hlloc','hlsrc',
	'hsv','htmltag','info','keycode','keywords','latentstyles','lchars','levelnumbers',
	'leveltext','lfolevel','linkval','list','listlevel','listname','listoverride',
	'listoverridetable','listpicture','liststylename','listtable','listtext',
	'lsdlockedexcept','macc','maccPr','mailmerge','maln','malnScr','manager','margPr',
	'mbar','mbarPr','mbaseJc','mbegChr','mborderBox','mborderBoxPr','mbox','mboxPr',
	'mchr','mcount','mctrlPr','md','mdeg','mdegHide','mden','mdiff','mdPr','me',
	'mendChr','meqArr','meqArrPr','mf','mfName','mfPr','mfunc','mfuncPr','mgroupChr',
	'mgroupChrPr','mgrow','mhideBot','mhideLeft','mhideRight','mhideTop','mhtmltag',
	'mlim','mlimloc','mlimlow','mlimlowPr','mlimupp','mlimuppPr','mm','mmaddfieldname',
	'mmath','mmathPict','mmathPr','mmaxdist','mmc','mmcJc','mmconnectstr',
	'mmconnectstrdata','mmcPr','mmcs','mmdatasource','mmheadersource','mmmailsubject',
	'mmodso','mmodsofilter','mmodsofldmpdata','mmodsomappedname','mmodsoname',
	'mmodsorecipdata','mmodsosort','mmodsosrc','mmodsotable','mmodsoudl',
	'mmodsoudldata','mmodsouniquetag','mmPr','mmquery','mmr','mnary','mnaryPr',
	'mnoBreak','mnum','mobjDist','moMath','moMathPara','moMathParaPr','mopEmu',
	'mphant','mphantPr','mplcHide','mpos','mr','mrad','mradPr','mrPr','msepChr',
	'mshow','mshp','msPre','msPrePr','msSub','msSubPr','msSubSup','msSubSupPr','msSup',
	'msSupPr','mstrikeBLTR','mstrikeH','mstrikeTLBR','mstrikeV','msub','msubHide',
	'msup','msupHide','mtransp','mtype','mvertJc','mvfmf','mvfml','mvtof','mvtol',
	'mzeroAsc','mzeroDesc','mzeroWid','nesttableprops','nextfile','nonesttables',
	'objalias','objclass','objdata','object','objname','objsect','objtime','oldcprops',
	'oldpprops','oldsprops','oldtprops','oleclsid','operator','panose','password',
	'passwordhash','pgp','pgptbl','picprop','pict','pn','pnseclvl','pntext','pntxta',
	'pntxtb','printim','private','propname','protend','protstart','protusertbl','pxe',
	'result','revtbl','revtim','rsidtbl','rxe','shp','shpgrp','shpinst',
	'shppict','shprslt','shptxt','sn','sp','staticval','stylesheet','subject','sv',
	'svb','tc','template','themedata','title','txe','ud','upr','userprops',
	'wgrffmtfilter','windowcaption','writereservation','writereservhash','xe','xform',
	'xmlattrname','xmlattrvalue','xmlclose','xmlname','xmlnstbl',
	'xmlopen',
	))
	# Translation of some special characters.
	specialchars = {
	'par': '\n',
	'sect': '\n\n',
	'page': '\n\n',
	'line': '\n',
	'tab': '\t',
	'emdash': '\u2014',
	'endash': '\u2013',
	'emspace': '\u2003',
	'enspace': '\u2002',
	'qmspace': '\u2005',
	'bullet': '\u2022',
	'lquote': '\u2018',
	'rquote': '\u2019',
	'ldblquote': '\201C',
	'rdblquote': '\u201D',
	}
	stack = []
	ignorable = False # Whether this group (and all inside it) are "ignorable".
	ucskip = 1 # Number of ASCII characters to skip after a unicode character.
	curskip = 0 # Number of ASCII characters left to skip
	out = [] # Output buffer.
	for match in pattern.finditer(text.decode()):
	word,arg,hex,char,brace,tchar = match.groups()
	if brace:
	curskip = 0
	if brace == '{':
	# Push state
	stack.append((ucskip,ignorable))
	elif brace == '}':
	# Pop state
	ucskip,ignorable = stack.pop()
	elif char: # \x (not a letter)
	curskip = 0
	if char == '~':
	if not ignorable:
	out.append(unicode('\xA0', encoding='latin1'))
	elif char in '{}\\':
	if not ignorable:
	out.append(char)
	elif char == '*':
	ignorable = True
	elif word: # \foo
	curskip = 0
	if word in destinations:
	ignorable = True
	elif ignorable:
	pass
	elif word in specialchars:
	out.append(specialchars[word])
	elif word == 'uc':
	ucskip = int(arg)
	elif word == 'u':
	c = int(arg)
	if c < 0: c += 0x10000
	if c > 127: out.append(unichr(c)) #NOQA
	else: out.append(chr(c))
	curskip = ucskip
	elif hex: # \'xx
	if curskip > 0:
	curskip -= 1
	elif not ignorable:
	c = int(hex,16)
	if c > 127: out.append(unichr(c)) #NOQA
	else: out.append(chr(c))
	elif tchar:
	if curskip > 0:
	curskip -= 1
	elif not ignorable:
	out.append(tchar)
	return unicode('').join(out)

	if len(sys.argv) < 2:
	print('insufficient args')
	exit()

	sys.stdout = codecs.getwriter('utf_8')(sys.stdout)
	with open(sys.argv[1]) as f:
	print(striprtf(f.read()))