Last active
August 20, 2016 22:30
-
-
Save ansonl/d619c4d4ec86d9578abd0d1ca42e7ffa to your computer and use it in GitHub Desktop.
Function to extract text in RTF files. Modified to show the text of hyperlinks by removing 'field' and 'fldrslt' from ignored destination words.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
Extract text in RTF Files. Refactored to use with Python 3.x | |
Source: | |
http://stackoverflow.com/a/188877 | |
Code created by Markus Jarderot: http://mizardx.blogspot.com | |
""" | |
import re | |
def striprtf(text): | |
pattern = re.compile(r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", re.I) | |
# control words which specify a "destination". | |
destinations = frozenset(( | |
'aftncn','aftnsep','aftnsepc','annotation','atnauthor','atndate','atnicn','atnid', | |
'atnparent','atnref','atntime','atrfend','atrfstart','author','background', | |
'bkmkend','bkmkstart','blipuid','buptim','category','colorschememapping', | |
'colortbl','comment','company','creatim','datafield','datastore','defchp','defpap', | |
'do','doccomm','docvar','dptxbxtext','ebcend','ebcstart','factoidname','falt', | |
'fchars','ffdeftext','ffentrymcr','ffexitmcr','ffformat','ffhelptext','ffl', | |
'ffname','ffstattext','file','filetbl','fldinst','fldtype', | |
'fname','fontemb','fontfile','fonttbl','footer','footerf','footerl','footerr', | |
'footnote','formfield','ftncn','ftnsep','ftnsepc','g','generator','gridtbl', | |
'header','headerf','headerl','headerr','hl','hlfr','hlinkbase','hlloc','hlsrc', | |
'hsv','htmltag','info','keycode','keywords','latentstyles','lchars','levelnumbers', | |
'leveltext','lfolevel','linkval','list','listlevel','listname','listoverride', | |
'listoverridetable','listpicture','liststylename','listtable','listtext', | |
'lsdlockedexcept','macc','maccPr','mailmerge','maln','malnScr','manager','margPr', | |
'mbar','mbarPr','mbaseJc','mbegChr','mborderBox','mborderBoxPr','mbox','mboxPr', | |
'mchr','mcount','mctrlPr','md','mdeg','mdegHide','mden','mdiff','mdPr','me', | |
'mendChr','meqArr','meqArrPr','mf','mfName','mfPr','mfunc','mfuncPr','mgroupChr', | |
'mgroupChrPr','mgrow','mhideBot','mhideLeft','mhideRight','mhideTop','mhtmltag', | |
'mlim','mlimloc','mlimlow','mlimlowPr','mlimupp','mlimuppPr','mm','mmaddfieldname', | |
'mmath','mmathPict','mmathPr','mmaxdist','mmc','mmcJc','mmconnectstr', | |
'mmconnectstrdata','mmcPr','mmcs','mmdatasource','mmheadersource','mmmailsubject', | |
'mmodso','mmodsofilter','mmodsofldmpdata','mmodsomappedname','mmodsoname', | |
'mmodsorecipdata','mmodsosort','mmodsosrc','mmodsotable','mmodsoudl', | |
'mmodsoudldata','mmodsouniquetag','mmPr','mmquery','mmr','mnary','mnaryPr', | |
'mnoBreak','mnum','mobjDist','moMath','moMathPara','moMathParaPr','mopEmu', | |
'mphant','mphantPr','mplcHide','mpos','mr','mrad','mradPr','mrPr','msepChr', | |
'mshow','mshp','msPre','msPrePr','msSub','msSubPr','msSubSup','msSubSupPr','msSup', | |
'msSupPr','mstrikeBLTR','mstrikeH','mstrikeTLBR','mstrikeV','msub','msubHide', | |
'msup','msupHide','mtransp','mtype','mvertJc','mvfmf','mvfml','mvtof','mvtol', | |
'mzeroAsc','mzeroDesc','mzeroWid','nesttableprops','nextfile','nonesttables', | |
'objalias','objclass','objdata','object','objname','objsect','objtime','oldcprops', | |
'oldpprops','oldsprops','oldtprops','oleclsid','operator','panose','password', | |
'passwordhash','pgp','pgptbl','picprop','pict','pn','pnseclvl','pntext','pntxta', | |
'pntxtb','printim','private','propname','protend','protstart','protusertbl','pxe', | |
'result','revtbl','revtim','rsidtbl','rxe','shp','shpgrp','shpinst', | |
'shppict','shprslt','shptxt','sn','sp','staticval','stylesheet','subject','sv', | |
'svb','tc','template','themedata','title','txe','ud','upr','userprops', | |
'wgrffmtfilter','windowcaption','writereservation','writereservhash','xe','xform', | |
'xmlattrname','xmlattrvalue','xmlclose','xmlname','xmlnstbl', | |
'xmlopen', | |
)) | |
# Translation of some special characters. | |
specialchars = { | |
'par': '\n', | |
'sect': '\n\n', | |
'page': '\n\n', | |
'line': '\n', | |
'tab': '\t', | |
'emdash': '\u2014', | |
'endash': '\u2013', | |
'emspace': '\u2003', | |
'enspace': '\u2002', | |
'qmspace': '\u2005', | |
'bullet': '\u2022', | |
'lquote': '\u2018', | |
'rquote': '\u2019', | |
'ldblquote': '\201C', | |
'rdblquote': '\u201D', | |
} | |
stack = [] | |
ignorable = False # Whether this group (and all inside it) are "ignorable". | |
ucskip = 1 # Number of ASCII characters to skip after a unicode character. | |
curskip = 0 # Number of ASCII characters left to skip | |
out = [] # Output buffer. | |
for match in pattern.finditer(text.decode()): | |
word,arg,hex,char,brace,tchar = match.groups() | |
if brace: | |
curskip = 0 | |
if brace == '{': | |
# Push state | |
stack.append((ucskip,ignorable)) | |
elif brace == '}': | |
# Pop state | |
ucskip,ignorable = stack.pop() | |
elif char: # \x (not a letter) | |
curskip = 0 | |
if char == '~': | |
if not ignorable: | |
out.append('\xA0') | |
elif char in '{}\\': | |
if not ignorable: | |
out.append(char) | |
elif char == '*': | |
ignorable = True | |
elif word: # \foo | |
curskip = 0 | |
if word in destinations: | |
ignorable = True | |
elif ignorable: | |
pass | |
elif word in specialchars: | |
out.append(specialchars[word]) | |
elif word == 'uc': | |
ucskip = int(arg) | |
elif word == 'u': | |
c = int(arg) | |
if c < 0: c += 0x10000 | |
if c > 127: out.append(chr(c)) #NOQA | |
else: out.append(chr(c)) | |
curskip = ucskip | |
elif hex: # \'xx | |
if curskip > 0: | |
curskip -= 1 | |
elif not ignorable: | |
c = int(hex,16) | |
if c > 127: out.append(chr(c)) #NOQA | |
else: out.append(chr(c)) | |
elif tchar: | |
if curskip > 0: | |
curskip -= 1 | |
elif not ignorable: | |
out.append(tchar) | |
return ''.join(out) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment