Instantly share code, notes, and snippets.

Embed
What would you like to do?
Function to extract text in RTF files.
# -*- coding: utf-8 -*-
"""
Extract text in RTF Files. Refactored to use with Python 3.x
Source:
http://stackoverflow.com/a/188877
Code created by Markus Jarderot: http://mizardx.blogspot.com
"""
import re
def striprtf(text):
pattern = re.compile(r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", re.I)
# control words which specify a "destionation".
destinations = frozenset((
'aftncn','aftnsep','aftnsepc','annotation','atnauthor','atndate','atnicn','atnid',
'atnparent','atnref','atntime','atrfend','atrfstart','author','background',
'bkmkend','bkmkstart','blipuid','buptim','category','colorschememapping',
'colortbl','comment','company','creatim','datafield','datastore','defchp','defpap',
'do','doccomm','docvar','dptxbxtext','ebcend','ebcstart','factoidname','falt',
'fchars','ffdeftext','ffentrymcr','ffexitmcr','ffformat','ffhelptext','ffl',
'ffname','ffstattext','field','file','filetbl','fldinst','fldrslt','fldtype',
'fname','fontemb','fontfile','fonttbl','footer','footerf','footerl','footerr',
'footnote','formfield','ftncn','ftnsep','ftnsepc','g','generator','gridtbl',
'header','headerf','headerl','headerr','hl','hlfr','hlinkbase','hlloc','hlsrc',
'hsv','htmltag','info','keycode','keywords','latentstyles','lchars','levelnumbers',
'leveltext','lfolevel','linkval','list','listlevel','listname','listoverride',
'listoverridetable','listpicture','liststylename','listtable','listtext',
'lsdlockedexcept','macc','maccPr','mailmerge','maln','malnScr','manager','margPr',
'mbar','mbarPr','mbaseJc','mbegChr','mborderBox','mborderBoxPr','mbox','mboxPr',
'mchr','mcount','mctrlPr','md','mdeg','mdegHide','mden','mdiff','mdPr','me',
'mendChr','meqArr','meqArrPr','mf','mfName','mfPr','mfunc','mfuncPr','mgroupChr',
'mgroupChrPr','mgrow','mhideBot','mhideLeft','mhideRight','mhideTop','mhtmltag',
'mlim','mlimloc','mlimlow','mlimlowPr','mlimupp','mlimuppPr','mm','mmaddfieldname',
'mmath','mmathPict','mmathPr','mmaxdist','mmc','mmcJc','mmconnectstr',
'mmconnectstrdata','mmcPr','mmcs','mmdatasource','mmheadersource','mmmailsubject',
'mmodso','mmodsofilter','mmodsofldmpdata','mmodsomappedname','mmodsoname',
'mmodsorecipdata','mmodsosort','mmodsosrc','mmodsotable','mmodsoudl',
'mmodsoudldata','mmodsouniquetag','mmPr','mmquery','mmr','mnary','mnaryPr',
'mnoBreak','mnum','mobjDist','moMath','moMathPara','moMathParaPr','mopEmu',
'mphant','mphantPr','mplcHide','mpos','mr','mrad','mradPr','mrPr','msepChr',
'mshow','mshp','msPre','msPrePr','msSub','msSubPr','msSubSup','msSubSupPr','msSup',
'msSupPr','mstrikeBLTR','mstrikeH','mstrikeTLBR','mstrikeV','msub','msubHide',
'msup','msupHide','mtransp','mtype','mvertJc','mvfmf','mvfml','mvtof','mvtol',
'mzeroAsc','mzeroDesc','mzeroWid','nesttableprops','nextfile','nonesttables',
'objalias','objclass','objdata','object','objname','objsect','objtime','oldcprops',
'oldpprops','oldsprops','oldtprops','oleclsid','operator','panose','password',
'passwordhash','pgp','pgptbl','picprop','pict','pn','pnseclvl','pntext','pntxta',
'pntxtb','printim','private','propname','protend','protstart','protusertbl','pxe',
'result','revtbl','revtim','rsidtbl','rxe','shp','shpgrp','shpinst',
'shppict','shprslt','shptxt','sn','sp','staticval','stylesheet','subject','sv',
'svb','tc','template','themedata','title','txe','ud','upr','userprops',
'wgrffmtfilter','windowcaption','writereservation','writereservhash','xe','xform',
'xmlattrname','xmlattrvalue','xmlclose','xmlname','xmlnstbl',
'xmlopen',
))
# Translation of some special characters.
specialchars = {
'par': '\n',
'sect': '\n\n',
'page': '\n\n',
'line': '\n',
'tab': '\t',
'emdash': '\u2014',
'endash': '\u2013',
'emspace': '\u2003',
'enspace': '\u2002',
'qmspace': '\u2005',
'bullet': '\u2022',
'lquote': '\u2018',
'rquote': '\u2019',
'ldblquote': '\201C',
'rdblquote': '\u201D',
}
stack = []
ignorable = False # Whether this group (and all inside it) are "ignorable".
ucskip = 1 # Number of ASCII characters to skip after a unicode character.
curskip = 0 # Number of ASCII characters left to skip
out = [] # Output buffer.
for match in pattern.finditer(text.decode()):
word,arg,hex,char,brace,tchar = match.groups()
if brace:
curskip = 0
if brace == '{':
# Push state
stack.append((ucskip,ignorable))
elif brace == '}':
# Pop state
ucskip,ignorable = stack.pop()
elif char: # \x (not a letter)
curskip = 0
if char == '~':
if not ignorable:
out.append('\xA0')
elif char in '{}\\':
if not ignorable:
out.append(char)
elif char == '*':
ignorable = True
elif word: # \foo
curskip = 0
if word in destinations:
ignorable = True
elif ignorable:
pass
elif word in specialchars:
out.append(specialchars[word])
elif word == 'uc':
ucskip = int(arg)
elif word == 'u':
c = int(arg)
if c < 0: c += 0x10000
if c > 127: out.append(chr(c)) #NOQA
else: out.append(chr(c))
curskip = ucskip
elif hex: # \'xx
if curskip > 0:
curskip -= 1
elif not ignorable:
c = int(hex,16)
if c > 127: out.append(chr(c)) #NOQA
else: out.append(chr(c))
elif tchar:
if curskip > 0:
curskip -= 1
elif not ignorable:
out.append(tchar)
return ''.join(out)
@lr1980

This comment has been minimized.

lr1980 commented Mar 21, 2017

license? :)

@cossio

This comment has been minimized.

cossio commented Nov 16, 2018

@lr1980 WTFPL? ;)

@cossio

This comment has been minimized.

cossio commented Nov 16, 2018

Why text.decode() instead of just text? This way I can't pass a string to it. What is the argument supposed to be?
Also this is striping all newlines from my rtf file.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment