benoit-pierre/rtfparse.py

## rtfparse.py
#!/usr/bin/env python

from collections import namedtuple
import sys
import re

from plover.dictionary.base import create_dictionary


# Tokenizer pattern. The named groups identify which alternative matched:
#   cchar          - a backslash followed by one of: - _ ~ \ { } *
#   cword / cparam - a backslash followed by letters, an optional (possibly
#                    negative) integer parameter, and one optional trailing
#                    space that is consumed as part of the match
#   group          - a bare '{' or '}'
#   text           - a run of characters other than newline, carriage
#                    return, backslash and braces
#   nl             - a run of newline / carriage-return characters
RTF_TOKEN = re.compile(r'\\((?P<cchar>[-_~\\{}*])|(?P<cword>[A-Za-z]+)(?P<cparam>-?[0-9]+)? ?)|(?P<group>[{}])|(?P<text>[^\n\r\\{}]+)|(?P<nl>[\n\r]+)')

# Generic token: `kind` is the name of the regex group that matched and
# `value` is the text captured by that group.
Token = namedtuple('Token', 'kind value')

class ControlWord(namedtuple('ControlWord', 'kind name param')):
    """Control-word token that compares by name only.

    A ControlWord is equal to another ControlWord with the same ``name``
    (``kind`` and ``param`` are ignored) and to any plain value equal to
    its ``name``, e.g. ``ControlWord('cword', 'rtf', 1) == 'rtf'``.
    """

    def __eq__(self, other):
        """Return True if *other* is, or names, the same control word."""
        if isinstance(other, ControlWord):
            other = other.name
        return self.name == other

    def __ne__(self, other):
        """Return the negation of ``__eq__``.

        tuple provides its own ``__ne__``; without this override ``!=``
        would fall back to tuple comparison and disagree with ``==``.
        """
        return not self.__eq__(other)

    def __hash__(self):
        """Hash by name so the hash agrees with the name-only equality.

        Defining ``__eq__`` in the class body disables the inherited hash
        on Python 3, so it has to be supplied explicitly.
        """
        return hash(self.name)

class Group(object):
    """Mutable record for one RTF group.

    Attributes set by the constructor:
        destination: the value passed in (None by default).
        ignorable: the flag passed in (False by default).
        text: always starts as an empty string.
    """

    def __init__(self, destination=None, ignorable=False):
        # Every new group starts with no collected text.
        self.text = ''
        self.destination, self.ignorable = destination, ignorable


# Read the whole input file (first command-line argument) as bytes and decode
# it as cp1252. The `with` block closes the file handle as soon as the data
# has been read instead of leaving it to the garbage collector.
with open(sys.argv[1], 'rb') as rtf_file:
    text = rtf_file.read().decode('cp1252')
# Output dictionary for the path given as second command-line argument;
# entries are stored by item assignment and written out with .save() below.
dictionary = create_dictionary(sys.argv[2])

# Tokenize the decoded input. Line breaks are discarded; control words become
# ControlWord instances (with an int parameter, or None when absent) and
# everything else becomes a Token tagged with the matching regex group name.
token_list = []
pos = 0
end_of_text = len(text)
while pos < end_of_text:
    m = RTF_TOKEN.match(text, pos)
    assert m is not None
    pos = m.end()
    found = m.groupdict()
    if found['nl'] is not None:
        continue
    if found['cword'] is not None:
        raw_param = found['cparam']
        param = None if raw_param is None else int(raw_param)
        token_list.append(ControlWord('cword', found['cword'], param))
        continue
    for token_kind in ('cchar', 'group', 'text'):
        if found[token_kind] is not None:
            token_list.append(Token(token_kind, found[token_kind]))
            break
    else:
        raise ValueError()

# The stream must open with '{' followed by the \rtf control word and end with
# '}'. ControlWord equality looks at the name only, so the parameter of \rtf
# is not actually checked. The wrapper tokens are dropped afterwards.
open_brace = Token('group', '{')
rtf_header = ControlWord('cword', 'rtf', 1)
close_brace = Token('group', '}')
assert token_list[0] == open_brace
assert token_list[1] == rtf_header
assert token_list[-1] == close_brace
token_list = token_list[2:-1]

# Turn the token stream into dictionary entries.
#
# `group` is the innermost open group and `group_stack` the chain of open
# groups, with the implicit top-level group at index 0. `steno` holds the
# strokes of the entry currently being collected (None until the first \cxs
# group has been closed); its translation accumulates in the top-level
# group's text until the next \cxs group opens or the input ends.
group = Group(None)
group_stack = [group]
steno = None
n = 0
while n < len(token_list):
    token = token_list[n]
    if token.kind == 'cchar':
        char = token.value
        if char == '*':
            # \* contributes no text; directly after '{' it is consumed by
            # the group-opening branch below.
            pass
        elif char == '~':
            group.text += '{^ ^}'
        elif char == '_':
            group.text += '{^-^}'
        else:
            # Remaining control symbols (- \ { }) are emitted literally.
            group.text += char
    elif token.kind == 'cword':
        # Control words not listed here are silently ignored.
        if token.name == 'par':
            group.text += '{#Return}{#Return}'
        elif token.name == 'cxds':
            group.text += '{^}'
        elif token.name == 'cxfc':
            group.text += '{-|}'
        elif token.name == 'cxfing':
            # The text token that follows is wrapped as '{&...}' and consumed
            # here so it is not appended a second time.
            next_token = token_list[n + 1]
            assert next_token.kind == 'text'
            group.text += '{&' + next_token.value + '}'
            n += 1
    elif token.kind == 'group':
        if token.value == '{':
            ignorable = False
            destination = None
            next_token = token_list[n + 1]
            if next_token.kind == 'cword':
                # A control word directly after '{' is taken as the group's
                # destination and skipped.
                n += 1
                destination = next_token
            elif next_token == Token('cchar', '*'):
                # '{\*' marks the group as ignorable; a control word after
                # the marker is its destination.
                ignorable = True
                next_token = token_list[n + 2]
                if next_token.kind == 'cword':
                    n += 2
                    destination = next_token
            if destination == 'cxs':
                # A new steno group starts a new entry: flush the previous
                # one and reset the accumulated translation.
                assert len(group_stack) == 1
                if steno is not None:
                    dictionary[steno] = group.text
                group.text = ''
            group = Group(destination, ignorable)
            group_stack.append(group)
        elif token.value == '}':
            # `text` is what the closing group contributes to its parent.
            text = ''
            if group.destination == 'cxs':
                # Strokes are separated by '/'; they become the entry key and
                # add nothing to the translation.
                steno = tuple(group.text.split('/'))
            elif group.destination == 'cxp':
                stripped = group.text.strip()
                if stripped in ('.', '!', '?', ',', ';', ':'):
                    text = '{' + stripped + '}'
                elif stripped == "'":
                    text = "{^'}"
                elif stripped:
                    # '-', '/' and any unknown punctuation are shown as given,
                    # wrapped as '{^...^}'.
                    text = '{^' + stripped + '^}'
            elif group.destination == 'cxfing':
                text = '{&' + group.text + '}'
            elif not group.ignorable:
                text = group.text
            group_stack.pop()
            group = group_stack[-1]
            group.text += text
        else:
            raise ValueError()
    elif token.kind == 'text':
        group.text += token.value
    else:
        raise ValueError()
    n += 1
# Every group must have been closed; flush the last pending entry and save.
assert len(group_stack) == 1
if steno is not None:
    dictionary[steno] = group.text
dictionary.save()
	#!/usr/bin/env python

	from collections import namedtuple
	import sys
	import re

	from plover.dictionary.base import create_dictionary


	# Tokenizer pattern. The alternation bars are plain '|': written as '\|'
	# they would match literal pipe characters and no RTF input would tokenize.
	# Named groups: cchar (control symbol), cword/cparam (control word with an
	# optional integer parameter), group (brace), text, nl (line breaks).
	RTF_TOKEN = re.compile(r'\\((?P<cchar>[-_~\\{}*])|(?P<cword>[A-Za-z]+)(?P<cparam>-?[0-9]+)? ?)|(?P<group>[{}])|(?P<text>[^\n\r\\{}]+)|(?P<nl>[\n\r]+)')

	# Generic token: `kind` is the name of the regex group that matched and
	# `value` is the text captured by that group.
	Token = namedtuple('Token', 'kind value')

	class ControlWord(namedtuple('ControlWord', 'kind name param')):
	    """Control-word token that compares by name only.

	    A ControlWord is equal to another ControlWord with the same ``name``
	    (``kind`` and ``param`` are ignored) and to any plain value equal to
	    its ``name``.
	    """

	    def __eq__(self, other):
	        """Return True if *other* is, or names, the same control word."""
	        if isinstance(other, ControlWord):
	            other = other.name
	        return self.name == other

	    def __ne__(self, other):
	        """Return the negation of ``__eq__``.

	        tuple provides its own ``__ne__``; without this override ``!=``
	        would fall back to tuple comparison and disagree with ``==``.
	        """
	        return not self.__eq__(other)

	    def __hash__(self):
	        """Hash by name so the hash agrees with the name-only equality."""
	        return hash(self.name)

	class Group(object):
	    """Mutable record for one RTF group.

	    Attributes set by the constructor:
	        destination: the value passed in (None by default).
	        ignorable: the flag passed in (False by default).
	        text: always starts as an empty string.
	    """

	    def __init__(self, destination=None, ignorable=False):
	        self.destination = destination
	        self.ignorable = ignorable
	        self.text = ''



	# Read the whole input file (first command-line argument) as bytes and
	# decode it as cp1252; the `with` block closes the handle once it is read.
	with open(sys.argv[1], 'rb') as rtf_file:
	    text = rtf_file.read().decode('cp1252')
	# Output dictionary for the path given as second command-line argument.
	dictionary = create_dictionary(sys.argv[2])

	# Tokenize the decoded input. Line breaks are discarded; control words
	# become ControlWord instances (with an int parameter, or None when absent)
	# and everything else becomes a Token tagged with the regex group name.
	pos = 0
	token_list = []
	while pos < len(text):
	    m = RTF_TOKEN.match(text, pos)
	    assert m is not None
	    pos = m.end()
	    # Find which alternative matched; the first name of each tuple is the
	    # token kind.
	    for group_names in (
	        ('cchar',),
	        ('cword', 'cparam'),
	        ('group',),
	        ('nl',),
	        ('text',)
	    ):
	        token_value = m.group(*group_names)
	        if token_value not in (None, (None, None)):
	            token_kind = group_names[0]
	            break
	    else:
	        raise ValueError()
	    if token_kind == 'nl':
	        continue
	    if token_kind == 'cword':
	        name, param = token_value
	        if param is not None:
	            param = int(param)
	        token = ControlWord('cword', name, param)
	    else:
	        token = Token(token_kind, token_value)
	    token_list.append(token)

	# The stream must open with '{' followed by the \rtf control word and end
	# with '}'; the wrapper tokens are dropped afterwards.
	assert token_list[0] == Token('group', '{')
	assert token_list[1] == ControlWord('cword', 'rtf', 1)
	assert token_list[-1] == Token('group', '}')
	token_list = token_list[2:-1]

	# Turn the token stream into dictionary entries.
	#
	# `group` is the innermost open group and `group_stack` the chain of open
	# groups, with the implicit top-level group at index 0. `steno` holds the
	# strokes of the entry currently being collected (None until the first \cxs
	# group has been closed); its translation accumulates in the top-level
	# group's text until the next \cxs group opens or the input ends.
	group = Group(None)
	group_stack = [group]
	steno = None
	n = 0
	while n < len(token_list):
	    token = token_list[n]
	    if token.kind == 'cchar':
	        char = token.value
	        if char == '*':
	            # \* contributes no text; directly after '{' it is consumed by
	            # the group-opening branch below.
	            pass
	        elif char == '~':
	            group.text += '{^ ^}'
	        elif char == '_':
	            group.text += '{^-^}'
	        else:
	            # Remaining control symbols (- \ { }) are emitted literally.
	            group.text += char
	    elif token.kind == 'cword':
	        # Control words not listed here are silently ignored.
	        if token.name == 'par':
	            group.text += '{#Return}{#Return}'
	        elif token.name == 'cxds':
	            group.text += '{^}'
	        elif token.name == 'cxfc':
	            group.text += '{-|}'
	        elif token.name == 'cxfing':
	            # The text token that follows is wrapped as '{&...}' and
	            # consumed here so it is not appended a second time.
	            next_token = token_list[n + 1]
	            assert next_token.kind == 'text'
	            group.text += '{&' + next_token.value + '}'
	            n += 1
	    elif token.kind == 'group':
	        if token.value == '{':
	            ignorable = False
	            destination = None
	            next_token = token_list[n + 1]
	            if next_token.kind == 'cword':
	                # A control word directly after '{' is taken as the
	                # group's destination and skipped.
	                n += 1
	                destination = next_token
	            elif next_token == Token('cchar', '*'):
	                # '{\*' marks the group as ignorable; a control word after
	                # the marker is its destination.
	                ignorable = True
	                next_token = token_list[n + 2]
	                if next_token.kind == 'cword':
	                    n += 2
	                    destination = next_token
	            if destination == 'cxs':
	                # A new steno group starts a new entry: flush the previous
	                # one and reset the accumulated translation.
	                assert len(group_stack) == 1
	                if steno is not None:
	                    dictionary[steno] = group.text
	                group.text = ''
	            group = Group(destination, ignorable)
	            group_stack.append(group)
	        elif token.value == '}':
	            # `text` is what the closing group contributes to its parent.
	            text = ''
	            if group.destination == 'cxs':
	                # Strokes are separated by '/'; they become the entry key
	                # and add nothing to the translation.
	                steno = tuple(group.text.split('/'))
	            elif group.destination == 'cxp':
	                stripped = group.text.strip()
	                if stripped in ('.', '!', '?', ',', ';', ':'):
	                    text = '{' + stripped + '}'
	                elif stripped == "'":
	                    text = "{^'}"
	                elif stripped:
	                    # '-', '/' and any unknown punctuation are shown as
	                    # given, wrapped as '{^...^}'.
	                    text = '{^' + stripped + '^}'
	            elif group.destination == 'cxfing':
	                text = '{&' + group.text + '}'
	            elif not group.ignorable:
	                text = group.text
	            group_stack.pop()
	            group = group_stack[-1]
	            group.text += text
	        else:
	            raise ValueError()
	    elif token.kind == 'text':
	        group.text += token.value
	    else:
	        raise ValueError()
	    n += 1
	# Every group must have been closed; flush the last pending entry and save.
	assert len(group_stack) == 1
	if steno is not None:
	    dictionary[steno] = group.text
	dictionary.save()