@vzhong
Created October 19, 2016 23:03
Converts character offsets within a sentence to word offsets by inserting ESTART/EEND sentinel tokens at the span boundaries, tokenizing on whitespace, and then locating the sentinels.

# Example sentences paired with character-level (start, end) spans.
sents = [
    'i like sandwiches',
    'the Fr. madison is a great friar',
]
char_offsets = [
    (2, 6),    # 'like'
    (4, 15),   # 'Fr. madison'
]

def tokenize(sent):
    """Whitespace tokenizer; the conversion below assumes this same scheme."""
    return sent.split()

def tokenize_and_convert_to_word_indices(sent, char_start, char_end):
    """Tokenize sent and map the character span [char_start, char_end) to
    word-level (start, end) indices into the resulting token list."""
    # Insert sentinel tokens at the character boundaries so that the span
    # survives whitespace tokenization.
    proc = ''
    for i, c in enumerate(sent):
        if i == char_start:
            proc += ' ESTART '
        if i == char_end:
            proc += ' EEND '
        proc += c
    if char_end == len(sent):
        # A span ending at the end of the sentence has no character at index
        # char_end, so the closing sentinel is appended after the loop.
        proc += ' EEND '
    words = proc.strip().split()
    # Strip the sentinels back out, recording the word index where each sat.
    proc = []
    word_start = word_end = None
    for w in words:
        if w == 'ESTART':
            word_start = len(proc)
        elif w == 'EEND':
            word_end = len(proc)
        else:
            proc.append(w)
    assert word_start is not None
    assert word_end is not None
    return proc, word_start, word_end
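
# For example, a span that runs to the very end of the sentence:
#   tokenize_and_convert_to_word_indices('i like sandwiches', 7, 17)
#   -> (['i', 'like', 'sandwiches'], 2, 3)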

if __name__ == '__main__':
    # Print each sentence, its character-level span, and the recovered
    # word-level span for comparison.
    for sent, (char_start, char_end) in zip(sents, char_offsets):
        print(sent)
        print(sent[char_start:char_end])
        words, word_start, word_end = tokenize_and_convert_to_word_indices(sent, char_start, char_end)
        print(words)
        print(words[word_start:word_end])
        print()
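
# Running the script should print:
#   i like sandwiches
#   like
#   ['i', 'like', 'sandwiches']
#   ['like']
#
#   the Fr. madison is a great friar
#   Fr. madison
#   ['the', 'Fr.', 'madison', 'is', 'a', 'great', 'friar']
#   ['Fr.', 'madison']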