@vzhong
Created October 19, 2016 23:03
Converts character offsets within a sentence to word offsets by inserting ESTART/EEND sentinel tokens at the span boundaries, tokenizing on whitespace, and then locating the sentinels.

# Example sentences paired with character-level (start, end) spans.
sents = [
    'i like sandwiches',
    'the Fr. madison is a great friar',
]
char_offsets = [
    (2, 6),    # 'like'
    (4, 15),   # 'Fr. madison'
]

def tokenize(sent):
    """Whitespace tokenizer; the conversion below assumes this same scheme."""
    return sent.split()

def tokenize_and_convert_to_word_indices(sent, char_start, char_end):
    """Tokenize sent and map the character span [char_start, char_end) to
    word-level (start, end) indices into the resulting token list."""
    # Insert sentinel tokens at the character boundaries so that the span
    # survives whitespace tokenization.
    proc = ''
    for i, c in enumerate(sent):
        if i == char_start:
            proc += ' ESTART '
        if i == char_end:
            proc += ' EEND '
        proc += c
    if char_end == len(sent):
        # A span ending at the end of the sentence has no character at index
        # char_end, so the closing sentinel is appended after the loop.
        proc += ' EEND '
    words = proc.strip().split()
    # Strip the sentinels back out, recording the word index where each sat.
    proc = []
    word_start = word_end = None
    for w in words:
        if w == 'ESTART':
            word_start = len(proc)
        elif w == 'EEND':
            word_end = len(proc)
        else:
            proc.append(w)
    assert word_start is not None
    assert word_end is not None
    return proc, word_start, word_end
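
# For example, a span that runs to the very end of the sentence:
#   tokenize_and_convert_to_word_indices('i like sandwiches', 7, 17)
#   -> (['i', 'like', 'sandwiches'], 2, 3)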

if __name__ == '__main__':
    # Print each sentence, its character-level span, and the recovered
    # word-level span for comparison.
    for sent, (char_start, char_end) in zip(sents, char_offsets):
        print(sent)
        print(sent[char_start:char_end])
        words, word_start, word_end = tokenize_and_convert_to_word_indices(sent, char_start, char_end)
        print(words)
        print(words[word_start:word_end])
        print()
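
# Running the script should print:
#   i like sandwiches
#   like
#   ['i', 'like', 'sandwiches']
#   ['like']
#
#   the Fr. madison is a great friar
#   Fr. madison
#   ['the', 'Fr.', 'madison', 'is', 'a', 'great', 'friar']
#   ['Fr.', 'madison']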