Skip to content

Instantly share code, notes, and snippets.

@Sirsirious
Last active February 4, 2020 18:40
Show Gist options
  • Save Sirsirious/455cc9e636602864c7bef2fc59cfa2bd to your computer and use it in GitHub Desktop.
Save Sirsirious/455cc9e636602864c7bef2fc59cfa2bd to your computer and use it in GitHub Desktop.
The sentencize function
import re
#[...]
def sentencize(raw_input_document, sentence_boundaries = DEFAULT_SENTENCE_BOUNDARIES, delimiter_token='<SPLIT>'):
    """Split a raw document into a list of linked ``Sentence`` objects.

    Every regular expression in ``sentence_boundaries`` is applied to a
    working copy of the document and each match is followed by
    ``delimiter_token``. The copy is then split on that token, the pieces
    are whitespace-stripped, and empty pieces are dropped. Each remaining
    piece is located in the original document to obtain its character
    offsets.

    Args:
        raw_input_document: The text to split into sentences.
        sentence_boundaries: Iterable of regex patterns; a delimiter is
            inserted after every match of each pattern.
        delimiter_token: Marker string inserted after each boundary match
            and used to split the working copy. It is appended literally.

    Returns:
        A list of ``Sentence`` objects, in document order, each built as
        ``Sentence(start_pos, end_pos, raw_input_document)``. Consecutive
        sentences are linked through ``next_sentence`` and
        ``previous_sentence``.
    """
    working_document = raw_input_document
    for punct in sentence_boundaries:
        # A callable replacement appends the token verbatim; building a
        # template string would make re.sub interpret any backslash or
        # group reference contained in delimiter_token.
        working_document = re.sub(
            punct,
            lambda match: match.group(0) + delimiter_token,
            working_document,
            flags=re.UNICODE,
        )

    stripped_pieces = (piece.strip() for piece in working_document.split(delimiter_token))
    list_of_string_sentences = [piece for piece in stripped_pieces if piece]

    list_of_sentences = []
    previous = None
    search_from = 0
    for sent in list_of_string_sentences:
        # Search forward from the end of the previous sentence so that a
        # sentence whose text also appears earlier in the document is not
        # given the offsets of that earlier occurrence.
        start_pos = raw_input_document.find(sent, search_from)
        end_pos = start_pos + len(sent)
        if start_pos >= 0:
            # Only advance the cursor when the piece was actually found.
            search_from = end_pos

        new_sentence = Sentence(start_pos, end_pos, raw_input_document)
        list_of_sentences.append(new_sentence)

        if previous is not None:
            previous.next_sentence = new_sentence
            new_sentence.previous_sentence = previous
        previous = new_sentence
    return list_of_sentences
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment