This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class DummySentencizer:
    """Naive sentence splitter.

    Appends a delimiter token after every split character, then splits on
    that token, so each sentence keeps its terminal punctuation.

    Parameters
    ----------
    input_text : any
        Text to split; coerced with ``str()``.
    split_characters : iterable of str
        Characters treated as sentence terminators.
    delimiter_token : str
        Internal marker inserted after each split character; must not
        otherwise occur in the input.
    """

    # Tuple default instead of a list: avoids the shared-mutable-default
    # pitfall while remaining backward compatible for callers.
    def __init__(self, input_text, split_characters=('.', '?', '!', ':'),
                 delimiter_token='<SPLIT>'):
        self.sentences = []
        self.raw = str(input_text)
        self._split_characters = list(split_characters)
        self._delimiter_token = delimiter_token
        self._index = 0  # iteration cursor for __next__
        self._sentencize()

    def _sentencize(self):
        """Populate ``self.sentences`` from ``self.raw``."""
        work_sentence = self.raw
        # Tag every terminator with the delimiter so a single split()
        # recovers the sentences with their punctuation attached.
        for character in self._split_characters:
            work_sentence = work_sentence.replace(
                character, character + self._delimiter_token)
        # Filter on the *stripped* value: the original kept whitespace-only
        # fragments, which became '' entries after strip().
        self.sentences = [x.strip()
                          for x in work_sentence.split(self._delimiter_token)
                          if x.strip()]

    def __iter__(self):
        # One-shot iterator: iteration state lives on the instance.
        return self

    def __next__(self):
        if self._index < len(self.sentences):
            result = self.sentences[self._index]
            self._index += 1
            return result
        raise StopIteration
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import string | |
class DummyTokenizer:
    """Naive word/punctuation tokenizer.

    Pads every punctuation character with spaces, replaces boundary
    characters with a delimiter token, then splits on that token.

    Parameters
    ----------
    sentence : any
        Text to tokenize; coerced with ``str()``.
    token_boundaries : iterable of str
        Characters that separate tokens (default: space and hyphen).
    punctuations : str
        Characters padded with spaces so they become their own tokens.
    delimiter_token : str
        Internal marker; must not otherwise occur in the input.
    """

    # Tuple default instead of a list: avoids the shared-mutable-default
    # pitfall while remaining backward compatible for callers.
    def __init__(self, sentence, token_boundaries=(' ', '-'),
                 punctuations=string.punctuation, delimiter_token='<SPLIT>'):
        self.tokens = []
        self.raw = str(sentence)
        self._token_boundaries = list(token_boundaries)
        self._delimiter_token = delimiter_token
        self._punctuations = punctuations
        self._index = 0  # iteration cursor for __next__
        # Bug fix: the worker was never invoked, so self.tokens stayed
        # empty forever.  Mirrors DummySentencizer, which calls its
        # worker from __init__.
        self._tokenize()

    def _tokenize(self):
        """Populate ``self.tokens`` from ``self.raw``."""
        work_sentence = self.raw
        # Surround punctuation with spaces so it splits into its own token.
        for punctuation in self._punctuations:
            work_sentence = work_sentence.replace(
                punctuation, " " + punctuation + " ")
        for delimiter in self._token_boundaries:
            work_sentence = work_sentence.replace(
                delimiter, self._delimiter_token)
        # Filter on the *stripped* value: the original kept whitespace-only
        # fragments, which became '' entries after strip().
        self.tokens = [x.strip()
                       for x in work_sentence.split(self._delimiter_token)
                       if x.strip()]

    def __iter__(self):
        # One-shot iterator: iteration state lives on the instance.
        return self

    def __next__(self):
        if self._index < len(self.tokens):
            result = self.tokens[self._index]
            self._index += 1
            return result
        raise StopIteration
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class Document:
    """Container pairing a raw text with its sentence segmentation."""
    def __init__(self, document_text):
        # Original, unmodified text.
        self.raw = document_text
        # NOTE(review): `sentencize` is defined later in this file; its
        # visible portion builds a list of sentence strings — confirm the
        # actual return type against the full source.
        self.sentences = sentencize(self.raw)
        # Iteration cursor (iterator protocol methods are elided below).
        self._index = 0
    #[...]
class Sentence: | |
def __init__(self, start_position, end_position, raw_document_reference): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#[...] | |
def get(self):
    """Return this token's text.

    Special markers take precedence: '<SOS>' for start-of-sentence tokens,
    '<EOS>' for end-of-sentence tokens; otherwise the token is the
    [start_pos:end_pos) slice of the owning sentence string.
    """
    marker = '<SOS>' if self.SOS else ('<EOS>' if self.EOS else None)
    if marker is not None:
        return marker
    return self._sentence_string[self.start_pos:self.end_pos]
# Displays the Token value in the terminal if the variable is called | |
def __repr__(self): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Regex patterns marking sentence boundaries; `sentencize` appends its
# delimiter after each match.  Raw strings avoid the invalid-escape
# warnings the original non-raw literals produced.
DEFAULT_SENTENCE_BOUNDARIES = [
    # A period that ends a sentence: the lookbehind forbids a preceding
    # '.', the lookahead forbids a following digit or '.' — so decimals
    # ("3.14") and ellipses ("...") do not match.  (The original lookahead
    # repeated the alternative [^0-9.] twice; the duplicate is removed —
    # the matched language is identical.)
    r'(?<=[0-9]|[^0-9.])(\.)(?=[^0-9.]|[\s]|$)',
    r'\.{2,}',  # ellipses: two or more consecutive periods
    r'\!+',     # one or more exclamation points
    r'\:+',     # one or more colons
    r'\?+',     # one or more question marks
]
r"""
Breaking it down:
(?<=[0-9]|[^0-9.])(\.)(?=[^0-9.]|[\s]|$) -> matches any period that is not
preceded by another period and not followed by a digit or another period.
This prevents the algorithm from splitting at decimal numbers or ellipses.
\.{2,} -> captures ellipses.
\!+ -> captures series of exclamation points.
\:+ -> captures series of colons.
\?+ -> captures series of question marks.
"""
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
#[...] | |
def sentencize(raw_input_document, sentence_boundaries = DEFAULT_SENTENCE_BOUNDARIES, delimiter_token='<SPLIT>'):
    """Split a raw document into sentence strings.

    Each boundary regex match gets the delimiter token appended after it,
    then the document is split on that token and fragments are stripped.

    Parameters
    ----------
    raw_input_document : str
        The text to segment.
    sentence_boundaries : list of str
        Regex patterns that mark sentence-ending punctuation.
    delimiter_token : str
        Internal marker; must not otherwise occur in the document.
    """
    working_document = raw_input_document
    punctuation_patterns = sentence_boundaries
    for punct in punctuation_patterns:
        # '\g<0>' re-inserts the whole match, so the boundary punctuation
        # is preserved and the delimiter lands immediately after it.
        working_document = re.sub(punct, '\g<0>'+delimiter_token, working_document, flags=re.UNICODE)
    list_of_string_sentences = [x.strip() for x in working_document.split(delimiter_token) if x.strip() != ""]
    # NOTE(review): the visible portion ends here without a `return`
    # statement — the function body appears truncated in this excerpt;
    # confirm the remainder (and the eventual return value) against the
    # full source before relying on this function.
OlderNewer