Tiago Duque (Sirsirious), Brazil
@Sirsirious
Sirsirious / DummySentencizer Constructor
Last active February 5, 2020 17:23
DummySentencizer for NLP Series
class DummySentencizer:
    def __init__(self, input_text, split_characters=['.', '?', '!', ':'], delimiter_token='<SPLIT>'):
        self.sentences = []
        self.raw = str(input_text)
        self._split_characters = split_characters
        self._delimiter_token = delimiter_token
        self._index = 0
        self._sentencize()
@Sirsirious
Sirsirious / tokenization.py
Last active February 4, 2020 18:52
The sentencizer for our DummySentencizer
    def _sentencize(self):
        work_sentence = self.raw
        for character in self._split_characters:
            work_sentence = work_sentence.replace(character, character + self._delimiter_token)
        self.sentences = [x.strip() for x in work_sentence.split(self._delimiter_token) if x != '']
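Assuming the constructor above and this method live in the same class, the splitting already works end to end. A minimal usage sketch (the sample text is my own, not from the series):

sentencizer = DummySentencizer("Hello world. How are you? Fine!")
print(sentencizer.sentences)
# ['Hello world.', 'How are you?', 'Fine!']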
@Sirsirious
Sirsirious / tokenization.py
Last active February 4, 2020 18:52
How to make our DummySentencizer iterable.
    def __iter__(self):
        return self

    def __next__(self):
        if self._index < len(self.sentences):
            result = self.sentences[self._index]
            self._index += 1
            return result
        raise StopIteration
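With __iter__ and __next__ in place, the sentencizer can be consumed directly in a for loop. A short sketch (sample text assumed):

for sentence in DummySentencizer("First one. Second one! Third one?"):
    print(sentence)
# First one.
# Second one!
# Third one?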
@Sirsirious
Sirsirious / tokenization.py
Last active February 5, 2020 17:23
Making our DummyTokenizer.
import string

class DummyTokenizer:
    def __init__(self, sentence, token_boundaries=[' ', '-'],
                 punctuations=string.punctuation, delimiter_token='<SPLIT>'):
        self.tokens = []
        self.raw = str(sentence)
        self._token_boundaries = token_boundaries
        self._delimiter_token = delimiter_token
        self._punctuations = punctuations
        self._index = 0
@Sirsirious
Sirsirious / tokenization.py
Last active February 4, 2020 18:51
The function that does the tokenization
    def _tokenize(self):
        work_sentence = self.raw
        for punctuation in self._punctuations:
            work_sentence = work_sentence.replace(punctuation, " " + punctuation + " ")
        for delimiter in self._token_boundaries:
            work_sentence = work_sentence.replace(delimiter, self._delimiter_token)
        self.tokens = [x.strip() for x in work_sentence.split(self._delimiter_token) if x != '']
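Since the __init__ shown above does not include the call to _tokenize(), the sketch below invokes it explicitly; in the full class it would presumably be triggered from the constructor, as in DummySentencizer. The sample sentence is my own:

tokenizer = DummyTokenizer("Hello, world! It's simple.")
tokenizer._tokenize()
print(tokenizer.tokens)
# ['Hello', ',', 'world', '!', 'It', "'", 's', 'simple', '.']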
@Sirsirious
Sirsirious / tokenization.py
Last active February 4, 2020 18:51
How to iterate over our DummyTokenizer
    def __iter__(self):
        return self

    def __next__(self):
        if self._index < len(self.tokens):
            result = self.tokens[self._index]
            self._index += 1
            return result
        raise StopIteration
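As with the sentencizer, the tokenizer is now iterable, so the two dummies already form a first end-to-end pipeline. In this sketch the sample text is my own and _tokenize() is again called explicitly, since the previewed constructor omits it:

for sentence in DummySentencizer("NLP is fun. Tokenize me!"):
    tokenizer = DummyTokenizer(sentence)
    tokenizer._tokenize()
    print(list(tokenizer))
# ['NLP', 'is', 'fun', '.']
# ['Tokenize', 'me', '!']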
@Sirsirious
Sirsirious / structures.py
Last active February 5, 2020 17:22
The main parts of our structure.
class Document:
    def __init__(self, document_text):
        self.raw = document_text
        self.sentences = sentencize(self.raw)
        self._index = 0
    #[...]

class Sentence:
    def __init__(self, start_position, end_position, raw_document_reference):
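The Sentence constructor body is cut off in this preview. A minimal sketch of what the reference-based design implies, storing only offsets plus a reference to the raw document rather than copying text, might look like this (attribute names beyond the signature are assumptions):

class Sentence:
    def __init__(self, start_position, end_position, raw_document_reference):
        self.start_pos = start_position
        self.end_pos = end_position
        # No text is copied; the sentence only points into the original document.
        self._document_string = raw_document_reference

    def get(self):
        # Recover the sentence text on demand from the referenced document.
        return self._document_string[self.start_pos:self.end_pos]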
@Sirsirious
Sirsirious / structures.py
Last active February 4, 2020 17:32
How to deal with printing and comparison with a reference-based system.
#[...]
def get(self):
if self.SOS:
return '<SOS>'
elif self.EOS:
return '<EOS>'
else:
return self._sentence_string[self.start_pos:self.end_pos]
# Displays the Token value in the terminal if the variable is called
def __repr__(self):
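The __repr__ body is likewise truncated in the preview. A plausible completion (an assumption, not confirmed by the snippet) is to delegate to get(), so a Token displays as its recovered text:

    def __repr__(self):
        # Assumed completion: represent the token by the text it points to.
        return self.get()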
@Sirsirious
Sirsirious / structures.py
Created February 4, 2020 18:13
Regex for sentence boundary and for punctuation escaping.
DEFAULT_SENTENCE_BOUNDARIES = [r'(?<=[0-9]|[^0-9.])(\.)(?=[^0-9.]|[^0-9.]|[\s]|$)', r'\.{2,}', r'\!+', r'\:+', r'\?+']
"""
Breaking it down:
(?<=[0-9]|[^0-9.])(\.)(?=[^0-9.]|[^0-9.]|[\s]|$) -> matches any period that is not preceded by another period and not followed by a digit or another period.
This prevents the algorithm from splitting sentences at decimal numbers or ellipses.
\.{2,} -> captures ellipses (two or more periods).
\!+ -> captures series of exclamation points.
\:+ -> captures series of colons.
\?+ -> captures series of question marks.
"""
@Sirsirious
Sirsirious / structures.py
Last active February 4, 2020 18:40
The sentencize function
import re
#[...]
def sentencize(raw_input_document, sentence_boundaries=DEFAULT_SENTENCE_BOUNDARIES, delimiter_token='<SPLIT>'):
    working_document = raw_input_document
    punctuation_patterns = sentence_boundaries
    for punct in punctuation_patterns:
        working_document = re.sub(punct, r'\g<0>' + delimiter_token, working_document, flags=re.UNICODE)
    list_of_string_sentences = [x.strip() for x in working_document.split(delimiter_token) if x.strip() != ""]
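The rest of the function is elided in this preview (it builds on list_of_string_sentences). Up to the point shown, the splitting behaves like this on a sample input (the sample text is my own):

working_document = "It costs 3.14 dollars. Really? Wait... yes!"
for punct in DEFAULT_SENTENCE_BOUNDARIES:
    working_document = re.sub(punct, r'\g<0>' + '<SPLIT>', working_document, flags=re.UNICODE)
print([x.strip() for x in working_document.split('<SPLIT>') if x.strip() != ""])
# ['It costs 3.14 dollars.', 'Really?', 'Wait...', 'yes!']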