Tiago Duque (Sirsirious), Brazil
@Sirsirious
Sirsirious / DummySentencizer Constructor
Last active February 5, 2020 17:23
DummySentencizer for NLP Series
class DummySentencizer:
    def __init__(self, input_text, split_characters=['.', '?', '!', ':'], delimiter_token='<SPLIT>'):
        self.sentences = []
        self.raw = str(input_text)
        self._split_characters = split_characters
        self._delimiter_token = delimiter_token
        self._index = 0
        self._sentencize()
@Sirsirious
Sirsirious / tokenization.py
Last active February 4, 2020 18:52
The sentencizer for our DummySentencizer
    def _sentencize(self):
        work_sentence = self.raw
        for character in self._split_characters:
            work_sentence = work_sentence.replace(character, character + self._delimiter_token)
        self.sentences = [x.strip() for x in work_sentence.split(self._delimiter_token) if x != '']
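Assuming the constructor above and this method live in the same class, the splitting already works end to end. A minimal usage sketch (the sample text is my own, not from the series):

sentencizer = DummySentencizer("Hello world. How are you? Fine!")
print(sentencizer.sentences)
# ['Hello world.', 'How are you?', 'Fine!']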
@Sirsirious
Sirsirious / tokenization.py
Last active February 4, 2020 18:52
How to make our DummySentencizer iterable.
    def __iter__(self):
        return self

    def __next__(self):
        if self._index < len(self.sentences):
            result = self.sentences[self._index]
            self._index += 1
            return result
        raise StopIteration
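With __iter__ and __next__ in place, the sentencizer can be consumed directly in a for loop. A short sketch (sample text assumed):

for sentence in DummySentencizer("First one. Second one! Third one?"):
    print(sentence)
# First one.
# Second one!
# Third one?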
@Sirsirious
Sirsirious / tokenization.py
Last active February 5, 2020 17:23
Making our DummyTokenizer.
import string

class DummyTokenizer:
    def __init__(self, sentence, token_boundaries=[' ', '-'],
                 punctuations=string.punctuation, delimiter_token='<SPLIT>'):
        self.tokens = []
        self.raw = str(sentence)
        self._token_boundaries = token_boundaries
        self._delimiter_token = delimiter_token
        self._punctuations = punctuations
        self._index = 0
@Sirsirious
Sirsirious / tokenization.py
Last active February 4, 2020 18:51
The function that does the tokenization
    def _tokenize(self):
        work_sentence = self.raw
        for punctuation in self._punctuations:
            work_sentence = work_sentence.replace(punctuation, " " + punctuation + " ")
        for delimiter in self._token_boundaries:
            work_sentence = work_sentence.replace(delimiter, self._delimiter_token)
        self.tokens = [x.strip() for x in work_sentence.split(self._delimiter_token) if x != '']
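Since the __init__ shown above does not include the call to _tokenize(), the sketch below invokes it explicitly; in the full class it would presumably be triggered from the constructor, as in DummySentencizer. The sample sentence is my own:

tokenizer = DummyTokenizer("Hello, world! It's simple.")
tokenizer._tokenize()
print(tokenizer.tokens)
# ['Hello', ',', 'world', '!', 'It', "'", 's', 'simple', '.']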
@Sirsirious
Sirsirious / tokenization.py
Last active February 4, 2020 18:51
How to iterate over our DummyTokenizer
    def __iter__(self):
        return self

    def __next__(self):
        if self._index < len(self.tokens):
            result = self.tokens[self._index]
            self._index += 1
            return result
        raise StopIteration
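As with the sentencizer, the tokenizer is now iterable, so the two dummies already form a first end-to-end pipeline. In this sketch the sample text is my own and _tokenize() is again called explicitly, since the previewed constructor omits it:

for sentence in DummySentencizer("NLP is fun. Tokenize me!"):
    tokenizer = DummyTokenizer(sentence)
    tokenizer._tokenize()
    print(list(tokenizer))
# ['NLP', 'is', 'fun', '.']
# ['Tokenize', 'me', '!']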
@Sirsirious
Sirsirious / structures.py
Last active February 5, 2020 17:22
The main parts of our structure.
class Document:
    def __init__(self, document_text):
        self.raw = document_text
        self.sentences = sentencize(self.raw)
        self._index = 0
    #[...]

class Sentence:
    def __init__(self, start_position, end_position, raw_document_reference):
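The Sentence constructor body is cut off in this preview. A minimal sketch of what the reference-based design implies, storing only offsets plus a reference to the raw document rather than copying text, might look like this (attribute names beyond the signature are assumptions):

class Sentence:
    def __init__(self, start_position, end_position, raw_document_reference):
        self.start_pos = start_position
        self.end_pos = end_position
        # No text is copied; the sentence only points into the original document.
        self._document_string = raw_document_reference

    def get(self):
        # Recover the sentence text on demand from the referenced document.
        return self._document_string[self.start_pos:self.end_pos]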
@Sirsirious
Sirsirious / structures.py
Last active February 4, 2020 17:32
How to deal with printing and comparison with a reference-based system.
#[...]
def get(self):
if self.SOS:
return '<SOS>'
elif self.EOS:
return '<EOS>'
else:
return self._sentence_string[self.start_pos:self.end_pos]
# Displays the Token value in the terminal if the variable is called
def __repr__(self):
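The __repr__ body is likewise truncated in the preview. A plausible completion (an assumption, not confirmed by the snippet) is to delegate to get(), so a Token displays as its recovered text:

    def __repr__(self):
        # Assumed completion: represent the token by the text it points to.
        return self.get()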
@Sirsirious
Sirsirious / structures.py
Created February 4, 2020 18:13
Regex for sentence boundary and for punctuation escaping.
DEFAULT_SENTENCE_BOUNDARIES = [r'(?<=[0-9]|[^0-9.])(\.)(?=[^0-9.]|[^0-9.]|[\s]|$)', r'\.{2,}', r'\!+', r'\:+', r'\?+']
"""
Breaking it down:
(?<=[0-9]|[^0-9.])(\.)(?=[^0-9.]|[^0-9.]|[\s]|$) -> matches any period that is not preceded by another period and not followed by a digit or another period.
This prevents the algorithm from splitting sentences at decimal numbers or ellipses.
\.{2,} -> captures ellipses (two or more periods).
\!+ -> captures series of exclamation points.
\:+ -> captures series of colons.
\?+ -> captures series of question marks.
"""
@Sirsirious
Sirsirious / structures.py
Last active February 4, 2020 18:40
The sentencize function
import re
#[...]
def sentencize(raw_input_document, sentence_boundaries=DEFAULT_SENTENCE_BOUNDARIES, delimiter_token='<SPLIT>'):
    working_document = raw_input_document
    punctuation_patterns = sentence_boundaries
    for punct in punctuation_patterns:
        working_document = re.sub(punct, r'\g<0>' + delimiter_token, working_document, flags=re.UNICODE)
    list_of_string_sentences = [x.strip() for x in working_document.split(delimiter_token) if x.strip() != ""]
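The rest of the function is elided in this preview (it builds on list_of_string_sentences). Up to the point shown, the splitting behaves like this on a sample input (the sample text is my own):

working_document = "It costs 3.14 dollars. Really? Wait... yes!"
for punct in DEFAULT_SENTENCE_BOUNDARIES:
    working_document = re.sub(punct, r'\g<0>' + '<SPLIT>', working_document, flags=re.UNICODE)
print([x.strip() for x in working_document.split('<SPLIT>') if x.strip() != ""])
# ['It costs 3.14 dollars.', 'Really?', 'Wait...', 'yes!']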