leebird/split_sent.py

## split_sent.py
''' Initialization: install NLTK python module and download data.
$ pip install nltk
$ echo 'import nltk; nltk.download("punkt")' | python
'''

from __future__ import print_function, unicode_literals
import nltk.data

# Load the English Punkt model once at import time; split_sentence() reuses
# this single module-level instance on every call.
_sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

def split_sentence(text):
    """Split *text* into sentences and locate each one in the original string.

    Args:
        text: The string to segment.

    Returns:
        A list of ``(sentence, start, end)`` tuples in document order, where
        ``start`` and ``end`` are the inclusive character offsets of the
        sentence inside ``text`` (so ``text[start:end + 1] == sentence``).
    """
    triples = []
    # Index just past the previously located sentence. Each search starts
    # here so that a repeated sentence can never match an earlier occurrence.
    needle = 0
    for sent in _sent_detector.tokenize(text):
        start = text.find(sent, needle)
        end = start + len(sent) - 1  # inclusive offset of the last character
        # Move the cursor to the real position in the text. Merely adding
        # len(sent) ignores whatever separates sentences (e.g. whitespace),
        # so the cursor would drift behind and could eventually re-match a
        # sentence that was already consumed.
        needle = end + 1
        triples.append((sent, start, end))
    return triples

if __name__ == '__main__':
    # Demo run: segment a short sample abstract and print the
    # (sentence, start, end) triples that split_sentence() returns.
    sample_abstract = (
        'The down-regulation of miR-126 was more obvious in the patients '
        'who displayed bad prognosis (P=0.025). Over-expression of miR-126 '
        'in colon cancer cell was able to inhibit cell proliferation, '
        'promote cell apoptosis and reduce the invasive ability. MiR-126 '
        'significantly enhanced the sensitivity of the colon cancer cell to '
        'chemotherapeutic drug.'
    )
    located = split_sentence(sample_abstract)
    print(located)
	''' Initialization: install NLTK python module and download data.
	$ pip install nltk
	$ echo 'import nltk; nltk.download("punkt")' | python
	'''

	from __future__ import print_function, unicode_literals
	import nltk.data

	# Load the English Punkt model once; split_sentence() reuses this
	# single instance on every call.
	_sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

	def split_sentence(text):
	    """Split *text* into sentences and locate each one in the original string.

	    Args:
	        text: The string to segment.

	    Returns:
	        A list of ``(sentence, start, end)`` tuples in document order,
	        where ``start`` and ``end`` are the inclusive character offsets
	        of the sentence inside ``text``.
	    """
	    triples = []
	    # Index just past the previously located sentence; each search starts
	    # here so a repeated sentence never matches an earlier occurrence.
	    needle = 0
	    for sent in _sent_detector.tokenize(text):
	        start = text.find(sent, needle)
	        end = start + len(sent) - 1  # inclusive offset of the last character
	        # Track the real position in the text: adding only len(sent) would
	        # ignore the text separating sentences and let the cursor drift
	        # behind, so a repeated sentence could be matched twice.
	        needle = end + 1
	        triples.append((sent, start, end))
	    return triples

	if __name__ == '__main__':
	    # Demo run: segment a short sample abstract and print the
	    # (sentence, start, end) triples that split_sentence() returns.
	    text = ('The down-regulation of miR-126 was more obvious in the patients '
	            'who displayed bad prognosis (P=0.025). Over-expression of miR-126 '
	            'in colon cancer cell was able to inhibit cell proliferation, '
	            'promote cell apoptosis and reduce the invasive ability. MiR-126 '
	            'significantly enhanced the sensitivity of the colon cancer cell to '
	            'chemotherapeutic drug.')
	    print(split_sentence(text))