Last active
April 8, 2016 03:31
-
-
Save leebird/48dee4b1237c7f35781915144599a848 to your computer and use it in GitHub Desktop.
Split sentences using NLTK
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''Initialization: install the NLTK Python module and download the Punkt data.

    $ pip install nltk
    $ echo 'import nltk; nltk.download("punkt")' | python
'''
from __future__ import print_function, unicode_literals | |
import nltk.data | |
# English Punkt sentence tokenizer, loaded once when this module is imported
# and reused by every call to split_sentence(). The load needs the "punkt"
# data package to be present (see the download command in the module
# docstring).
_sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
def split_sentence(text):
    """Split *text* into sentences and locate each one inside *text*.

    Args:
        text: The string to split into sentences.

    Returns:
        A list of ``(sentence, start, end)`` triples in the order the
        tokenizer produced them. ``start`` is the index of the sentence's
        first character in *text* and ``end`` is the index of its last
        character (inclusive), i.e. ``text[start:end + 1] == sentence``
        whenever the sentence is found. If a sentence cannot be located,
        ``start`` is ``-1`` (the ``str.find`` miss value) and ``end`` is
        derived from it in the same way.
    """
    triples = []
    # Index in ``text`` just past the most recently located sentence.
    needle = 0
    for sent in _sent_detector.tokenize(text):
        start = text.find(sent, needle)
        end = start + len(sent) - 1
        if start != -1:
            # Resume searching from the real end of this sentence. Advancing
            # the cursor by len(sent) alone would ignore the whitespace
            # between sentences, let the cursor lag behind, and make a
            # repeated sentence match an earlier occurrence again.
            needle = start + len(sent)
        triples.append((sent, start, end))
    return triples
if __name__ == '__main__':
    # Demo run: split a short three-sentence biomedical abstract and print
    # the resulting (sentence, start, end) triples.
    sample_fragments = [
        'The down-regulation of miR-126 was more obvious in the patients ',
        'who displayed bad prognosis (P=0.025). Over-expression of miR-126 ',
        'in colon cancer cell was able to inhibit cell proliferation, ',
        'promote cell apoptosis and reduce the invasive ability. MiR-126 ',
        'significantly enhanced the sensitivity of the colon cancer cell to ',
        'chemotherapeutic drug.',
    ]
    sample = ''.join(sample_fragments)
    print(split_sentence(sample))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment