Created
June 19, 2021 15:34
-
-
Save michelkana/a99f4dd250e5ba5540f82b58701d7f9b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# packages needed
# !pip install nltk
# !pip install stanfordnlp
# !pip install --upgrade bleu
import re

import nltk
import stanfordnlp
from bleu import list_bleu
from nltk.tokenize import sent_tokenize

# init packages
# NOTE: everything below runs at import time and may download model files.
# 'punkt' provides the sentence-splitting models used by sent_tokenize.
nltk.download('punkt')
# NOTE(review): no NLTK tagger call is visible in this section; this download
# is kept in case code elsewhere relies on it.
nltk.download('averaged_perceptron_tagger')
# English models for the StanfordNLP pipeline built below.
stanfordnlp.download('en')
# Shared pipeline: tokenization, multi-word-token expansion and POS tagging.
stf_nlp = stanfordnlp.Pipeline(processors='tokenize,mwt,pos')
# function for restoring capitalization
def truecasing(input_text):
    """Restore sentence-initial and proper-noun capitalization.

    The text is split into sentences with ``sent_tokenize``; each sentence is
    passed through ``str.capitalize`` (first character upper-cased, the rest
    lower-cased). The sentences are then joined with single spaces and tagged
    by the module-level ``stf_nlp`` pipeline, and every word whose universal
    POS tag is ``PROPN`` is capitalized.

    Proper nouns are capitalized *in place*, so the spacing of the joined
    text is kept. Tokens are not re-joined with spaces, which would detach
    clitics and punctuation ("I 'm", "Martin 's").

    Args:
        input_text: The text to truecase.

    Returns:
        The truecased text. Empty or whitespace-only input is returned
        unchanged.
    """
    # Nothing to tag; do not hand an empty document to the pipeline.
    if not input_text or not input_text.strip():
        return input_text

    # split the text into sentences
    sentences = sent_tokenize(input_text, language='english')
    # capitalize the sentences
    sentences_capitalized = [s.capitalize() for s in sentences]
    # join the capitalized sentences, dropping a space that precedes punctuation
    # (raw string: "\." is an invalid escape in a normal string literal)
    text_truecase = re.sub(r" (?=[\.,'!?:;])", "", ' '.join(sentences_capitalized))

    # capitalize words according to part-of-speech tagging (POS)
    doc = stf_nlp(text_truecase)
    pieces = []
    copied_upto = 0   # end of the text already copied into `pieces`
    search_from = 0   # end of the last token located in the text
    for sent in doc.sentences:
        for w in sent.words:
            # Locate every token (not only proper nouns) so that repeated
            # words are matched to the correct occurrence.
            start = text_truecase.find(w.text, search_from)
            if start == -1:
                # Token text does not occur verbatim (e.g. it was normalized
                # by the tokenizer); leave that part of the text untouched.
                continue
            search_from = start + len(w.text)
            # Only "PROPN" is tested: `upos` holds universal POS tags, so the
            # Penn Treebank tag "NNS" in the original list could never match.
            if w.upos == "PROPN":
                pieces.append(text_truecase[copied_upto:start])
                pieces.append(w.text.capitalize())
                copied_upto = search_from
    pieces.append(text_truecase[copied_upto:])
    text_truecase = ''.join(pieces)

    # remove whitespace left in front of sentence-final punctuation / quotes
    text_truecase = re.sub(r'\s([?.!"](?:\s|$))', r'\1', text_truecase)
    return text_truecase
# demo
text = "i think that john stone is a nice guy. there is a stone on the grass. i'm fat. are you welcome and smart in london? is this martin's dog?"
# Print the result: as a bare expression it is only echoed in a notebook or
# REPL and is silently discarded when the file runs as a script.
print(truecasing(text))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment