Skip to content

Instantly share code, notes, and snippets.

@michelkana
Created June 19, 2021 15:34
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save michelkana/a99f4dd250e5ba5540f82b58701d7f9b to your computer and use it in GitHub Desktop.
Save michelkana/a99f4dd250e5ba5540f82b58701d7f9b to your computer and use it in GitHub Desktop.
# packages needed
# !pip install nltk
# !pip install stanfordnlp
# !pip install --upgrade bleu
import nltk
from nltk.tokenize import sent_tokenize
import re
import stanfordnlp
from bleu import list_bleu
# init packages: download the required models, then build the tagging pipeline
# 'punkt' is the NLTK resource backing sent_tokenize (sentence splitting)
nltk.download('punkt')
# NOTE(review): no code visible in this file uses the NLTK POS tagger -- confirm this download is still needed
nltk.download('averaged_perceptron_tagger')
# download the stanfordnlp models for 'en' (must happen before the Pipeline is built)
stanfordnlp.download('en')
# module-level pipeline shared by truecasing(): tokenization, multi-word-token expansion, POS tagging
# presumably defaults to the English models downloaded above -- TODO confirm the default language
stf_nlp = stanfordnlp.Pipeline(processors='tokenize,mwt,pos')
# function for restoring capitalization

# A single space directly before punctuation or an apostrophe; used both to
# tidy the input and to re-attach tokens the tagger split off (",", "'s", ...).
_SPACE_BEFORE_PUNCT = re.compile(r" (?=[.,'!?:;])")
# A single space before a detached negation clitic ("do n't" -> "don't").
_SPACE_BEFORE_NT = re.compile(r" (?=n't\b)")
# Whitespace before a ?, ., ! or " that is followed by whitespace or end of text.
_SPACE_BEFORE_CLOSER = re.compile(r'\s([?.!"](?:\s|$))')


def truecasing(input_text):
    """Restore sentence-initial and proper-noun capitalization.

    The text is split into sentences, each sentence is passed through
    ``str.capitalize`` (first character upper-cased, the rest lower-cased),
    and the result is tagged with the module-level ``stf_nlp`` pipeline.
    Every word tagged as a proper noun (UPOS ``PROPN``) is then capitalized
    and the words are joined back into a single string.

    Args:
        input_text: English text as a ``str``.

    Returns:
        The re-capitalized text. Returns ``""`` when the input contains no
        sentences (empty or whitespace-only input).
    """
    # split the text into sentences
    sentences = sent_tokenize(input_text, language='english')
    if not sentences:
        # Nothing to tag; avoid running the NLP pipeline on empty text.
        return ""

    # capitalize the sentences, join them, and drop spaces before punctuation
    capitalized = ' '.join(s.capitalize() for s in sentences)
    text_truecase = _SPACE_BEFORE_PUNCT.sub("", capitalized)

    # capitalize words according to part-of-speech tagging (POS)
    # Only "PROPN" is checked: `upos` holds Universal POS tags, so the Penn
    # Treebank tag "NNS" previously listed here could never match.
    doc = stf_nlp(text_truecase)
    words = [
        w.text.capitalize() if w.upos == "PROPN" else w.text
        for sent in doc.sentences
        for w in sent.words
    ]
    text_truecase = ' '.join(words)

    # Joining with spaces detaches every token the tagger split off
    # ("hello , world", "I 'm", "Martin 's", "do n't"); glue them back on.
    text_truecase = _SPACE_BEFORE_PUNCT.sub("", text_truecase)
    text_truecase = _SPACE_BEFORE_NT.sub("", text_truecase)
    return _SPACE_BEFORE_CLOSER.sub(r'\1', text_truecase)
# demo: lower-cased sample text with person names, a place name,
# contractions and a possessive, run through truecasing()
text = (
    "i think that john stone is a nice guy. "
    "there is a stone on the grass. "
    "i'm fat. "
    "are you welcome and smart in london? "
    "is this martin's dog?"
)
truecasing(text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment