# michelkana/truecasing_pos.py

## truecasing_pos.py
# packages needed
# !pip install nltk
# !pip install stanfordnlp
# !pip install --upgrade bleu
import nltk
from nltk.tokenize import sent_tokenize
import re
import stanfordnlp
from bleu import list_bleu

# init packages: download the NLTK resources and the StanfordNLP English
# models, then build the pipeline object that truecasing() uses.
# 'punkt' is fetched for sentence splitting (truecasing() calls sent_tokenize).
nltk.download('punkt')
# NOTE(review): no NLTK POS-tagger call is visible in this file — confirm
# whether this download is still required.
nltk.download('averaged_perceptron_tagger')
stanfordnlp.download('en')
# Module-level pipeline running tokenization, multi-word-token expansion and
# POS tagging; truecasing() reads it as a global.
stf_nlp = stanfordnlp.Pipeline(processors='tokenize,mwt,pos')

# function for restoring capitalization
def truecasing(input_text):
    """Restore capitalization in (typically all-lowercase) English text.

    The text is split into sentences with NLTK, each sentence is passed
    through ``str.capitalize`` (first character upper-cased, the rest
    lower-cased), and every word that the module-level StanfordNLP pipeline
    ``stf_nlp`` tags as a proper noun is capitalized as well.

    Args:
        input_text: The text to truecase.

    Returns:
        The truecased text, rebuilt from the pipeline's tokens with
        punctuation and clitics re-attached to the preceding word. Empty or
        whitespace-only input yields ``""``.
    """
    # Nothing to tag: return before touching the tokenizer or the pipeline.
    if not input_text.strip():
        return ""

    # split the text into sentences
    sentences = sent_tokenize(input_text, language='english')
    # capitalize the sentences (note: str.capitalize lower-cases the rest)
    sentences_capitalized = [s.capitalize() for s in sentences]
    # join the capitalized sentences and drop a space that precedes punctuation
    # (raw string: the old non-raw "\." was an invalid escape sequence)
    text_truecase = re.sub(r" (?=[.,'!?:;])", "", ' '.join(sentences_capitalized))

    # capitalize words according to part-of-speech tagging (POS)
    doc = stf_nlp(text_truecase)
    # ``upos`` holds Universal POS tags, where proper nouns are "PROPN"; the
    # Penn Treebank tag "NNS" can never appear there, so it was dead code.
    words = [
        w.text.capitalize() if w.upos == "PROPN" else w.text
        for sent in doc.sentences
        for w in sent.words
    ]
    text_truecase = ' '.join(words)

    # Undo the spaces introduced by joining tokens: first the sentence-final
    # marks and double quotes handled originally ...
    text_truecase = re.sub(r'\s([?.!"](?:\s|$))', r'\1', text_truecase)
    # ... then commas, colons, semicolons and clitics such as "'m", "'s" and
    # "n't" (assumes the tokenizer emits them as separate tokens), which would
    # otherwise come out as "I 'm" / "Martin 's" / "do n't".
    text_truecase = re.sub(r" (?=[.,'!?:;]|n't\b)", "", text_truecase)
    return text_truecase

# demo: run truecasing() on an all-lowercase sample containing person and
# place names, a clitic ("i'm"), a possessive ("martin's") and questions.
text = "i think that john stone is a nice guy. there is a stone on the grass. i'm fat. are you welcome and smart in london? is this martin's dog?"
# NOTE: the return value is neither stored nor printed here, so it is only
# displayed when this line is evaluated in an interactive session.
truecasing(text)
	# packages needed
	# !pip install nltk
	# !pip install stanfordnlp
	# !pip install --upgrade bleu
	import nltk
	from nltk.tokenize import sent_tokenize
	import re
	import stanfordnlp
	from bleu import list_bleu

	# init packages: download the NLTK resources and the StanfordNLP English
	# models, then build the pipeline object that truecasing() uses.
	# 'punkt' is fetched for sentence splitting (truecasing() calls sent_tokenize).
	nltk.download('punkt')
	# NOTE(review): no NLTK POS-tagger call is visible in this file — confirm
	# whether this download is still required.
	nltk.download('averaged_perceptron_tagger')
	stanfordnlp.download('en')
	# Module-level pipeline running tokenization, multi-word-token expansion and
	# POS tagging; truecasing() reads it as a global.
	stf_nlp = stanfordnlp.Pipeline(processors='tokenize,mwt,pos')

	# function for restoring capitalization
	def truecasing(input_text):
		"""Restore capitalization in (typically all-lowercase) English text.

		The text is split into sentences with NLTK, each sentence is passed
		through ``str.capitalize`` (first character upper-cased, the rest
		lower-cased), and every word that the StanfordNLP pipeline ``stf_nlp``
		tags as a proper noun is capitalized as well.

		Args:
			input_text: The text to truecase.

		Returns:
			The truecased text, rebuilt from the pipeline's tokens with
			punctuation and clitics re-attached to the preceding word. Empty
			or whitespace-only input yields ``""``.
		"""
		# Nothing to tag: return before touching the tokenizer or the pipeline.
		if not input_text.strip():
			return ""

		# split the text into sentences
		sentences = sent_tokenize(input_text, language='english')
		# capitalize the sentences (note: str.capitalize lower-cases the rest)
		sentences_capitalized = [s.capitalize() for s in sentences]
		# join the capitalized sentences and drop a space that precedes
		# punctuation (raw string: the old "\." was an invalid escape sequence)
		text_truecase = re.sub(r" (?=[.,'!?:;])", "", ' '.join(sentences_capitalized))

		# capitalize words according to part-of-speech tagging (POS)
		doc = stf_nlp(text_truecase)
		# ``upos`` holds Universal POS tags, where proper nouns are "PROPN"; the
		# Penn Treebank tag "NNS" can never appear there, so it was dead code.
		words = [
			w.text.capitalize() if w.upos == "PROPN" else w.text
			for sent in doc.sentences
			for w in sent.words
		]
		text_truecase = ' '.join(words)

		# Undo the spaces introduced by joining tokens: first the sentence-final
		# marks and double quotes. The alternation is a plain "|" (whitespace OR
		# end of string); the escaped "\|" demanded a literal pipe character.
		text_truecase = re.sub(r'\s([?.!"](?:\s|$))', r'\1', text_truecase)
		# Then commas, colons, semicolons and clitics such as "'m", "'s" and
		# "n't" (assumes the tokenizer emits them as separate tokens), which
		# would otherwise come out as "I 'm" / "Martin 's" / "do n't".
		text_truecase = re.sub(r" (?=[.,'!?:;]|n't\b)", "", text_truecase)
		return text_truecase

	# demo: run truecasing() on an all-lowercase sample containing person and
	# place names, a clitic ("i'm"), a possessive ("martin's") and questions.
	text = "i think that john stone is a nice guy. there is a stone on the grass. i'm fat. are you welcome and smart in london? is this martin's dog?"
	# NOTE: the return value is neither stored nor printed here, so it is only
	# displayed when this line is evaluated in an interactive session.
	truecasing(text)