# aniruddha27/nlp_ie_6.py

## nlp_ie_6.py
# function to preprocess speech
def clean(text):
    """Normalize one raw speech string for downstream text processing.

    ``text`` is coerced with ``str()`` once up front, so a non-string value
    is cleaned as its string form. The substitutions are applied in the
    order they appear in ``rules``; each rule sees the output of the
    previous one.

    Returns the cleaned string.
    """
    # Patterns are raw strings so regex escapes such as \. and \( are not
    # parsed as (invalid) Python string escapes.
    rules = (
        # paragraph numbers: digits, a literal dot, then a tab ("12.\t").
        # The dot is escaped so a plain number before a tab is left alone.
        (r'[0-9]+\.\t', ''),
        # a newline followed by a space is dropped entirely ...
        (r'\n ', ''),
        # ... any remaining newline becomes a single space
        (r'\n', ' '),
        # possessive "'s" is removed (other apostrophes are kept)
        (r"'s", ''),
        # hyphens become spaces; an em dash followed by a space is dropped
        (r'-', ' '),
        (r'— ', ''),
        # double quotation marks
        (r'"', ''),
        # drop the period after the titles Mr. / Mrs. (the title itself stays)
        (r'Mr\.', 'Mr'),
        (r'Mrs\.', 'Mrs'),
        # text enclosed in (...) or [...]: shortest match from an opening
        # bracket to the next closing bracket of either kind
        (r'[\(\[].*?[\)\]]', ''),
    )

    text = str(text)
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)

    return text

# preprocessing speeches: pass each value of the 'Speech' column through
# clean() and store the results in a new 'Speech_clean' column.
# NOTE(review): df is defined outside this snippet — presumably a pandas
# DataFrame with a 'Speech' column of raw text; confirm against the caller.
df['Speech_clean'] = df['Speech'].apply(clean)
	# function to preprocess speech
	def clean(text):
		"""Normalize one raw speech string for downstream text processing.

		``text`` is coerced with ``str()`` once up front, so a non-string
		value is cleaned as its string form. The substitutions are applied
		in the order they appear in ``rules``; each rule sees the output of
		the previous one.

		Returns the cleaned string.
		"""
		# Patterns are raw strings so regex escapes such as \. and \( are not
		# parsed as (invalid) Python string escapes.
		rules = (
			# paragraph numbers: digits, a literal dot, then a tab ("12.\t").
			# The dot is escaped so a plain number before a tab is left alone.
			(r'[0-9]+\.\t', ''),
			# a newline followed by a space is dropped entirely ...
			(r'\n ', ''),
			# ... any remaining newline becomes a single space
			(r'\n', ' '),
			# possessive "'s" is removed (other apostrophes are kept)
			(r"'s", ''),
			# hyphens become spaces; an em dash followed by a space is dropped
			(r'-', ' '),
			(r'— ', ''),
			# double quotation marks
			(r'"', ''),
			# drop the period after the titles Mr. / Mrs. (the title itself stays)
			(r'Mr\.', 'Mr'),
			(r'Mrs\.', 'Mrs'),
			# text enclosed in (...) or [...]: shortest match from an opening
			# bracket to the next closing bracket of either kind
			(r'[\(\[].*?[\)\]]', ''),
		)

		text = str(text)
		for pattern, replacement in rules:
			text = re.sub(pattern, replacement, text)

		return text

	# preprocessing speeches: pass each value of the 'Speech' column through
	# clean() and store the results in a new 'Speech_clean' column.
	# NOTE(review): df is defined outside this snippet — presumably a pandas
	# DataFrame with a 'Speech' column of raw text; confirm against the caller.
	df['Speech_clean'] = df['Speech'].apply(clean)