Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
# function to preprocess speech
def clean(text):
    """Return a normalized copy of a speech transcript.

    The input is converted with ``str()`` first, so non-string values
    (e.g. ``None`` or NaN) are processed as their string representation.

    Steps, applied in order:
      1. drop paragraph numbers of the form ``<digits>.<TAB>``
      2. drop "newline + space" pairs, then turn remaining newlines into spaces
      3. drop every occurrence of ``'s``
      4. turn hyphens into spaces and drop "em dash + space"
      5. drop double quotation marks
      6. strip the trailing period from ``Mr.`` / ``Mrs.``
      7. drop parenthesized or square-bracketed spans

    Args:
        text: the raw speech text (any object; converted via ``str``).

    Returns:
        The cleaned text as a ``str``.
    """
    text = str(text)

    # Paragraph numbers: the dot is escaped so that only "<digits>.<TAB>"
    # is removed, not any digit run followed by a tab (e.g. a year).
    text = re.sub(r'[0-9]+\.\t', '', text)

    # New lines: a newline followed by a space is dropped outright; every
    # other newline becomes a single space.
    text = text.replace('\n ', '')
    text = text.replace('\n', ' ')

    # Possessive/contracted 's.
    text = text.replace("'s", '')

    # Hyphens become spaces; an em dash followed by a space is dropped.
    text = text.replace('-', ' ')
    text = text.replace('— ', '')

    # Double quotation marks.
    text = text.replace('"', '')

    # Keep the honorifics but drop their trailing period.
    text = text.replace('Mr.', 'Mr')
    text = text.replace('Mrs.', 'Mrs')

    # References to outside text: shortest span from an opening ( or [ to
    # the next closing ) or ]. Newlines are already gone at this point, so
    # "." can cross what used to be line breaks.
    text = re.sub(r'[\(\[].*?[\)\]]', '', text)

    return text
# Run every raw speech through clean() and store the result in a new
# 'Speech_clean' column next to the original 'Speech' column.
df['Speech_clean'] = df['Speech'].map(clean)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment