Skip to content

Instantly share code, notes, and snippets.

@aniruddha27
Created June 5, 2020 20:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save aniruddha27/239b60c8da20288083753b3c6c2ed9c6 to your computer and use it in GitHub Desktop.
Save aniruddha27/239b60c8da20288083753b3c6c2ed9c6 to your computer and use it in GitHub Desktop.
# function to preprocess speech
def clean(text):
    """Normalize the raw text of a speech and return it as a string.

    The input is converted with ``str()`` first, so non-string values
    (for example ``None``) are cleaned as their string representation.

    The substitutions below are applied one after another, in order:

    * paragraph numbers of the form ``<digits>.<TAB>`` are removed;
    * a newline followed by a space is removed, any other newline
      becomes a single space;
    * every ``'s`` is removed;
    * hyphens become spaces and an em dash followed by a space is removed;
    * double quotation marks are removed;
    * the period after ``Mr`` / ``Mrs`` is dropped (the title itself stays);
    * text enclosed in ``(...)`` or ``[...]`` is removed (non-greedy match).
    """
    # (pattern, replacement) pairs. Order matters: newlines are flattened
    # before the bracket rule runs, because ``.`` does not match a newline.
    substitutions = (
        # The dot is escaped so that only "12.<TAB>" is treated as a
        # paragraph number, not digits followed by any character and a tab.
        (r'[0-9]+\.\t', ''),
        (r'\n ', ''),
        (r'\n', ' '),
        (r"'s", ''),
        (r'-', ' '),
        (r'— ', ''),
        (r'"', ''),
        (r'Mr\.', 'Mr'),
        (r'Mrs\.', 'Mrs'),
        (r'[\(\[].*?[\)\]]', ''),
    )

    text = str(text)
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text
# Build the cleaned-text column by running clean() over every raw speech.
df['Speech_clean'] = df['Speech'].map(clean)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment