Skip to content

Instantly share code, notes, and snippets.

@HarshSingh16
Created February 14, 2019 09:58
Show Gist options
  • Save HarshSingh16/ed9eb39d8e633dbf41c5f828b6d192f6 to your computer and use it in GitHub Desktop.
Save HarshSingh16/ed9eb39d8e633dbf41c5f828b6d192f6 to your computer and use it in GitHub Desktop.
# Doing a first cleaning of the texts
# Ordered (pattern, replacement) pairs applied by clean_text, top to bottom.
# The patterns are unanchored regexes, so they also match inside longer
# tokens (e.g. "he's" matches the tail of "she's", which still yields
# "she is"). The final rule deletes the listed punctuation characters; the
# apostrophe and "!" are not in that set and are therefore kept.
_CLEANING_RULES = (
    (r"i'm", "i am"),
    (r"he's", "he is"),
    (r"she's", "she is"),
    (r"that's", "that is"),
    (r"what's", "what is"),
    (r"where's", "where is"),
    (r"\'ll", " will"),
    (r"\'ve", " have"),
    (r"\'re", " are"),
    (r"\'d", " would"),
    (r"won't", "will not"),
    (r"can't", "cannot"),
    (r"[-()\"#/@;:<>{}+=~|.?,]", ""),
)


def clean_text(text):
    """Return a lower-cased copy of *text* with contractions expanded.

    The text is lower-cased first, then every rule in ``_CLEANING_RULES`` is
    applied in order with ``re.sub``: a fixed set of English contractions is
    expanded ("i'm" -> "i am", "'ll" -> " will", "won't" -> "will not", ...)
    and finally the punctuation characters ``-()"#/@;:<>{}+=~|.?,`` are
    removed. Contractions without a rule (e.g. "don't") are left unchanged.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    text = text.lower()
    for pattern, replacement in _CLEANING_RULES:
        text = re.sub(pattern, replacement, text)
    return text
# Cleaning the questions
# Build the list in one pass: one clean_text() result per entry of
# `questions`, in the same order.
clean_questions = [clean_text(question) for question in questions]
# Cleaning the answers
# Build the list in one pass: one clean_text() result per entry of
# `answers`, in the same order.
clean_answers = [clean_text(answer) for answer in answers]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment