Skip to content

Instantly share code, notes, and snippets.

@frenzy2106
Last active March 18, 2020 09:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save frenzy2106/8b30f699f09d4bc0797cde31c9afa1f1 to your computer and use it in GitHub Desktop.
Save frenzy2106/8b30f699f09d4bc0797cde31c9afa1f1 to your computer and use it in GitHub Desktop.
def clean_corpus(text):
    """Normalize an iterable of raw tweets into a list of cleaned strings.

    Every item is converted with ``str`` and then processed in this order:

    1. ``https://t.co/...`` links that form a whole whitespace-delimited
       token are removed.
    2. The text is lower-cased.
    3. A fixed set of English contractions ("can't", "won't", "it's", ...)
       and the shorthands "hv" / "ur" (whole words only) are expanded.
    4. Non-word characters, digits, underscores and a fixed set of
       accented/symbol characters (presumably residue of mis-decoded
       UTF-8 -- confirm against the data source) are replaced by spaces.
    5. Tokens consisting of a single letter ``a``-``z`` are dropped and the
       remaining tokens are joined with single spaces.

    Args:
        text: Any iterable of tweets (list, tuple, generator, pandas
            Series, ...). Items need not be strings.

    Returns:
        A list of cleaned strings, one per input item, in input order.
        An item with nothing left after cleaning yields ``""``.
    """
    # The lookarounds require whitespace (or a string edge) on both sides
    # without consuming it, so several links in a row are all removed.
    link = re.compile(r"(?<!\S)https://t\.co/[a-zA-Z0-9]*(?!\S)")

    # Plain literal replacements; applied before punctuation is stripped
    # because they rely on the apostrophe still being present.
    contractions = (
        ("can't", "can not"),
        ("ain't", "is not"),
        ("don't", "do not"),
        ("couldn't", "could not"),
        ("shouldn't", "should not"),
        ("won't", "will not"),
        ("there's", "there is"),
        ("it's", "it is"),
        ("that's", "that is"),
        ("where's", "where is"),
        ("who's", "who is"),
    )

    # Word boundaries keep words that merely contain these letters
    # ("your", "hurt", "our") intact.
    shorthands = (
        (re.compile(r"\bhv\b"), "have"),
        (re.compile(r"\bur\b"), "your"),
    )

    # Everything that must become a separator: punctuation/whitespace,
    # decimal digits, and the explicit leftover-character set.
    noise = re.compile(r"\W|\d|[ðâï¼½³ªãºæååçæåä¹µó¾_ëìêè]")

    single_letters = frozenset("abcdefghijklmnopqrstuvwxyz")

    corpus = []
    # Iterate directly instead of indexing by position so that generators
    # and label-indexed containers work as well as plain lists.
    for raw in text:
        tweet = link.sub(" ", str(raw)).lower()
        for short, expanded in contractions:
            tweet = tweet.replace(short, expanded)
        for pattern, expanded in shorthands:
            tweet = pattern.sub(expanded, tweet)
        tweet = noise.sub(" ", tweet)
        # Splitting and re-joining collapses whitespace runs and trims both
        # ends, while dropping every single-letter token wherever it sits.
        tokens = [tok for tok in tweet.split() if tok not in single_letters]
        corpus.append(" ".join(tokens))
    return corpus
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment