Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
def clean_corpus(text):
    """Normalize an iterable of tweets into a list of cleaned strings.

    Each item is converted with ``str()`` and then processed in this order:

    1. Whitespace-delimited ``https://t.co/<alnum>`` links are dropped.
    2. The text is lower-cased.
    3. A fixed set of contractions (``can't``, ``don't``, ``it's`` ...) and
       the shorthands ``hv`` / ``ur`` are expanded.
    4. Non-word characters, digits, underscores and a fixed set of stray
       accented/symbol characters are replaced by spaces.
    5. Stand-alone single letters ``a``-``z`` are dropped.
    6. Whitespace runs are collapsed and both ends are trimmed.

    Args:
        text: Iterable of tweets (list, tuple, generator, ...). Items need
            not be strings; each one is passed through ``str()``.

    Returns:
        A list with one cleaned string per input item, in input order.
    """
    # Local import: the snippet does not show a module-level ``import re``.
    import re

    # A link counts only when it is a whole whitespace-delimited token.
    # Lookarounds leave the delimiters unconsumed, so back-to-back links and
    # a tweet consisting of nothing but a link are handled as well.
    link_pattern = r"(?<!\S)https://t\.co/[a-zA-Z0-9]*(?!\S)"

    # Applied in order on the lower-cased text, before punctuation is
    # stripped (the apostrophes are needed to recognise the contractions).
    expansions = (
        (r"can't", "can not"),
        # Word boundaries keep the shorthands from firing inside ordinary
        # words such as "your" or "hurt".
        (r"\bhv\b", "have"),
        (r"\bur\b", "your"),
        (r"ain't", "is not"),
        (r"don't", "do not"),
        (r"couldn't", "could not"),
        (r"shouldn't", "should not"),
        (r"won't", "will not"),
        (r"there's", "there is"),
        (r"it's", "it is"),
        (r"that's", "that is"),
        (r"where's", "where is"),
        (r"who's", "who is"),
    )

    # Every match of these patterns is blanked out with a single space.
    blank_out = (
        r"\W",
        r"\d",
        # Presumably residue of mis-decoded characters (plus underscore,
        # which ``\W`` does not cover).
        r"[ðâï¼½³ªãºæååçæåä¹µó¾_ëìêè]",
        # Any stand-alone single ASCII letter, wherever it sits in the text.
        r"(?<!\S)[a-z](?!\S)",
    )

    corpus = []
    for raw in text:
        tweet = re.sub(link_pattern, " ", str(raw)).lower()
        for pattern, replacement in expansions:
            tweet = re.sub(pattern, replacement, tweet)
        for pattern in blank_out:
            tweet = re.sub(pattern, " ", tweet)
        # Collapse whitespace runs and trim both ends in one step.
        corpus.append(" ".join(tweet.split()))
    return corpus
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment