Skip to content

Instantly share code, notes, and snippets.

@Abhayparashar31
Created November 10, 2022 12:08
Show Gist options
  • Save Abhayparashar31/81997c2e2268338809c46a220d08649f to your computer and use it in GitHub Desktop.
Save Abhayparashar31/81997c2e2268338809c46a220d08649f to your computer and use it in GitHub Desktop.
def clean_text(text):
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
wordLemm = WordNetLemmatizer()
EMOJIS = {':)': 'smile', ':-)': 'smile', ';d': 'wink', ':-E': 'vampire', ':(': 'sad',
':-(': 'sad', ':-<': 'sad', ':P': 'raspberry', ':O': 'surprised',
':-@': 'shocked', ':@': 'shocked',':-$': 'confused', ':\\': 'annoyed',
':#': 'mute', ':X': 'mute', ':^)': 'smile', ':-&': 'confused', '$_$': 'greedy',
'@@': 'eyeroll', ':-!': 'confused', ':-D': 'smile', ':-0': 'yell', 'O.o': 'confused',
'<(-_-)>': 'robot', 'd[-_-]b': 'dj', ":'-)": 'sadsmile', ';)': 'wink',
';-)': 'wink', 'O:-)': 'angel','O*-)': 'angel','(:-D': 'gossip', '=^.^=': 'cat'}
URLPATTERN = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
USERPATTERN = '@[^\s]+'
SEQPATTERN = r"(.)\1\1+"
SEQREPLACE = r"\1\1"
import re
## lower casing
text = text.lower()
### Replacing URL
text = re.sub(URLPATTERN,' ',text)
### Replacing EMOJI
for emoji in EMOJIS.keys():
text = text.replace(emoji, " " + EMOJIS[emoji])
### Replacing USER pattern
text = re.sub(USERPATTERN,' ',text)
### Removing non-alphabets
text = re.sub('[^a-zA-z]'," ",text)
### Removing consecutive letters
text = re.sub(SEQPATTERN,SEQREPLACE,text)
text = text.split()
text = [wordLemm.lemmatize(word) for word in text if not word in stopwords.words('english') and len(word) > 1]
text = ' '.join(text)
return text
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment