Last active
July 31, 2020 14:24
-
-
Save arunm8489/88c636ae90ced0d9e3d9b70b13ad6a97 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
# Ordered (pattern, replacement) rules used by ``decontracted``.
# Order matters: the irregular forms must run before the generic "n't" rule,
# otherwise "won't" would become "wo not" and "can't" would become "ca not".
# Every pattern accepts both the ASCII apostrophe and the typographic one
# (U+2019) and is case-insensitive so that "Won't" / "CAN'T" are handled too.
_CONTRACTION_RULES = tuple(
    (re.compile(pattern, re.IGNORECASE), replacement)
    for pattern, replacement in (
        # Irregular forms; the captured first letter keeps its original case.
        (r"(w)on['\u2019]t", r"\1ill not"),
        (r"(c)an['\u2019]t", r"\1an not"),
        # General suffixes.
        (r"n['\u2019]t", " not"),
        (r"['\u2019]re", " are"),
        (r"['\u2019]s", " is"),
        (r"['\u2019]d", " would"),
        (r"['\u2019]ll", " will"),
        (r"['\u2019]t", " not"),
        (r"['\u2019]ve", " have"),
        (r"['\u2019]m", " am"),
    )
)


def decontracted(phrase):
    """Expand common English contractions in *phrase*.

    The rules are applied one after another, in a fixed order, to the whole
    string: first the irregular forms ("won't" -> "will not",
    "can't" -> "can not"), then the generic suffixes ("n't", "'re", "'s",
    "'d", "'ll", "'t", "'ve", "'m").

    Matching is purely textual, so every "'s" is rewritten to " is" and
    every "'d" to " would", including possessives ("John's" -> "John is").

    Args:
        phrase: The text to process.

    Returns:
        A new string with the contractions expanded.
    """
    for pattern, replacement in _CONTRACTION_RULES:
        phrase = pattern.sub(replacement, phrase)
    return phrase
# English stop words. The negations 'no', 'nor' and 'not' are deliberately
# left out of this list so that they are kept in the text when stop words
# are filtered out.
stopwords = [
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
    "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself',
    'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her',
    'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them',
    'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this',
    'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were',
    'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
    'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because',
    'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about',
    'against', 'between', 'into', 'through', 'during', 'before', 'after',
    'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off',
    'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there',
    'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few',
    'more', 'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so',
    'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't",
    'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y',
    'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't",
    'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven',
    "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',
    "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn',
    "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't",
    'wouldn', "wouldn't",
]
from tqdm import tqdm | |
def preprocess_text(text_data):
    """Clean every text in *text_data* and return the results as a list.

    Each text goes through these steps, in order:

    1. contractions are expanded with ``decontracted``;
    2. the literal two-character sequences backslash-r, backslash-n and
       backslash-double-quote are replaced by a space;
    3. every run of characters other than ASCII letters and digits is
       collapsed to a single space;
    4. the text is lower-cased and the words found in the module-level
       ``stopwords`` list are dropped;
    5. the remaining words are joined with single spaces.

    Args:
        text_data: An iterable of strings. It is wrapped in ``tqdm`` so a
            progress bar is shown while it is consumed.

    Returns:
        A list with one cleaned, lower-cased string per input text, in the
        same order as the input.
    """
    # Build the lookup set once per call: testing each token against the
    # list would be a linear scan for every word of every text.
    stopword_set = set(stopwords)
    preprocessed_text = []
    # tqdm wraps the iterable to display a progress bar.
    for sentence in tqdm(text_data):
        sent = decontracted(sentence)
        # These target escaped sequences stored as text (a backslash followed
        # by a character), not real control characters; real newlines and
        # quotes are removed by the regex below anyway.
        sent = sent.replace('\\r', ' ')
        sent = sent.replace('\\n', ' ')
        sent = sent.replace('\\"', ' ')
        sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
        # Stop-word list reference: https://gist.github.com/sebleier/554280
        kept_words = (
            word for word in sent.lower().split() if word not in stopword_set
        )
        preprocessed_text.append(' '.join(kept_words))
    return preprocessed_text
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment