Created
July 15, 2020 07:57
-
-
Save yogeshnile/fa65dfc72a5b2dcba1ec3159d501ec23 to your computer and use it in GitHub Desktop.
SMS Spam Detection
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the SMS dataset: a tab-separated file with no header row; the two
# columns are named 'label' and 'message' here.
sms = pd.read_csv('Spam SMS Collection', sep='\t', names=['label', 'message'])
sms.drop_duplicates(inplace=True)
# Renumber the rows 0..n-1 so the index has no gaps after de-duplication.
sms.reset_index(drop=True, inplace=True)

# Clean the messages and collect them into a corpus of normalized strings.
corpus = []
ps = PorterStemmer()
# Loop invariants, built once: rebuilding the stop-word set for every word
# (as a per-word `set(stopwords.words('english'))` would) repeats the same
# work for each token of each message.
stop_words = set(stopwords.words('english'))
non_letters = re.compile(r'[^a-zA-Z]')
for raw_message in sms.message:
    # Replace every character that is not an ASCII letter with a space.
    message = non_letters.sub(' ', raw_message)
    # Lower-case so stop-word matching is case-insensitive.
    message = message.lower()
    # Tokenize on whitespace.
    words = message.split()
    # Drop stop words, then stem what remains.
    words = [ps.stem(word) for word in words if word not in stop_words]
    # Re-join the stemmed tokens into one space-separated string.
    message = ' '.join(words)
    corpus.append(message)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment