Deepal DSilva (dsilvadeepal)

#Create model from the training dataset
sms_classifier <- naiveBayes(sms_train, sms_train_labels)
#Make predictions on test set
sms_test_pred <- predict(sms_classifier, sms_test)
#Create confusion matrix
confusionMatrix(data = sms_test_pred, reference = sms_test_labels,
                positive = "spam", dnn = c("Prediction", "Actual"))
#Convert word counts to a categorical "Yes"/"No" indicator for Naive Bayes
convert_values <- function(x) {
  ifelse(x > 0, "Yes", "No")
}
#Apply the conversion to each column (word) of the frequent-term matrices
sms_train <- apply(sms_dtm_freq_train, MARGIN = 2, convert_values)
sms_test <- apply(sms_dtm_freq_test, MARGIN = 2, convert_values)
#Keep only words that appear in at least 0.1% of the messages
threshold <- 0.1
min_freq <- round(sms_dtm$nrow * (threshold / 100), 0)
min_freq
# Create vector of most frequent words
freq_words <- findFreqTerms(x = sms_dtm, lowfreq = min_freq)
str(freq_words)
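The sms_dtm_freq_train and sms_dtm_freq_test matrices used in the conversion step above are not defined in these previews; presumably they are the training and test document-term matrices restricted to freq_words, along these lines:
#Assumed step: limit the train/test document-term matrices to the frequent terms
sms_dtm_freq_train <- sms_dtm_train[, freq_words]
sms_dtm_freq_test <- sms_dtm_test[, freq_words]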
#Training & test document-term matrices (roughly an 80/20 split of the 5,572 messages)
sms_dtm_train <- sms_dtm[1:4457, ]
sms_dtm_test <- sms_dtm[4458:5572, ]
#Training & test labels
sms_train_labels <- sms_raw[1:4457, ]$Tag
sms_test_labels <- sms_raw[4458:5572, ]$Tag
#Check that the spam/ham proportions are similar in the training & test labels
prop.table(table(sms_train_labels))
prop.table(table(sms_test_labels))
#Build a corpus from the raw messages
sms_corpus <- VCorpus(VectorSource(sms_raw$Msg))
#Create a document-term matrix, lower-casing, removing numbers, stop words and
#punctuation, and stemming in a single step
sms_dtm <- DocumentTermMatrix(sms_corpus, control = list(
  tolower = TRUE,
  removeNumbers = TRUE,
  stopwords = TRUE,
  removePunctuation = TRUE,
  stemming = TRUE))
#Number of documents x number of terms
dim(sms_dtm)
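To sanity-check the result, one could peek at a small corner of the matrix; this inspect() call is an illustration, not part of the gist:
#Illustrative only: view counts for the first few documents and terms
inspect(sms_dtm[1:5, 1:8])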
spam <- subset(sms_raw, Tag == "spam")
wordcloud(spam$Msg, max.words = 60, colors = brewer.pal(7, "Paired"), random.order = FALSE)
ham <- subset(sms_raw, Tag == "ham")
wordcloud(ham$Msg, max.words = 60, colors = brewer.pal(7, "Paired"), random.order = FALSE)
#View the first few lines of the dataset
head(sms_raw)
#Keep only the first two columns (the label and the message text) and rename them
sms_raw <- sms_raw[, 1:2]
colnames(sms_raw) <- c("Tag", "Msg")
str(sms_raw)
#Tally the junk (spam) vs legitimate (ham) sms messages
table(sms_raw$Tag)
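If actual proportions are wanted rather than raw counts, a one-liner like the following (not in the gist) would do it:
#Illustrative only: share of spam vs ham messages
round(prop.table(table(sms_raw$Tag)), 3)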
dsilvadeepal / SpamClassify - Import.R
Last active August 14, 2018 19:27
Spam Classifier - Import data & libraries
#Import libraries
library(tm)           #For text mining & the document-term matrix
library(SnowballC)    #For word stemming
library(wordcloud)    #For the word clouds
library(RColorBrewer) #For the word cloud color palettes
library(e1071)        #For Naive Bayes
library(caret)        #For the Confusion Matrix
#Import data
sms_raw <- read.csv("../input/spam.csv")
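One caveat worth flagging: the Kaggle spam.csv is usually latin1-encoded, and on R >= 4.0 read.csv() no longer turns strings into factors, while naiveBayes() and confusionMatrix() expect factor labels. A sketch of the adjustments under those assumptions (neither appears in the preview):
#Assumed adjustments, not shown in the gist preview
sms_raw <- read.csv("../input/spam.csv", stringsAsFactors = FALSE,
                    fileEncoding = "latin1")
#...and, once the columns have been renamed to Tag/Msg:
sms_raw$Tag <- factor(sms_raw$Tag)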
dsilvadeepal / Rvest tutorial - Part 1
Last active May 4, 2018 03:13
Extracting the Top 10 Pop Artists of All Time
#Import the rvest library for web scraping
library(rvest)
#Identify the url from where you want to extract data
base_url <- "https://www.billboard.com/charts/greatest-of-all-time-pop-songs-artists"
webpage <- read_html(base_url)
# Get the artist name
artist <- html_nodes(webpage, ".chart-row__artist")
artist <- as.character(html_text(artist))
# Get the artist rank
rank <- html_nodes(webpage, ".chart-row__rank")
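The preview cuts off here. Part 2 below refers to a top_artists data frame, so presumably the rank nodes are converted to numbers and combined with the artist names roughly as follows (a sketch under that assumption, not the gist's own code):
#Assumed continuation: turn the rank nodes into numbers and keep the top 10 artists
rank <- as.numeric(html_text(rank))
top_artists <- data.frame(Rank = rank, Artist = artist,
                          stringsAsFactors = FALSE)[1:10, ]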
dsilvadeepal / Rvest tutorial - Part 2
Last active June 10, 2018 09:00
Extracting Popular Songs and Lyrics of the top 10 Artists
#tibble() comes from the tibble package; rvest supplies read_html/html_nodes and the pipe
library(tibble)
#Format the links to navigate to each artist's Genius webpage
genius_urls <- paste0("https://genius.com/artists/", top_artists$Artist)
#Initialize a tibble to store the results
artist_lyrics <- tibble()
# Outer loop to get the song links for each artist
for (i in 1:10) {
  genius_page <- read_html(genius_urls[i])
  song_links <- html_nodes(genius_page, ".mini_card_grid-song a") %>%