This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Create model from the training dataset
sms_classifier <- naiveBayes(sms_train, sms_train_labels)

# Make predictions on test set
sms_test_pred <- predict(sms_classifier, sms_test)

# Create confusion matrix.
# confusionMatrix() needs `data` and `reference` to be factors with the same
# levels. The labels are taken from a read.csv() column, which is a plain
# character vector on R >= 4.0, so coerce them using the prediction's levels.
# This is a no-op when the labels are already a matching factor.
confusionMatrix(
  data = sms_test_pred,
  reference = factor(sms_test_labels, levels = levels(sms_test_pred)),
  positive = "spam",
  dnn = c("Prediction", "Actual")
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Recode counts as a categorical presence indicator.
#
# x: numeric vector (or matrix) of counts.
# Returns a character object shaped like `x`: "Yes" where the value is
# greater than zero, "No" otherwise. NA inputs stay NA.
convert_values <- function(x) {
  # Return the result directly; ending on an assignment would return invisibly.
  ifelse(x > 0, "Yes", "No")
}
# Apply convert_values column-wise (MARGIN = 2) to the train and test matrices
sms_train <- apply(X = sms_dtm_freq_train, MARGIN = 2, FUN = convert_values)
sms_test <- apply(X = sms_dtm_freq_test, MARGIN = 2, FUN = convert_values)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Minimum term frequency, expressed as a percentage of the number of rows
# (documents) in sms_dtm
threshold <- 0.1
min_freq <- round(nrow(sms_dtm) * (threshold / 100), 0)
min_freq

# Create vector of most frequent words (terms occurring at least min_freq times)
freq_words <- findFreqTerms(x = sms_dtm, lowfreq = min_freq)
str(freq_words)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Training & test split: the first 80% of rows train the model, the rest test it.
# The boundary is derived from the data instead of being hard-coded; for the
# 5572-row dataset this yields rows 1:4457 and 4458:5572.
# sms_dtm has one row per row of sms_raw (the corpus is built from sms_raw$Msg),
# so the same indices are used for the matrix and for the labels.
n_total <- nrow(sms_dtm)
n_train <- floor(0.8 * n_total)
train_idx <- seq_len(n_train)
test_idx <- setdiff(seq_len(n_total), train_idx)

# Training & test set
sms_dtm_train <- sms_dtm[train_idx, ]
sms_dtm_test <- sms_dtm[test_idx, ]

# Training & test labels
sms_train_labels <- sms_raw$Tag[train_idx]
sms_test_labels <- sms_raw$Tag[test_idx]

# Proportion for training & test labels
prop.table(table(sms_train_labels))
prop.table(table(sms_test_labels))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build a corpus with one document per SMS message
sms_corpus <- VCorpus(x = VectorSource(x = sms_raw$Msg))

# Build the document-term matrix, cleaning the text along the way.
# NOTE(review): the option order is deliberately left as written; tm is
# documented to process these options in the order given -- confirm before
# reordering.
sms_dtm <- DocumentTermMatrix(
  x = sms_corpus,
  control = list(
    tolower = TRUE,
    removeNumbers = TRUE,
    stopwords = TRUE,
    removePunctuation = TRUE,
    stemming = TRUE
  )
)
dim(sms_dtm)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Word cloud for spam messages (at most 60 words, most frequent placed first)
spam <- subset(x = sms_raw, subset = Tag == "spam")
wordcloud(
  words = spam$Msg,
  max.words = 60,
  random.order = FALSE,
  colors = brewer.pal(7, "Paired")
)

# Same plot for legitimate ("ham") messages
ham <- subset(x = sms_raw, subset = Tag == "ham")
wordcloud(
  words = ham$Msg,
  max.words = 60,
  random.order = FALSE,
  colors = brewer.pal(7, "Paired")
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Peek at the first rows of the raw data
head(sms_raw)

# Keep only the first two columns and name them Tag (label) and Msg (text)
sms_raw <- setNames(sms_raw[, 1:2], c("Tag", "Msg"))
str(sms_raw)

# Tabulate how many messages carry each tag
table(sms_raw$Tag)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Import libraries | |
library(tm) | |
library(SnowballC) | |
library(wordcloud) | |
library(RColorBrewer) | |
library(e1071) #For Naive Bayes | |
library(caret) #For the Confusion Matrix | |
# Import data: read the raw SMS CSV into a data frame
sms_raw <- read.csv(file = "../input/spam.csv")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# URL of the page to scrape
base_url <- "https://www.billboard.com/charts/greatest-of-all-time-pop-songs-artists"
webpage <- read_html(base_url)

# Artist names: select the matching nodes and extract their text in one step
artist <- as.character(html_text(html_nodes(webpage, ".chart-row__artist")))

# Artist rank: nodes holding each chart position
rank <- html_nodes(webpage, ".chart-row__rank")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Empty tibble that will collect the results
artist_lyrics <- tibble()

# One genius.com artist-page URL per entry in top_artists$Artist
genius_urls <- paste0("https://genius.com/artists/", top_artists$Artist)
# Outer loop to get the song links for each artist | |
for (i in 1:10) { | |
genius_page <- read_html(genius_urls[i]) | |
song_links <- html_nodes(genius_page, ".mini_card_grid-song a") %>% |
NewerOlder