Deepal DSilva (dsilvadeepal)

#Create model from the training dataset
sms_classifier <- naiveBayes(sms_train, sms_train_labels)
#Make predictions on test set
sms_test_pred <- predict(sms_classifier, sms_test)
#Create confusion matrix
confusionMatrix(data = sms_test_pred, reference = sms_test_labels,
                positive = "spam", dnn = c("Prediction", "Actual"))
#Convert word counts to a categorical "Yes"/"No" indicator for Naive Bayes
convert_values <- function(x) {
  ifelse(x > 0, "Yes", "No")
}
#Apply the conversion to each column (word) of the frequent-term matrices
sms_train <- apply(sms_dtm_freq_train, MARGIN = 2, convert_values)
sms_test <- apply(sms_dtm_freq_test, MARGIN = 2, convert_values)
#Keep only words that appear in at least 0.1% of the messages
threshold <- 0.1
min_freq <- round(sms_dtm$nrow * (threshold / 100), 0)
min_freq
# Create vector of most frequent words
freq_words <- findFreqTerms(x = sms_dtm, lowfreq = min_freq)
str(freq_words)
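The sms_dtm_freq_train and sms_dtm_freq_test matrices used in the conversion step above are not defined in these previews; presumably they are the training and test document-term matrices restricted to freq_words, along these lines:
#Assumed step: limit the train/test document-term matrices to the frequent terms
sms_dtm_freq_train <- sms_dtm_train[, freq_words]
sms_dtm_freq_test <- sms_dtm_test[, freq_words]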
#Training & test document-term matrices (roughly an 80/20 split of the 5,572 messages)
sms_dtm_train <- sms_dtm[1:4457, ]
sms_dtm_test <- sms_dtm[4458:5572, ]
#Training & test labels
sms_train_labels <- sms_raw[1:4457, ]$Tag
sms_test_labels <- sms_raw[4458:5572, ]$Tag
#Check that the spam/ham proportions are similar in the training & test labels
prop.table(table(sms_train_labels))
prop.table(table(sms_test_labels))
#Build a corpus from the raw messages
sms_corpus <- VCorpus(VectorSource(sms_raw$Msg))
#Create a document-term matrix, lower-casing, removing numbers, stop words and
#punctuation, and stemming in a single step
sms_dtm <- DocumentTermMatrix(sms_corpus, control = list(
  tolower = TRUE,
  removeNumbers = TRUE,
  stopwords = TRUE,
  removePunctuation = TRUE,
  stemming = TRUE))
#Number of documents x number of terms
dim(sms_dtm)
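To sanity-check the result, one could peek at a small corner of the matrix; this inspect() call is an illustration, not part of the gist:
#Illustrative only: view counts for the first few documents and terms
inspect(sms_dtm[1:5, 1:8])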
spam <- subset(sms_raw, Tag == "spam")
wordcloud(spam$Msg, max.words = 60, colors = brewer.pal(7, "Paired"), random.order = FALSE)
ham <- subset(sms_raw, Tag == "ham")
wordcloud(ham$Msg, max.words = 60, colors = brewer.pal(7, "Paired"), random.order = FALSE)
#View the first few lines of the dataset
head(sms_raw)
#Keep only the first two columns (the label and the message text) and rename them
sms_raw <- sms_raw[, 1:2]
colnames(sms_raw) <- c("Tag", "Msg")
str(sms_raw)
#Tally the junk (spam) vs legitimate (ham) sms messages
table(sms_raw$Tag)
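If actual proportions are wanted rather than raw counts, a one-liner like the following (not in the gist) would do it:
#Illustrative only: share of spam vs ham messages
round(prop.table(table(sms_raw$Tag)), 3)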
dsilvadeepal / SpamClassify - Import.R
Last active August 14, 2018 19:27
Spam Classifier - Import data & libraries
#Import libraries
library(tm)           #For text mining & the document-term matrix
library(SnowballC)    #For word stemming
library(wordcloud)    #For the word clouds
library(RColorBrewer) #For the word cloud color palettes
library(e1071)        #For Naive Bayes
library(caret)        #For the Confusion Matrix
#Import data
sms_raw <- read.csv("../input/spam.csv")
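One caveat worth flagging: the Kaggle spam.csv is usually latin1-encoded, and on R >= 4.0 read.csv() no longer turns strings into factors, while naiveBayes() and confusionMatrix() expect factor labels. A sketch of the adjustments under those assumptions (neither appears in the preview):
#Assumed adjustments, not shown in the gist preview
sms_raw <- read.csv("../input/spam.csv", stringsAsFactors = FALSE,
                    fileEncoding = "latin1")
#...and, once the columns have been renamed to Tag/Msg:
sms_raw$Tag <- factor(sms_raw$Tag)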
dsilvadeepal / Rvest tutorial - Part 1
Last active May 4, 2018 03:13
Extracting the Top 10 Pop Artists of All Time
#Import the rvest library for web scraping
library(rvest)
#Identify the url from where you want to extract data
base_url <- "https://www.billboard.com/charts/greatest-of-all-time-pop-songs-artists"
webpage <- read_html(base_url)
# Get the artist name
artist <- html_nodes(webpage, ".chart-row__artist")
artist <- as.character(html_text(artist))
# Get the artist rank
rank <- html_nodes(webpage, ".chart-row__rank")
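The preview cuts off here. Part 2 below refers to a top_artists data frame, so presumably the rank nodes are converted to numbers and combined with the artist names roughly as follows (a sketch under that assumption, not the gist's own code):
#Assumed continuation: turn the rank nodes into numbers and keep the top 10 artists
rank <- as.numeric(html_text(rank))
top_artists <- data.frame(Rank = rank, Artist = artist,
                          stringsAsFactors = FALSE)[1:10, ]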
dsilvadeepal / Rvest tutorial - Part 2
Last active June 10, 2018 09:00
Extracting Popular Songs and Lyrics of the top 10 Artists
#tibble() comes from the tibble package; rvest supplies read_html/html_nodes and the pipe
library(tibble)
#Format the links to navigate to each artist's Genius webpage
genius_urls <- paste0("https://genius.com/artists/", top_artists$Artist)
#Initialize a tibble to store the results
artist_lyrics <- tibble()
# Outer loop to get the song links for each artist
for (i in 1:10) {
  genius_page <- read_html(genius_urls[i])
  song_links <- html_nodes(genius_page, ".mini_card_grid-song a") %>%