# Set up your R w/ AWS ----------------------------------------------------
# accessKeys.csv == the CSV downloaded from AWS containing your Access & Secret keys
# (keep this file out of any public repo)
keyTable <- read.csv("accessKeys.csv", header = TRUE)
AWS_ACCESS_KEY_ID <- as.character(keyTable$Access.key.ID)
AWS_SECRET_ACCESS_KEY <- as.character(keyTable$Secret.access.key)
# activate the credentials for this session
Sys.setenv("AWS_ACCESS_KEY_ID" = AWS_ACCESS_KEY_ID,
           "AWS_SECRET_ACCESS_KEY" = AWS_SECRET_ACCESS_KEY,
           "AWS_DEFAULT_REGION" = "eu-west-1")
library(rvest)
library(data.table)
library(tidyverse) # attaches dplyr, tidyr, purrr, ggplot2, etc.
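# Optional sanity check before scraping: if the keys above were picked up
# correctly, listing your S3 buckets should succeed (assumes the aws.s3
# package, attached later for the upload, is already installed).
aws.s3::bucketlist()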
# Earlier drafts, kept for reference ---------------------------------------
# get_one_page_from_usr <- function(my_url_usr) {
#
#   t <- read_html(my_url_usr)
#
#   boxes <- t %>% html_nodes('.sg_news')
#
#   img_url <- boxes %>% html_nodes('.imgWrap') %>% html_nodes('a') %>% html_nodes('img') %>% html_attr('src')
#
#   box_dfs <- lapply(boxes, function(x){
#
#     tl <- list()
#
#     tl[['timestamp']] <- x %>% html_nodes('.date') %>% html_text()
#     tl[['link_pic']] <- x %>% html_nodes('.imgWrap') %>% html_nodes('a') %>% html_nodes('img') %>% html_attr('src')
#
#     return(tl)
#
#   })
#
#   df <- rbindlist(box_dfs, fill = T)
#   return(img_url)
#
# }
#
# download.file(paste0(gigi$link_pic[1]), destfile = "test.png", mode = 'wb')
#
# counter <- 1
# for (i in gigi) {
#   download.file(gigi[counter], destfile = paste0("test", counter,'.png'), mode = 'wb')
#   counter <- counter + 1
# }
# working image downloader ------------------------------------------------------
# my_url_usr <- 'https://www.usrplus.ro/noutati?page=1'
# get_pic_links_from_usr <- function(my_url_usr) {
#
#   t <- read_html(my_url_usr)
#
#   boxes <- t %>% html_nodes('.sg_news')
#
#   img_url <- boxes %>% html_nodes('.imgWrap') %>% html_nodes('a') %>% html_nodes('img') %>% html_attr('src')
#
#   counter <- 1
#   for (i in img_url) {
#     download.file(img_url[counter], destfile = paste0("test", counter,'.png'), mode = 'wb')
#     counter <- counter + 1
#   }
#
#   return(img_url)
#
# }
#
# test <- get_pic_links_from_usr(my_url_usr)
# USR iterative text & content scraper -----------------------------------------
get_article_from_usr <- function(pages_to_download) {
  count <- 1 # running index across all pages, used to name the image files
  dir.create('usr', showWarnings = FALSE) # download.file() needs the folder to exist
  links_to_get <- paste0('https://www.usrplus.ro/noutati?page=',
                         1:pages_to_download)
  ret_df <- rbindlist(lapply(links_to_get, function(my_url_usr){
    t <- read_html(my_url_usr)
    boxes <- t %>% html_nodes('.sg_news')
    img_url <- boxes %>% html_nodes('.imgWrap') %>%
      html_nodes('a') %>%
      html_nodes('img') %>%
      html_attr('src')
    img_url[img_url == ''] <- NA # articles without a picture yield an empty src
    for (i in seq_along(img_url)) {
      if (!is.na(img_url[i])) {
        download.file(img_url[i],
                      destfile = paste0("usr/usr", count, '.png'),
                      mode = 'wb')
        # advance the filename counter only on success so usr1..usrN stay contiguous
        count <<- count + 1
      }
    }
    box_dfs <- lapply(boxes, function(x){
      tl <- list()
      tl[['timestamp']] <- x %>% html_nodes('.date') %>%
        html_text() %>%
        trimws()
      tl[['title']] <- x %>% html_nodes('h5') %>%
        html_nodes('a') %>%
        html_text()
      tl[['link']] <- paste0('https://www.usrplus.ro', x %>%
                               html_nodes('h5') %>%
                               html_nodes('a') %>%
                               html_attr('href'))
      return(tl)
    })
    df <- rbindlist(box_dfs, fill = TRUE)
    return(df)
  }))
  return(ret_df)
}
USR <- get_article_from_usr(5)
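# Quick look at what came back: one row per article with timestamp, title and link
str(USR)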
# PSD working iterative scraper ---------------------------------------
# my_url_psd <- 'https://www.psd.ro/stiri/page/4'
get_article_from_psd <- function(pages_to_download) {
  count <- 1 # running index across all pages, used to name the image files
  dir.create('psd', showWarnings = FALSE)
  links_to_get <- paste0('https://www.psd.ro/stiri/page/', 1:pages_to_download)
  ret_df <- rbindlist(lapply(links_to_get, function(my_url_psd){
    t <- read_html(my_url_psd)
    boxes <- t %>% html_nodes('.col-md-4')
    img_url <- as.character(boxes %>% html_nodes('.article__featured-image') %>% html_attr('src'))
    img_url[img_url == ''] <- NA
    for (i in seq_along(img_url)) {
      if (!is.na(img_url[i])) {
        download.file(img_url[i], destfile = paste0("psd/psd", count, '.png'), mode = 'wb')
        count <<- count + 1
      }
    }
    box_dfs <- lapply(boxes, function(x){
      tl <- list()
      tl[['timestamp']] <- x %>% html_nodes('.article__date') %>% html_text() %>% trimws()
      tl[['title']] <- x %>% html_nodes('.article__title') %>% html_text()
      tl[['link']] <- x %>% html_nodes('.article-link') %>% html_attr('href')
      return(tl)
    })
    df <- rbindlist(box_dfs, fill = TRUE)
    return(df)
  }))
  return(ret_df)
}
PSD <- get_article_from_psd(4)
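# Articles without a picture are skipped, so check how many images actually
# landed on disk before syncing to S3:
length(list.files('usr'))
length(list.files('psd'))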
# Load up into S3 ---------------------------------------------------------
library(aws.s3)
s3sync(path = 'usr',
       bucket = 'cosmin-ceu-2020',
       direction = 'upload',
       verbose = TRUE,
       recursive = TRUE)
s3sync(path = 'psd',
       bucket = 'cosmin-ceu-2020',
       direction = 'upload',
       verbose = TRUE,
       recursive = TRUE)
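# Verify the upload by listing the bucket's keys:
head(get_bucket_df('cosmin-ceu-2020')$Key)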
# Amazon Rekognition PSD ------------------------------------------------
library(paws.machine.learning)
get_labels_psd <- function(amount = 5) {
  svc <- paws.machine.learning::rekognition()
  picture <- paste0('psd', 1:amount, '.png')
  ret_df <- rbindlist(lapply(picture, function(x){
    # ask Rekognition for the top 5 labels of each image sitting in the bucket
    resp <- svc$detect_labels(
      Image = list(
        S3Object = list(
          Bucket = "cosmin-ceu-2020",
          Name = x
        )),
      MaxLabels = 5)
    df <- rbindlist(resp$Labels, fill = TRUE) %>%
      subset(select = c('Name', 'Confidence')) %>%
      unique()
    return(df)
  }))
  # average the confidence per label across all inspected pictures
  out_df <- ret_df %>% group_by(Name) %>% summarise(PSD_Confidence = mean(Confidence))
  return(out_df)
}
PSD_picture_labels <- get_labels_psd(9)
# scratch: data.table alternative for the per-label aggregation above
# keys <- colnames(test)[!grepl('Confidence', colnames(test))]
# X <- as.data.table(test)
# X[, list(mm = mean(Confidence)), keys]
# Amazon Rekognition USR ---------------------------------------------------
get_labels_usr <- function(amount = 5) {
  svc <- paws.machine.learning::rekognition()
  picture <- paste0('usr', 1:amount, '.png')
  ret_df <- rbindlist(lapply(picture, function(x){
    resp <- svc$detect_labels(
      Image = list(
        S3Object = list(
          Bucket = "cosmin-ceu-2020",
          Name = x
        )),
      MaxLabels = 5)
    df <- rbindlist(resp$Labels, fill = TRUE) %>%
      subset(select = c('Name', 'Confidence')) %>%
      unique()
    return(df)
  }))
  out_df <- ret_df %>% group_by(Name) %>% summarise(USR_Confidence = mean(Confidence))
  return(out_df)
}
USR_picture_labels <- get_labels_usr(9)
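# Peek at the highest-confidence labels on each side:
USR_picture_labels %>% arrange(desc(USR_Confidence)) %>% head()
PSD_picture_labels %>% arrange(desc(PSD_Confidence)) %>% head()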
# Join USR & PSD dfs and analysis -----------------------------------------
trial <- full_join(USR_picture_labels, PSD_picture_labels, by = "Name")
write.csv(trial, "trial.csv")
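# Labels found in both parties' pictures (an NA confidence means the label
# only showed up for one side):
trial %>% filter(!is.na(USR_Confidence), !is.na(PSD_Confidence))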
# Translate USR & Analyze article titles ------------------------------------------------
library(aws.translate)
USR_titles <- USR$title
# translate every scraped title into English (source language is auto-detected)
USR_translations <- NULL
for (i in seq_along(USR_titles)) {
  USR_translations <- c(USR_translations,
                        translate(USR_titles[i],
                                  from = "auto", to = "en"))
}
USR$trans_title <- USR_translations
write.csv(USR, "scraped_USR.csv", fileEncoding = 'UTF-8')
# USR Sentiment Analysis
library(aws.comprehend)
USR_sentiment <- NULL
for (i in seq_along(USR_translations)) {
  USR_sentiment <- rbind(USR_sentiment,
                         detect_sentiment(USR_translations[i]))
}
# Label instances in dataframe as USR for joining later
USR_sentiment <- mutate(USR_sentiment, Political_Party = 'USR')
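# detect_sentiment() returns one row per text: an overall Sentiment class plus
# Positive / Negative / Neutral / Mixed scores
head(USR_sentiment)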
# Translate & Analyze PSD article titles --------------------------------------------
PSD_titles <- PSD$title
PSD_translations <- NULL
for (i in seq_along(PSD_titles)) {
  PSD_translations <- c(PSD_translations, translate(PSD_titles[i], from = "auto", to = "en"))
}
PSD$trans_title <- PSD_translations
# PSD Sentiment Analysis
PSD_sentiment <- NULL
for (i in seq_along(PSD_translations)) {
  PSD_sentiment <- rbind(PSD_sentiment, detect_sentiment(PSD_translations[i]))
}
# Label instances as PSD for joining later
PSD_sentiment <- mutate(PSD_sentiment, Political_Party = 'PSD')
# Merged sentiment analysis -----------------------------------------------
PSD_USR_sentiment <-
  rbind(PSD_sentiment, USR_sentiment) %>%
  subset(select = -Index)
write.csv(PSD_USR_sentiment, "sentiment.csv")
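# How the overall sentiment classes break down by party:
count(PSD_USR_sentiment, Political_Party, Sentiment)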
library(ggplot2)
library(ggthemes)
# distribution of each sentiment score across both parties' titles
PSD_USR_sentiment %>%
  keep(is.numeric) %>%
  gather() %>%
  ggplot(aes(value)) +
  facet_wrap(~key, scales = "free") +
  geom_histogram(fill = "black", col = "salmon") +
  theme_bw()
PSD_USR_sentiment$Political_Party <- as.factor(PSD_USR_sentiment$Political_Party)
# positive vs. negative score per title, coloured by party
ggplot(PSD_USR_sentiment, aes(Positive, Negative, color = Political_Party)) +
  geom_point(size = 5) +
  theme_bw() +
  scale_color_wsj() # the points are coloured, not filled, so use the colour scale
# mean score per sentiment class for PSD, reshaped into long format for plotting
PSD_sentiment_summary <- PSD_sentiment %>%
  summarise(across(c(Positive, Neutral, Negative, Mixed), mean)) %>%
  pivot_longer(everything(), names_to = 'Sentiment', values_to = 'Weight')
ggplot(PSD_sentiment_summary, aes(y = reorder(Sentiment, Weight), x = Weight)) +
  geom_col(fill = "orangered4") +
  labs(y = 'Sentiment') +
  theme_bw()
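# Save the last chart to disk; the filename here is just a placeholder
ggsave("psd_sentiment_weights.png", width = 7, height = 5)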