Skip to content

Instantly share code, notes, and snippets.

@cosmin-ticu
Created December 7, 2020 19:52
Show Gist options
  • Save cosmin-ticu/f3c6328aadb1223f8a14cebd496f0f72 to your computer and use it in GitHub Desktop.
# Set up your R w/ AWS ----------------------------------------------------
# accessKeys.csv == the CSV downloaded from AWS containing your Access & Secret keys
keyTable <- read.csv("accessKeys.csv", header = TRUE)
AWS_ACCESS_KEY_ID <- as.character(keyTable$Access.key.ID)
AWS_SECRET_ACCESS_KEY <- as.character(keyTable$Secret.access.key)
# Activate the credentials for this session: the aws.* / paws packages read
# them from these environment variables.
Sys.setenv("AWS_ACCESS_KEY_ID" = AWS_ACCESS_KEY_ID,
           "AWS_SECRET_ACCESS_KEY" = AWS_SECRET_ACCESS_KEY,
           "AWS_DEFAULT_REGION" = "eu-west-1")
library(rvest)
library(data.table)
library(dplyr)
library(tidyverse)
# get_one_page_from_usr <- function(my_url_usr) {
#
# t <- read_html(my_url_usr)
#
# boxes <- t %>% html_nodes('.sg_news')
#
# img_url <- boxes %>% html_nodes('.imgWrap') %>% html_nodes('a') %>% html_nodes('img') %>% html_attr('src')
#
# box_dfs <- lapply(boxes, function(x){
#
# tl <- list()
#
# tl[['timestamp']] <- x %>% html_nodes('.date') %>% html_text()
# tl[['link_pic']] <- x %>% html_nodes('.imgWrap') %>% html_nodes('a') %>% html_nodes('img') %>% html_attr('src')
#
# return(tl)
#
# })
#
# df <- rbindlist(box_dfs, fill = T)
# return(img_url)
#
# }
# download.file(paste0(gigi$link_pic[1]), destfile = "test.png", mode = 'wb')
#
# counter <- 1
# for (i in gigi) {
# download.file(gigi[counter], destfile = paste0("test", counter,'.png'), mode = 'wb')
# counter <- counter + 1
# }
# working image downloader ------------------------------------------------------
# my_url_usr <- 'https://www.usrplus.ro/noutati?page=1'
# get_pic_links_from_usr <- function(my_url_usr) {
#
# t <- read_html(my_url_usr)
#
# boxes <- t %>% html_nodes('.sg_news')
#
# img_url <- boxes %>% html_nodes('.imgWrap') %>% html_nodes('a') %>% html_nodes('img') %>% html_attr('src')
#
# counter <- 1
# for (i in img_url) {
# download.file(img_url[counter], destfile = paste0("test", counter,'.png'), mode = 'wb')
# counter <- counter + 1
# }
#
# return(img_url)
#
# }
#
# test <- get_pic_links_from_usr(my_url_usr)
# USR iterative text & content scraper -----------------------------------------
# Scrape `pages_to_download` pages of USR news articles.
# Side effect: downloads each article thumbnail to usr/usr<N>.png.
# Returns a data.table with columns timestamp, title, link (absolute URL).
get_article_from_usr <- function(pages_to_download) {
  # Running image index across ALL pages so file names never collide;
  # advanced with <<- from inside the per-page closure.
  count <- 1
  # Make sure the output folder exists before download.file() is called.
  if (!dir.exists("usr")) {
    dir.create("usr")
  }
  links_to_get <- paste0('https://www.usrplus.ro/noutati?page=',
                         seq_len(pages_to_download))
  ret_df <- rbindlist(lapply(links_to_get, function(my_url_usr) {
    t <- read_html(my_url_usr)
    boxes <- t %>% html_nodes('.sg_news')
    img_url <- boxes %>% html_nodes('.imgWrap') %>%
      html_nodes('a') %>%
      html_nodes('img') %>%
      html_attr('src')
    # An empty src means "no thumbnail" -> mark NA so it is skipped below.
    img_url[img_url == ''] <- NA
    for (idx in seq_along(img_url)) {
      if (!is.na(img_url[idx])) {
        download.file(img_url[idx],
                      destfile = paste0("usr/usr", count, '.png'),
                      mode = 'wb')
      }
      # Advance even for skipped images so numbering stays aligned with
      # the article order across pages.
      count <<- count + 1
    }
    # One list (future data.table row) per article box.
    box_dfs <- lapply(boxes, function(x) {
      tl <- list()
      tl[['timestamp']] <- x %>% html_nodes('.date') %>%
        html_text() %>%
        trimws()
      tl[['title']] <- x %>% html_nodes('h5') %>%
        html_nodes('a') %>%
        html_text()
      # hrefs are site-relative; prefix the root to get absolute links.
      tl[['link']] <- paste0('https://www.usrplus.ro', x %>%
                               html_nodes('h5') %>%
                               html_nodes('a') %>%
                               html_attr('href'))
      return(tl)
    })
    rbindlist(box_dfs, fill = TRUE)
  }))
  return(ret_df)
}
# Scrape the first 5 pages of USR news (downloads images as a side effect).
USR <- get_article_from_usr(5)
# PSD working iterative scraper ---------------------------------------
# my_url_psd <- 'https://www.psd.ro/stiri/page/4'
# Scrape `pages_to_download` pages of PSD news articles.
# Mirrors get_article_from_usr(): downloads thumbnails to psd/psd<N>.png and
# returns a data.table with columns timestamp, title, link.
get_article_from_psd <- function(pages_to_download) {
  # Running image index across ALL pages (advanced via <<- in the closure).
  count <- 1
  # Make sure the output folder exists before download.file() is called.
  if (!dir.exists("psd")) {
    dir.create("psd")
  }
  links_to_get <- paste0('https://www.psd.ro/stiri/page/',
                         seq_len(pages_to_download))
  ret_df <- rbindlist(lapply(links_to_get, function(my_url_psd) {
    t <- read_html(my_url_psd)
    boxes <- t %>% html_nodes('.col-md-4')
    img_url <- as.character(boxes %>%
                              html_nodes('.article__featured-image') %>%
                              html_attr('src'))
    # An empty src means "no image" -> mark NA so it is skipped below.
    img_url[img_url == ''] <- NA
    for (idx in seq_along(img_url)) {
      if (!is.na(img_url[idx])) {
        download.file(img_url[idx],
                      destfile = paste0("psd/psd", count, '.png'),
                      mode = 'wb')
      }
      # Advance even for skipped images so numbering stays aligned.
      count <<- count + 1
    }
    # One list (future data.table row) per article box.
    box_dfs <- lapply(boxes, function(x) {
      tl <- list()
      tl[['timestamp']] <- x %>% html_nodes('.article__date') %>%
        html_text() %>%
        trimws()
      tl[['title']] <- x %>% html_nodes('.article__title') %>% html_text()
      tl[['link']] <- x %>% html_nodes('.article-link') %>% html_attr('href')
      return(tl)
    })
    rbindlist(box_dfs, fill = TRUE)
  }))
  return(ret_df)
}
# Scrape the first 4 pages of PSD news (downloads images as a side effect).
PSD <- get_article_from_psd(4)
# Load up into S3 ---------------------------------------------------------
library(aws.s3)
# Mirror both local image folders into the S3 bucket so Rekognition can
# read the pictures by object name (usrN.png / psdN.png).
s3sync(path = 'usr',
       bucket = 'cosmin-ceu-2020',
       direction = 'upload',
       verbose = TRUE,
       recursive = TRUE)
s3sync(path = 'psd',
       bucket = 'cosmin-ceu-2020',
       direction = 'upload',
       verbose = TRUE,
       recursive = TRUE)
# Amazon Rekognition PSD ------------------------------------------------
library(paws.machine.learning)
# Run Rekognition label detection on psd1.png .. psd<amount>.png stored in
# `bucket`, keeping at most `max_labels` labels per image.
# Returns one row per distinct label: Name and the mean Confidence across
# all inspected images (PSD_Confidence).
# `bucket` and `max_labels` default to the original hard-coded values, so
# existing calls are unaffected.
get_labels_psd <- function(amount = 5,
                           bucket = "cosmin-ceu-2020",
                           max_labels = 5) {
  svc <- paws.machine.learning::rekognition()
  picture <- paste0('psd', seq_len(amount), '.png')
  ret_df <- rbindlist(lapply(picture, function(x) {
    resp <- svc$detect_labels(list(
      S3Object = list(
        Bucket = bucket,
        Name = x
      )), MaxLabels = max_labels)
    # Keep only label name + confidence; unique() drops duplicate rows
    # produced when rbindlist expands the nested label structure.
    rbindlist(resp$Labels, fill = TRUE) %>%
      subset(select = c('Name', 'Confidence')) %>%
      unique()
  }))
  # Average confidence per label across all inspected pictures.
  ret_df %>%
    group_by(Name) %>%
    summarise(PSD_Confidence = mean(Confidence))
}
# Label the 9 PSD images that were uploaded to S3.
PSD_picture_labels <- get_labels_psd(9)
# keys <- colnames(test)[!grepl('Confidence',colnames(test))]
# X <- as.data.table(test)
# X[,list(mm= mean(Confidence)),keys]
# Amazon Rekognition USR ---------------------------------------------------
# Same contract as get_labels_psd(), but for usr1.png .. usr<amount>.png;
# the mean-confidence column is named USR_Confidence.
# `bucket` and `max_labels` default to the original hard-coded values, so
# existing calls are unaffected.
get_labels_usr <- function(amount = 5,
                           bucket = "cosmin-ceu-2020",
                           max_labels = 5) {
  svc <- paws.machine.learning::rekognition()
  picture <- paste0('usr', seq_len(amount), '.png')
  ret_df <- rbindlist(lapply(picture, function(x) {
    resp <- svc$detect_labels(list(
      S3Object = list(
        Bucket = bucket,
        Name = x
      )), MaxLabels = max_labels)
    # Keep only label name + confidence; unique() drops duplicate rows
    # produced when rbindlist expands the nested label structure.
    rbindlist(resp$Labels, fill = TRUE) %>%
      subset(select = c('Name', 'Confidence')) %>%
      unique()
  }))
  # Average confidence per label across all inspected pictures.
  ret_df %>%
    group_by(Name) %>%
    summarise(USR_Confidence = mean(Confidence))
}
# Label the 9 USR images that were uploaded to S3.
USR_picture_labels <- get_labels_usr(9)
# Join USR & PSD dfs and analysis -----------------------------------------
# Full join keeps labels detected for either party; the key is spelled out
# explicitly instead of relying on dplyr's implicit common-column matching.
trial <- full_join(USR_picture_labels, PSD_picture_labels, by = "Name")
write.csv(trial, "trial.txt")
# Translate USR & Analyze article titles ------------------------------------------------
library(aws.translate)
# Translate every scraped USR title (source language auto-detected -> English)
# via AWS Translate. The result vector is preallocated instead of grown with
# c() in the loop, and the loop covers all titles rather than a hard-coded 50.
USR_titles <- USR$title
USR_translations <- character(length(USR_titles))
for (i in seq_along(USR_titles)) {
  USR_translations[i] <- translate(USR_titles[i], from = "auto", to = "en")
}
USR$trans_title <- USR_translations
write.csv(USR, "scraped_USR.csv", fileEncoding = 'UTF-8')
# USR Sentiment Analysis
library(aws.comprehend)
# Score every translated USR title with AWS Comprehend. A single rbind over
# an lapply replaces the original grow-by-rbind loop (and its hard-coded 50).
USR_sentiment <- do.call(rbind, lapply(USR_translations, detect_sentiment))
# Label instances in dataframe as USR for joining later
USR_sentiment <- mutate(USR_sentiment, 'Political_Party' = 'USR')
# Translate & Analyze PSD article titles --------------------------------------------
library(aws.translate)
# Translate every scraped PSD title (auto-detected -> English); preallocated
# loop over all titles instead of growing with c() up to a hard-coded 48.
PSD_titles <- PSD$title
PSD_translations <- character(length(PSD_titles))
for (i in seq_along(PSD_titles)) {
  PSD_translations[i] <- translate(PSD_titles[i], from = "auto", to = "en")
}
PSD$trans_title <- PSD_translations
# PSD Sentiment Analysis
library(aws.comprehend)
# Score every translated PSD title with AWS Comprehend. A single rbind over
# an lapply replaces the original grow-by-rbind loop (and its hard-coded 48).
PSD_sentiment <- do.call(rbind, lapply(PSD_translations, detect_sentiment))
# Label instances in dataframe as PSD for joining later
PSD_sentiment <- mutate(PSD_sentiment, 'Political_Party' = 'PSD')
# Merged sentiment analysis -----------------------------------------------
# Stack both parties' sentiment scores, drop the per-call Index column,
# then persist the combined table to disk.
PSD_USR_sentiment <- rbind(PSD_sentiment, USR_sentiment)
PSD_USR_sentiment <- subset(PSD_USR_sentiment, select = -Index)
write.csv(PSD_USR_sentiment, "sentiment.csv")
library(ggplot2)
library(ggthemes)
# Distribution of each numeric sentiment score, one facet per score.
# pivot_longer() replaces the superseded gather() (tidyr is loaded via
# tidyverse above); explicit names keep the facet variable called "key".
PSD_USR_sentiment %>%
  keep(is.numeric) %>%
  pivot_longer(everything(), names_to = "key", values_to = "value") %>%
  ggplot(aes(value)) +
  facet_wrap(~key, scales = "free") +
  geom_histogram(fill= "black", col= "salmon")+
  theme_bw() +
  scale_fill_wsj()
# Positive vs negative score per article, coloured by party.
PSD_USR_sentiment$Political_Party <- as.factor(PSD_USR_sentiment$Political_Party)
# NOTE(review): the mapped aesthetic is colour, so scale_fill_wsj() has no
# visible effect here -- scale_colour_wsj() was probably intended. Kept
# as-is to preserve the current plot output.
ggplot(PSD_USR_sentiment, aes(Positive, Negative, color = Political_Party))+
  geom_point(size = 5)+
  theme_bw() +
  scale_fill_wsj()
# Mean weight of each PSD sentiment class, built directly as a data frame
# instead of the fragile t() / rbind.data.frame() / add_column() chain;
# same Weight/Sentiment columns in the same row order.
PSD_sentiment_summary <- data.frame(
  Weight = c(mean(PSD_sentiment$Positive),
             mean(PSD_sentiment$Neutral),
             mean(PSD_sentiment$Negative),
             mean(PSD_sentiment$Mixed)),
  Sentiment = c('Positive', 'Neutral', 'Negative', 'Mixed')
)
# Bar chart of mean sentiment weights, largest at the top.
# NOTE(review): fill is a constant here, so scale_fill_wsj() is a no-op;
# kept to preserve the current plot output.
ggplot(PSD_sentiment_summary, aes(y = reorder(Sentiment, Weight), x = Weight))+
  geom_col(fill = "orangered4")+
  labs(y = 'Sentiment')+
  theme_bw() +
  scale_fill_wsj()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment