# Set up your R w/ AWS ----------------------------------------------------
# accessKeys.csv == the CSV downloaded from AWS containing your Access & Secret keys
# (keep this file out of any public repo)
keyTable <- read.csv("accessKeys.csv", header = TRUE)
AWS_ACCESS_KEY_ID <- as.character(keyTable$Access.key.ID)
AWS_SECRET_ACCESS_KEY <- as.character(keyTable$Secret.access.key)
# activate the credentials for this session
Sys.setenv("AWS_ACCESS_KEY_ID" = AWS_ACCESS_KEY_ID,
           "AWS_SECRET_ACCESS_KEY" = AWS_SECRET_ACCESS_KEY,
           "AWS_DEFAULT_REGION" = "eu-west-1")
library(rvest)
library(data.table)
library(tidyverse) # attaches dplyr, tidyr, purrr, ggplot2, etc.
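# Optional sanity check before scraping: if the keys above were picked up
# correctly, listing your S3 buckets should succeed (assumes the aws.s3
# package, attached later for the upload, is already installed).
aws.s3::bucketlist()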
# Earlier drafts, kept for reference ---------------------------------------
# get_one_page_from_usr <- function(my_url_usr) {
#
#   t <- read_html(my_url_usr)
#
#   boxes <- t %>% html_nodes('.sg_news')
#
#   img_url <- boxes %>% html_nodes('.imgWrap') %>% html_nodes('a') %>% html_nodes('img') %>% html_attr('src')
#
#   box_dfs <- lapply(boxes, function(x){
#
#     tl <- list()
#
#     tl[['timestamp']] <- x %>% html_nodes('.date') %>% html_text()
#     tl[['link_pic']] <- x %>% html_nodes('.imgWrap') %>% html_nodes('a') %>% html_nodes('img') %>% html_attr('src')
#
#     return(tl)
#
#   })
#
#   df <- rbindlist(box_dfs, fill = T)
#   return(img_url)
#
# }
#
# download.file(paste0(gigi$link_pic[1]), destfile = "test.png", mode = 'wb')
#
# counter <- 1
# for (i in gigi) {
#   download.file(gigi[counter], destfile = paste0("test", counter,'.png'), mode = 'wb')
#   counter <- counter + 1
# }
# working image downloader ------------------------------------------------------
# my_url_usr <- 'https://www.usrplus.ro/noutati?page=1'
# get_pic_links_from_usr <- function(my_url_usr) {
#
#   t <- read_html(my_url_usr)
#
#   boxes <- t %>% html_nodes('.sg_news')
#
#   img_url <- boxes %>% html_nodes('.imgWrap') %>% html_nodes('a') %>% html_nodes('img') %>% html_attr('src')
#
#   counter <- 1
#   for (i in img_url) {
#     download.file(img_url[counter], destfile = paste0("test", counter,'.png'), mode = 'wb')
#     counter <- counter + 1
#   }
#
#   return(img_url)
#
# }
#
# test <- get_pic_links_from_usr(my_url_usr)
# USR iterative text & content scraper -----------------------------------------
get_article_from_usr <- function(pages_to_download) {
  count <- 1 # running index across all pages, used to name the image files
  dir.create('usr', showWarnings = FALSE) # download.file() needs the folder to exist
  links_to_get <- paste0('https://www.usrplus.ro/noutati?page=',
                         1:pages_to_download)
  ret_df <- rbindlist(lapply(links_to_get, function(my_url_usr){
    t <- read_html(my_url_usr)
    boxes <- t %>% html_nodes('.sg_news')
    img_url <- boxes %>% html_nodes('.imgWrap') %>%
      html_nodes('a') %>%
      html_nodes('img') %>%
      html_attr('src')
    img_url[img_url == ''] <- NA # articles without a picture yield an empty src
    for (i in seq_along(img_url)) {
      if (!is.na(img_url[i])) {
        download.file(img_url[i],
                      destfile = paste0("usr/usr", count, '.png'),
                      mode = 'wb')
        # advance the filename counter only on success so usr1..usrN stay contiguous
        count <<- count + 1
      }
    }
    box_dfs <- lapply(boxes, function(x){
      tl <- list()
      tl[['timestamp']] <- x %>% html_nodes('.date') %>%
        html_text() %>%
        trimws()
      tl[['title']] <- x %>% html_nodes('h5') %>%
        html_nodes('a') %>%
        html_text()
      tl[['link']] <- paste0('https://www.usrplus.ro', x %>%
                               html_nodes('h5') %>%
                               html_nodes('a') %>%
                               html_attr('href'))
      return(tl)
    })
    df <- rbindlist(box_dfs, fill = TRUE)
    return(df)
  }))
  return(ret_df)
}
USR <- get_article_from_usr(5)
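# Quick look at what came back: one row per article with timestamp, title and link
str(USR)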
# PSD working iterative scraper ---------------------------------------
# my_url_psd <- 'https://www.psd.ro/stiri/page/4'
get_article_from_psd <- function(pages_to_download) {
  count <- 1 # running index across all pages, used to name the image files
  dir.create('psd', showWarnings = FALSE)
  links_to_get <- paste0('https://www.psd.ro/stiri/page/', 1:pages_to_download)
  ret_df <- rbindlist(lapply(links_to_get, function(my_url_psd){
    t <- read_html(my_url_psd)
    boxes <- t %>% html_nodes('.col-md-4')
    img_url <- as.character(boxes %>% html_nodes('.article__featured-image') %>% html_attr('src'))
    img_url[img_url == ''] <- NA
    for (i in seq_along(img_url)) {
      if (!is.na(img_url[i])) {
        download.file(img_url[i], destfile = paste0("psd/psd", count, '.png'), mode = 'wb')
        count <<- count + 1
      }
    }
    box_dfs <- lapply(boxes, function(x){
      tl <- list()
      tl[['timestamp']] <- x %>% html_nodes('.article__date') %>% html_text() %>% trimws()
      tl[['title']] <- x %>% html_nodes('.article__title') %>% html_text()
      tl[['link']] <- x %>% html_nodes('.article-link') %>% html_attr('href')
      return(tl)
    })
    df <- rbindlist(box_dfs, fill = TRUE)
    return(df)
  }))
  return(ret_df)
}
PSD <- get_article_from_psd(4)
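# Articles without a picture are skipped, so check how many images actually
# landed on disk before syncing to S3:
length(list.files('usr'))
length(list.files('psd'))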
# Load up into S3 ---------------------------------------------------------
library(aws.s3)
s3sync(path = 'usr',
       bucket = 'cosmin-ceu-2020',
       direction = 'upload',
       verbose = TRUE,
       recursive = TRUE)
s3sync(path = 'psd',
       bucket = 'cosmin-ceu-2020',
       direction = 'upload',
       verbose = TRUE,
       recursive = TRUE)
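# Verify the upload by listing the bucket's keys:
head(get_bucket_df('cosmin-ceu-2020')$Key)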
# Amazon Rekognition PSD ------------------------------------------------
library(paws.machine.learning)
get_labels_psd <- function(amount = 5) {
  svc <- paws.machine.learning::rekognition()
  picture <- paste0('psd', 1:amount, '.png')
  ret_df <- rbindlist(lapply(picture, function(x){
    # ask Rekognition for the top 5 labels of each image sitting in the bucket
    resp <- svc$detect_labels(
      Image = list(
        S3Object = list(
          Bucket = "cosmin-ceu-2020",
          Name = x
        )),
      MaxLabels = 5)
    df <- rbindlist(resp$Labels, fill = TRUE) %>%
      subset(select = c('Name', 'Confidence')) %>%
      unique()
    return(df)
  }))
  # average the confidence per label across all inspected pictures
  out_df <- ret_df %>% group_by(Name) %>% summarise(PSD_Confidence = mean(Confidence))
  return(out_df)
}
PSD_picture_labels <- get_labels_psd(9)
# scratch: data.table alternative for the per-label aggregation above
# keys <- colnames(test)[!grepl('Confidence', colnames(test))]
# X <- as.data.table(test)
# X[, list(mm = mean(Confidence)), keys]
# Amazon Rekognition USR ---------------------------------------------------
get_labels_usr <- function(amount = 5) {
  svc <- paws.machine.learning::rekognition()
  picture <- paste0('usr', 1:amount, '.png')
  ret_df <- rbindlist(lapply(picture, function(x){
    resp <- svc$detect_labels(
      Image = list(
        S3Object = list(
          Bucket = "cosmin-ceu-2020",
          Name = x
        )),
      MaxLabels = 5)
    df <- rbindlist(resp$Labels, fill = TRUE) %>%
      subset(select = c('Name', 'Confidence')) %>%
      unique()
    return(df)
  }))
  out_df <- ret_df %>% group_by(Name) %>% summarise(USR_Confidence = mean(Confidence))
  return(out_df)
}
USR_picture_labels <- get_labels_usr(9)
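# Peek at the highest-confidence labels on each side:
USR_picture_labels %>% arrange(desc(USR_Confidence)) %>% head()
PSD_picture_labels %>% arrange(desc(PSD_Confidence)) %>% head()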
# Join USR & PSD dfs and analysis -----------------------------------------
trial <- full_join(USR_picture_labels, PSD_picture_labels, by = "Name")
write.csv(trial, "trial.csv")
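# Labels found in both parties' pictures (an NA confidence means the label
# only showed up for one side):
trial %>% filter(!is.na(USR_Confidence), !is.na(PSD_Confidence))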
# Translate USR & Analyze article titles ------------------------------------------------
library(aws.translate)
USR_titles <- USR$title
# translate every scraped title into English (source language is auto-detected)
USR_translations <- NULL
for (i in seq_along(USR_titles)) {
  USR_translations <- c(USR_translations,
                        translate(USR_titles[i],
                                  from = "auto", to = "en"))
}
USR$trans_title <- USR_translations
write.csv(USR, "scraped_USR.csv", fileEncoding = 'UTF-8')
# USR Sentiment Analysis
library(aws.comprehend)
USR_sentiment <- NULL
for (i in seq_along(USR_translations)) {
  USR_sentiment <- rbind(USR_sentiment,
                         detect_sentiment(USR_translations[i]))
}
# Label instances in dataframe as USR for joining later
USR_sentiment <- mutate(USR_sentiment, Political_Party = 'USR')
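# detect_sentiment() returns one row per text: an overall Sentiment class plus
# Positive / Negative / Neutral / Mixed scores
head(USR_sentiment)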
# Translate & Analyze PSD article titles --------------------------------------------
PSD_titles <- PSD$title
PSD_translations <- NULL
for (i in seq_along(PSD_titles)) {
  PSD_translations <- c(PSD_translations, translate(PSD_titles[i], from = "auto", to = "en"))
}
PSD$trans_title <- PSD_translations
# PSD Sentiment Analysis
PSD_sentiment <- NULL
for (i in seq_along(PSD_translations)) {
  PSD_sentiment <- rbind(PSD_sentiment, detect_sentiment(PSD_translations[i]))
}
# Label instances as PSD for joining later
PSD_sentiment <- mutate(PSD_sentiment, Political_Party = 'PSD')
# Merged sentiment analysis -----------------------------------------------
PSD_USR_sentiment <-
  rbind(PSD_sentiment, USR_sentiment) %>%
  subset(select = -Index)
write.csv(PSD_USR_sentiment, "sentiment.csv")
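# How the overall sentiment classes break down by party:
count(PSD_USR_sentiment, Political_Party, Sentiment)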
library(ggplot2)
library(ggthemes)
# distribution of each sentiment score across both parties' titles
PSD_USR_sentiment %>%
  keep(is.numeric) %>%
  gather() %>%
  ggplot(aes(value)) +
  facet_wrap(~key, scales = "free") +
  geom_histogram(fill = "black", col = "salmon") +
  theme_bw()
PSD_USR_sentiment$Political_Party <- as.factor(PSD_USR_sentiment$Political_Party)
# positive vs. negative score per title, coloured by party
ggplot(PSD_USR_sentiment, aes(Positive, Negative, color = Political_Party)) +
  geom_point(size = 5) +
  theme_bw() +
  scale_color_wsj() # the points are coloured, not filled, so use the colour scale
# mean score per sentiment class for PSD, reshaped into long format for plotting
PSD_sentiment_summary <- PSD_sentiment %>%
  summarise(across(c(Positive, Neutral, Negative, Mixed), mean)) %>%
  pivot_longer(everything(), names_to = 'Sentiment', values_to = 'Weight')
ggplot(PSD_sentiment_summary, aes(y = reorder(Sentiment, Weight), x = Weight)) +
  geom_col(fill = "orangered4") +
  labs(y = 'Sentiment') +
  theme_bw()
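# Save the last chart to disk; the filename here is just a placeholder
ggsave("psd_sentiment_weights.png", width = 7, height = 5)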