Skip to content

Instantly share code, notes, and snippets.

@flovv
Last active September 25, 2020 05:10
Show Gist options
  • Save flovv/7fda29822024d5c07b4043fd6195c544 to your computer and use it in GitHub Desktop.
Save flovv/7fda29822024d5c07b4043fd6195c544 to your computer and use it in GitHub Desktop.
Small script to scrape r-bloggers social data by author
require(httr)
require(stringr)
require(rvest)
# Read the pagination label of a listing page and return what follows "of ".
#
# url: address of a page that contains a ".pages" element (presumably a label
#      such as "Page 1 of 5" -- TODO confirm against the live markup).
# Returns a character vector: for every ".pages" match, the text after the
# first "of " (an empty string when the label has no "of "). The caller is
# responsible for converting it to a number.
getNumberOfPages <- function(url){
  page <- read_html(url)
  pagerNodes <- html_nodes(page, ".pages")
  pagerText <- as.character(html_text(pagerNodes))
  # Splitting into exactly two columns keeps everything after the first "of "
  # together in column 2.
  pieces <- str_split_fixed(pagerText, pattern="of ", n=2)
  return(pieces[,2])
}
###################
# Collect the href attribute of every anchor nested in a paragraph of a page.
#
# url: address of the page to download and parse.
# Returns a character vector with one entry per node matched by "p a".
getLinksForPage <- function(url){
  page <- read_html(url)
  anchors <- html_nodes(page, "p a")
  hrefs <- html_attr(anchors, "href")
  as.character(hrefs)
}
# Gather the links of listing pages 0..numPages of an archive.
#
# baseURL:  URL prefix ending in ".../page/"; the page index and a trailing
#           "/" are appended to it.
# numPages: index of the last page to request. Pages 0 through numPages are
#           all fetched, in that order.
# Returns the results of getLinksForPage() for every page, concatenated in
# page order.
getAllLinks <- function(baseURL, numPages){
  # paste0() is vectorised, so all page URLs are built in one call.
  pageURLs <- paste0(baseURL, 0:numPages, "/")
  # Collect per-page results in a list and flatten once, instead of
  # re-copying a growing vector with c() on every iteration.
  linksPerPage <- lapply(pageURLs, getLinksForPage)
  unlist(linksPerPage, use.names = FALSE)
}
########### get social-share counts and metadata for a single post
# Collect social-share counts and page metadata for a single post.
#
# url2: URL of the post to evaluate.
# Returns a one-row data.frame with the columns comments, shares, linkedIn1,
# linkedIn2, url, orgLink and pubDate. Anything that is absent from an API
# response or from the page is reported as NA, so every call produces the
# same columns and the rows of several calls can be stacked with rbind().
getEvaluation <- function(url2){
  # First element of x, or NA when x is NULL or empty.
  firstOrNA <- function(x) {
    if (length(x) == 0) NA else x[[1]]
  }
  # Follow a chain of list fields by name; NA as soon as one is missing or
  # the response is not a list at all.
  fieldOrNA <- function(x, ...) {
    for (key in c(...)) {
      if (!is.list(x)) {
        return(NA)
      }
      x <- x[[key]]
    }
    firstOrNA(x)
  }

  # Passing the post URL through `query` lets httr encode it, so characters
  # such as "&" or "#" in url2 cannot break the request's query string.
  ret <- GET("http://graph.facebook.com/", query = list(id = url2))
  FB <- content(ret)
  ret2 <- GET("http://www.linkedin.com/countserv/count/share",
              query = list(url = url2, format = "json"))
  linkedIn <- content(ret2)

  ## read the original link to website .. so matching with GA data is painless.
  rb <- read_html(url2)
  orgLink <- rb %>%
    html_nodes(".social4i+ div") %>%
    as.character()
  pubDate <- rb %>%
    html_nodes(".date") %>%
    html_text() %>%
    as.character()
  # Keep what sits between the first 'href="' and the following '">'.
  orgLink <- str_split_fixed(str_split_fixed(orgLink, 'href=\"', 2)[,2], "\">", 2)[,1]

  # Every column is reduced to exactly one value: a selector that matches
  # nothing yields NA and one that matches several nodes uses the first, so
  # the result is always a single row.
  data.frame(
    comments = fieldOrNA(FB, "share", "comment_count"),
    shares = fieldOrNA(FB, "share", "share_count"),
    linkedIn1 = fieldOrNA(linkedIn, "count"),
    linkedIn2 = fieldOrNA(linkedIn, "fCntPlusOne"),
    url = url2,
    orgLink = as.character(firstOrNA(orgLink)),
    pubDate = as.character(firstOrNA(pubDate))
  )
}
#############
# Author archive to scrape; listing pages live at <baseURL><index>/.
baseURL <- "https://www.r-bloggers.com/author/florian-teschner/page/"
# Page 0 is only used to read the total number of pages.
url <- paste0(baseURL, "0/")
# Keep a single value even if the page carries more than one ".pages" label.
numPages <- as.numeric(getNumberOfPages(url))[1]
if (is.na(numPages)) {
  stop("Could not read the number of pages from ", url, call. = FALSE)
}
links <- getAllLinks(baseURL, numPages)
# Evaluate every post and stack the rows once. This always starts from a
# clean result (no remove()/exists() bookkeeping on a leftover `dff`) and
# avoids re-copying the data frame with rbind() on every iteration.
dff <- do.call(rbind, lapply(links, getEvaluation))
# Google Analytics side: authenticate, look up the available accounts and
# metadata, then download page views per landing page.
library(googleAnalyticsR)
googleAnalyticsR::ga_auth()
account_list <- googleAnalyticsR::google_analytics_account_list()
ga_id <- xxx # your GA_id here
metrics <- googleAnalyticsR::google_analytics_meta()
###########
# One row per landing page path for the fixed reporting window below.
ga_df <- googleAnalyticsR::google_analytics_4(
  ga_id,
  date_range = c("2015-12-01", "2017-11-14"),
  metrics = "ga:pageviews",
  dimensions = "ga:landingPagePath",
  anti_sample = TRUE
)
########################
require(lubridate)
require(corrplot)
# Reduce the r-bloggers URL to its bare slug. The patterns are literal text,
# so they are wrapped in fixed() to stop "." acting as a regex wildcard.
dff$url <- str_replace_all(dff$url, fixed("https://www.r-bloggers.com/"), "")
dff$url <- str_replace_all(dff$url, fixed("/"), "")
# Strip the known site prefixes from the original link (longest first, so the
# "www.nypon.de/" variant is removed as a whole), leaving only the path.
dff$orgLink <- str_replace_all(
  dff$orgLink, fixed("https://flovv.github.io/www.nypon.de/"), ""
)
dff$orgLink <- str_replace_all(
  dff$orgLink, fixed("https://flovv.github.io/"), ""
)
dff$orgLink <- str_replace_all(
  dff$orgLink, fixed("https://flovv.github.com/"), ""
)
# A leading "/" is prepended so the path can be matched against the
# landingPagePath column of ga_df in the merge below.
dff$orgLink <- paste0("/", dff$orgLink)
dff$pubDate <- str_replace_all(dff$pubDate, fixed(","), "")
dff$Date <- mdy(dff$pubDate)
# Both key columns are named explicitly; the previous call only worked
# because the unnamed "landingPagePath" was matched to `by` and `by.y`
# defaults to `by`.
dff <- merge(dff, ga_df, by.x = "orgLink", by.y = "landingPagePath")
saveRDS(dff, "contentEvaluation.rds")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment