Skip to content

Instantly share code, notes, and snippets.

@flovv
Created October 25, 2016 05:31
Show Gist options
  • Save flovv/e2a8d1cf5fb0dd2abd0078887715dcce to your computer and use it in GitHub Desktop.
Save flovv/e2a8d1cf5fb0dd2abd0078887715dcce to your computer and use it in GitHub Desktop.
Gather article data from three news APIs (New York Times, ZEIT, Hacker News)
#devtools::install_github("ropengov/rtimes")
#devtools::install_github("chgrl/diezeit")
library(rtimes)
library(diezeit)
require(plyr)
require(lubridate)
require(ggthemes)
require(ggplot2)
###################
# Convert a list of (possibly nested, possibly ragged) records into a
# data.frame with one row per record and one column per distinct field name.
#
# Args:
#   l: a list of records; each record is a list (or vector) of fields.
#      NULL fields become NA; nested fields are flattened by unlist(), so
#      their column names are the dot-joined names unlist() produces.
#
# Returns:
#   A data.frame whose columns are the union of all flattened field names,
#   in order of first appearance. Fields missing from a record are NA.
#   Because rows are combined with rbind(), all columns share one type
#   (character as soon as any field is a string).
dataframeFromResult <- function(l) {
  flat <- lapply(l, function(x) {
    # vapply (not sapply) so an empty record yields logical(0) rather than
    # list(), which would make the sub-assignment below fail.
    x[vapply(x, is.null, logical(1))] <- NA
    unlist(x)
  })
  keys <- unique(unlist(lapply(flat, names)))
  rows <- lapply(flat, function(rec) {
    # An empty record flattens to NULL; represent it as an all-NA row so
    # that it still occupies one row of the result.
    row <- if (is.null(rec)) rep(NA, length(keys)) else rec[keys]
    setNames(row, keys)
  })
  # stringsAsFactors = FALSE keeps text columns as character on R < 4.0 too,
  # so callers can apply as.numeric()/as.Date() without hitting factor codes.
  data.frame(do.call(rbind, rows), stringsAsFactors = FALSE)
}
############# NYT
# Fetch one page of New York Times Article Search results as a data.frame.
#
# Args:
#   q:              search query string.
#   page:           result page number passed to as_search().
#   begin_date:     start of the date range, "YYYYMMDD".
#   end_date:       end of the date range, "YYYYMMDD".
#   NYTIMES_AS_KEY: API key. Defaults to a variable of the same name found
#                   from the global environment, else to the NYTIMES_AS_KEY
#                   environment variable.
#
# Returns:
#   dataframeFromResult() applied to the `data` element of the response.
getArticles <- function(q, page, begin_date = "19800101",
                        end_date = "20161010",
                        NYTIMES_AS_KEY = get0(
                          "NYTIMES_AS_KEY",
                          envir = globalenv(),
                          ifnotfound = Sys.getenv("NYTIMES_AS_KEY")
                        )) {
  # The previous default `NYTIMES_AS_KEY = NYTIMES_AS_KEY` referred to the
  # argument itself and failed with "promise already under evaluation"
  # whenever the key was not passed explicitly.
  #
  # as.character(): callers that build the argument grid with expand.grid()
  # hand over factors; make sure the labels (not the codes) are sent.
  res <- as_search(
    q = as.character(q),
    page = page,
    begin_date = begin_date,
    end_date = end_date,
    key = as.character(NYTIMES_AS_KEY)
  )
  dataframeFromResult(res$data)
}
# Return the metadata of a New York Times Article Search query (the `meta`
# element of the as_search() response), without fetching a specific page.
#
# Args:
#   q:              search query string.
#   begin_date:     start of the date range, "YYYYMMDD".
#   end_date:       end of the date range, "YYYYMMDD".
#   NYTIMES_AS_KEY: API key. Defaults to a variable of the same name found
#                   from the global environment, else to the NYTIMES_AS_KEY
#                   environment variable.
howManyArticles <- function(q, begin_date = "19800101",
                            end_date = "20161010",
                            NYTIMES_AS_KEY = get0(
                              "NYTIMES_AS_KEY",
                              envir = globalenv(),
                              ifnotfound = Sys.getenv("NYTIMES_AS_KEY")
                            )) {
  # The previous default `NYTIMES_AS_KEY = NYTIMES_AS_KEY` referred to the
  # argument itself and failed with "promise already under evaluation"
  # whenever the key was not passed explicitly.
  res <- as_search(
    q = q,
    begin_date = begin_date,
    end_date = end_date,
    key = NYTIMES_AS_KEY
  )
  res$meta
}
############### ZEIT
# Search the ZEIT content endpoint for `term` and return the matches as a
# data.frame with an added `date` column.
#
# Args:
#   term:  search term passed to zeit_search().
#   limit: maximum number of matches to request (default 1000, the value
#          that was previously hard-coded).
#
# Returns:
#   dataframeFromResult() applied to the `matches` element of the response,
#   plus a Date column `date` taken from the part of `release_date` that
#   precedes the first "T".
getZeitResults <- function(term, limit = 1000) {
  found <- zeit_search("content", term, print = FALSE, limit = limit)
  dff <- dataframeFromResult(found$matches)
  # Base sub() replaces stringr::str_split_fixed(), which was called although
  # stringr is never attached in this file. as.character() guards against
  # factor columns on R < 4.0.
  date_part <- sub("T.*$", "", as.character(dff$release_date))
  dff$date <- as.Date(date_part)
  dff
}
############################# NYT data set
# Download the first ten result pages for the query from the NYT Article
# Search API and plot the number of articles per publication year.
q <- '"digital transformation"'
pages <- 0:9

# The API key is not defined anywhere in this script: use an existing
# NYTIMES_AS_KEY variable if there is one, else the environment variable.
if (!exists("NYTIMES_AS_KEY")) {
  NYTIMES_AS_KEY <- Sys.getenv("NYTIMES_AS_KEY")
}
if (!nzchar(NYTIMES_AS_KEY)) {
  stop("NYTIMES_AS_KEY is not set; supply a NYT Article Search API key.",
       call. = FALSE)
}

# One row per getArticles() call. stringsAsFactors = FALSE keeps the query
# and the key as character; expand.grid() would otherwise make them factors.
dat <- expand.grid(
  page = pages,
  q = q,
  NYTIMES_AS_KEY = NYTIMES_AS_KEY,
  stringsAsFactors = FALSE
)
allArticles <- mdply(dat, getArticles)
#################
allArticles$PubYear <- year(allArticles$pub_date)
dd <- ddply(allArticles, .(PubYear), summarise, N = length(pub_date))
ggplot(dd, aes(PubYear, N)) +
  geom_point() +
  geom_smooth() +
  theme_economist(base_size = 12) +
  ylab("N - Number of articles per year in the NYT") +
  xlab("Publication Year")
###################
### ZEIT
# Fetch ZEIT matches for the German search term and plot the number of
# articles per publication year.
out <- getZeitResults("digitale Transformation")
out$PubYear <- year(out$date)
dd <- ddply(out, .(PubYear), summarise, N = length(date))
# The plot previously added geom_point() twice and applied theme_economist()
# twice, the second call silently discarding base_size = 12. Each layer is
# now added once, with the same base size as the NYT plot.
ggplot(dd, aes(PubYear, N)) +
  geom_point() +
  geom_smooth() +
  theme_economist(base_size = 12) +
  ylab("N - Number of articles per year in the ZEIT") +
  xlab("Publication Year")
############# Hackernews API calls
# library() instead of require(): fail immediately if a package is missing
# rather than continuing with a FALSE return value.
library(httr)
library(reshape2)

# Search Hacker News stories for the exact phrase via the Algolia API.
# Passing the parameters through `query` lets httr do the URL encoding.
res <- GET(
  "http://hn.algolia.com/api/v1/search",
  query = list(
    query = '"digital transformation"',
    tags = "story",
    hitsPerPage = "50"
  )
)
stop_for_status(res)
cont <- content(res, as = "parsed")

# The helper defined in this file is dataframeFromResult(); the previously
# called dataframeFromJSON() does not exist here.
df <- dataframeFromResult(cont$hits)

# Go through as.character() so that factor columns (R < 4.0) are converted
# by label, not by internal level code.
df$points <- as.numeric(as.character(df$points))
df$num_comments <- as.numeric(as.character(df$num_comments))
df$created_at <- as.Date(as.character(df$created_at))

ggplot(df, aes(created_at, points)) +
  geom_smooth() +
  geom_point(size = 1) +
  theme_economist()

# Yearly totals; na.rm = TRUE so one story with a missing value does not
# turn the whole year's total into NA.
df$year <- year(df$created_at)
dd <- ddply(
  df, .(year), summarise,
  Points = sum(points, na.rm = TRUE),
  Comments = sum(num_comments, na.rm = TRUE),
  Articles = length(year)
)
mm <- melt(dd, id.vars = "year")
ggplot(mm, aes(year, value)) +
  geom_point(size = 2, color = "red") +
  geom_line() +
  theme_economist() +
  facet_grid(~variable) +
  ggtitle("Hackernews search for digital transformation")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment