Skip to content

Instantly share code, notes, and snippets.

@astatham
Created July 26, 2010 02:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save astatham/490116 to your computer and use it in GitHub Desktop.
Save astatham/490116 to your computer and use it in GitHub Desktop.
library(rjson)
# Strongly favour fixed over scientific notation when numbers are formatted;
# presumably so large values (e.g. epoch timestamps) are written out in full.
options(scipen = 10000)

# Start from an empty database: remove any existing file, then have the
# sqlite3 command-line shell create the tables from the schema file.
if (file.exists("IAMA.db")) {
  file.remove("IAMA.db")
}
system("sqlite3 IAMA.db < IAMA_scraper.schema;")
# Main scrape loop. Once an hour: page through the newest /r/iama submissions
# using reddit's JSON listing, dump the collected rows to a pipe-delimited
# file, and import that file into the `submissions` table of IAMA.db.
# Runs until the process is interrupted.
while (TRUE) {
  pages <- list()
  after <- ""
  baseURL <- "http://www.reddit.com/r/iama/.json?sort=new&limit=100"
  scraped <- Sys.time()
  cat(as.character(scraped), ": ", sep = "")

  # Each response carries an `after` token pointing at the next page; it is
  # NULL once the listing is exhausted, which ends the paging loop.
  while (!is.null(after)) {
    cat(after, " ")
    thisURL <- if (after != "") paste0(baseURL, "&after=", after) else baseURL

    # readLines() returns one element per line of the response; collapse them
    # so the whole document is handed to the parser as a single string.
    temp <- paste(suppressWarnings(readLines(thisURL)), collapse = "")
    # clean out the selftext_html which sometimes breaks the rjson parser :(
    temp <- gsub('"selftext_html": ".*?[^\\]"', '"selftext_html": ""', temp)
    temp <- fromJSON(temp)$data
    after <- temp$after

    # One single-row data frame per submission, stacked into a page table.
    # do.call(rbind, list()) yields NULL when the page has no children.
    tempTab <- do.call(rbind, lapply(temp$children, function(x) {
      as.data.frame(
        x$data[c("id", "url", "created_utc", "num_comments",
                 "ups", "downs", "author", "title")],
        stringsAsFactors = FALSE
      )
    }))

    if (!is.null(tempTab)) {
      # Titles are free text. Line breaks and the "|" field separator would
      # corrupt the unquoted pipe-delimited dump, so turn each into a space.
      # (The previous chartr("\n\r", " ", ...) call errored because `old` was
      # longer than `new`.)
      tempTab$title <- gsub("[\n\r|]", " ", tempTab$title)
      pages[[length(pages) + 1L]] <- tempTab
    }

    # Pause between page requests.
    Sys.sleep(4)
  }
  cat("DONE\n")

  # Bind all pages once instead of growing the table inside the loop.
  bigTab <- do.call(rbind, pages)

  # Skip the dump/import when nothing was scraped, rather than writing a
  # malformed one-column file.
  if (!is.null(bigTab) && nrow(bigTab) > 0) {
    # NOTE(review): %X is the locale's time representation; if it contains
    # spaces in some locale the unquoted .import command below would break.
    fname <- paste0("IAMA_scrape_", format(scraped, "%a_%b_%d_%X_%Y"), ".tsv")
    # Scrape start time as integer seconds since the epoch, same for all rows.
    bigTab$scraped <- as.integer(scraped)
    # Despite the .tsv extension the file is "|"-separated, with no header,
    # row names or quoting, as fed to sqlite3's .import below.
    write.table(bigTab, file = fname, quote = FALSE, sep = "|",
                row.names = FALSE, col.names = FALSE)
    system(paste("sqlite3 IAMA.db '.import", fname, "submissions'"))
  }

  # Wait an hour before the next full scrape.
  Sys.sleep(3600)
}
-- Schema for the scraper database. One row per submission per scrape run;
-- the column order must match the field order of the pipe-delimited dump
-- that the scraper imports with sqlite3's .import.
DROP TABLE IF EXISTS submissions;
CREATE TABLE submissions (
id char NOT NULL,
url char NOT NULL,
created_utc integer NOT NULL,
num_comments integer NOT NULL,
ups integer NOT NULL,
downs integer NOT NULL,
author char NOT NULL,
title char NOT NULL,
-- Scrape start time as integer seconds since the epoch (the scraper writes
-- as.integer(Sys.time())), so it is declared integer like created_utc.
scraped integer NOT NULL
);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment