Created
July 26, 2010 02:49
-
-
Save astatham/490116 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Hourly scraper for reddit's /r/iama "new" listing.
#
# Each pass pages through the JSON listing (up to 100 submissions per
# request), flattens the fields of interest into a data frame, writes that
# frame to a "|"-delimited file and bulk-loads the file into the
# `submissions` table of IAMA.db through the sqlite3 command-line shell.
library(rjson)

# Strongly prefer fixed over scientific notation when numbers are converted
# to text (presumably so large values such as created_utc are written in full).
options(scipen=10000)

# Start from a fresh database on every launch.
if (file.exists("IAMA.db")) file.remove("IAMA.db")
schemaStatus <- system("sqlite3 IAMA.db < IAMA_scraper.schema;")
if (schemaStatus != 0) {
  # Without the table every later .import would fail, so stop early.
  stop("sqlite3 could not create IAMA.db from IAMA_scraper.schema (exit status ",
       schemaStatus, ")", call.=FALSE)
}

baseURL <- "http://www.reddit.com/r/iama/.json?sort=new&limit=100"
# Submission fields kept, in the column order expected by the schema.
fields <- c("id", "url", "created_utc", "num_comments", "ups", "downs", "author", "title")

while (TRUE) {
  pages <- list()
  after <- ""
  scraped <- Sys.time()
  cat(as.character(scraped), ": ", sep="")

  # Follow the listing's `after` cursor until the API stops returning one.
  while (!is.null(after)) {
    cat(after, " ")
    thisURL <- if (after != "") paste0(baseURL, "&after=", after) else baseURL
    temp <- suppressWarnings(readLines(thisURL))
    # clean out the selfhtml which sometimes breaks rjson parser :(
    temp <- gsub('"selftext_html": ".*?[^\\]"', '"selftext_html": ""', temp)
    # readLines() may return several lines; hand the parser a single string.
    temp <- fromJSON(paste(temp, collapse="\n"))$data
    after <- temp$after

    # An empty page would make do.call(rbind, ...) return NULL and break the
    # column assignment below, so only process pages that carry submissions.
    if (length(temp$children) > 0) {
      tempTab <- do.call(rbind, lapply(temp$children, function(x) {
        as.data.frame(x$data[fields], stringsAsFactors=FALSE)
      }))
      # Replace line breaks with spaces so every record stays on one line.
      # (chartr() cannot be used with a two-character `old` and a
      # one-character `new`: it signals an error when `old` is longer.)
      tempTab$title <- gsub("[\n\r]", " ", tempTab$title)
      pages[[length(pages) + 1]] <- tempTab
    }

    # Pause between requests.
    Sys.sleep(4)
  }
  cat("DONE\n")

  # Bind all pages once instead of growing the data frame inside the loop.
  bigTab <- do.call(rbind, pages)

  if (is.null(bigTab)) {
    # Nothing was scraped on this pass; there is nothing to write or import.
    cat("no submissions scraped, skipping import\n")
  } else {
    fname <- paste("IAMA_scrape_", format(scraped, "%a_%b_%d_%X_%Y"), ".tsv", sep="")
    # Scrape time as integer seconds since the epoch.
    bigTab$scraped <- as.integer(scraped)
    # NOTE(review): fields are written unquoted with "|" as the separator, so
    # a value that itself contains "|" would presumably yield a row with too
    # many columns for .import - confirm and sanitize if that occurs.
    write.table(bigTab, file=fname, quote=FALSE, sep="|", row.names=FALSE, col.names=FALSE)
    system(paste("sqlite3 IAMA.db '.import",fname, "submissions'"))
  }

  # Wait an hour before the next pass.
  Sys.sleep(3600)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Schema for the IAMA scraper database. The scraper bulk-loads one row per
-- scraped submission into this table on every pass, in this column order.
DROP TABLE IF EXISTS submissions;
CREATE TABLE submissions (
  id char NOT NULL,
  url char NOT NULL,
  created_utc integer NOT NULL,
  num_comments integer NOT NULL,
  ups integer NOT NULL,
  downs integer NOT NULL,
  author char NOT NULL,
  title char NOT NULL,
  -- Written by the scraper as as.integer(Sys.time()), i.e. integer seconds
  -- since the epoch, so it is declared integer like created_utc.
  scraped integer NOT NULL
);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment