Last active
August 29, 2015 14:05
-
-
Save jeroen/de21582824833dbf5d69 to your computer and use it in GitHub Desktop.
Batch parsing JSON streams with jsonlite
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The data stream is not a single JSON document: every line holds one JSON
# record. This example therefore collapses the lines into a JSON array by
# hand before parsing. The file is also quite large, so processing all of it
# requires reading it in batches. This first example only reads the first
# 20 records.
library(jsonlite)

# NOTE(review): plain-HTTP URL on a bare IP address; confirm it is still served.
sample_url <- "http://78.46.48.103/sample/hourly_14.json.gz"

# The connection is not opened here; readLines() opens it for the duration of
# the call and gzcon() decompresses the bytes on the fly.
gzdata <- gzcon(url(sample_url))
records <- tryCatch(
  readLines(gzdata, n = 20L),
  # Release the connection even when the download or decompression fails.
  finally = close(gzdata)
)

# Blank lines would turn into empty array elements and make the JSON invalid.
records <- records[nzchar(trimws(records))]

# Join the records with commas and wrap them in brackets: one JSON array.
json <- paste0("[", paste0(records, collapse = ","), "]")

# The deprecated `validate` argument is no longer passed; the parser rejects
# malformed JSON on its own.
weather <- fromJSON(json)
print(weather$city)
print(weather$data[[1]])
# Process the entire data set in batches of `batch_size` lines.
# This takes a while :-)
batch_size <- 100L

# The connection is opened explicitly (binary mode, since the payload is
# gzip-compressed) so that every readLines() call continues from the current
# position instead of re-opening the stream and starting over.
gzstream <- gzcon(
  url("http://78.46.48.103/sample/hourly_14.json.gz", open = "rb")
)

batches <- list()
i <- 1L
tryCatch(
  {
    # readLines() returns a zero-length vector once the stream is exhausted.
    while (length(records <- readLines(gzstream, n = batch_size)) > 0) {
      message("Batch ", i, ": found ", length(records), " lines of json...")

      # Blank lines would become empty array elements and break the parser.
      records <- records[nzchar(trimws(records))]
      if (length(records) > 0) {
        # Each line is one JSON record; join them into a single JSON array.
        json <- paste0("[", paste0(records, collapse = ","), "]")
        # The deprecated `validate` argument is dropped; malformed JSON still
        # raises an error from the parser.
        batches[[length(batches) + 1L]] <- fromJSON(json)
      }
      i <- i + 1L
    }
  },
  # Always release the connection, also when a batch fails to download or parse.
  finally = close(gzstream)
)

# Combine the per-batch data frames into one; rbind_pages() is the current
# name of the function formerly exported as rbind.pages().
weather <- rbind_pages(batches)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment