Created
November 9, 2010 15:59
-
-
Save CerebralMastication/669278 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Thanks to @greghirson who totally knocked this out of the park! | |
# Based on his work, here's a simple script to pull monthly prices from the USDA NASS website. | |
library(RCurl) | |
library(XML) | |
x <- postForm(uri = "http://quickstats.nass.usda.gov/uuid/encode", | |
source_desc = "SURVEY", sector_desc="CROPS", | |
group_desc="FIELD CROPS", commodity_desc="CORN", | |
statisticcat_desc="PRICE RECEIVED", | |
short_desc="CORN, GRAIN - PRICE RECEIVED, MEASURED IN $ / BU", agg_level_desc="NATIONAL", | |
state_name = "US TOTAL", | |
year = "1977", | |
year = "1978", | |
year = "1979", | |
year = "1980", | |
year = "1981", | |
year = "1982", | |
year = "1983", | |
year = "1984", | |
year = "1985", | |
year = "1986", | |
year = "1987", | |
year = "1988", | |
year = "1989", | |
year = "1990", | |
year = "1991", | |
year = "1992", | |
year = "1993", | |
year = "1994", | |
year = "1995", | |
year = "1996", | |
year = "1997", | |
year = "1998", | |
year = "1999", | |
year = "2000", | |
year = "2001", | |
year = "2002", | |
year = "2003", | |
year = "2004", | |
year = "2005", | |
year = "2006", | |
year = "2007", | |
year = "2008", | |
year = "2009", | |
year = "2010", | |
freq_desc = "MONTHLY", | |
reference_period_desc= "JAN", | |
reference_period_desc= "FEB", | |
reference_period_desc= "MAR", | |
reference_period_desc= "APR", | |
reference_period_desc= "MAY", | |
reference_period_desc= "JUN", | |
reference_period_desc= "JUL", | |
reference_period_desc= "AUG", | |
reference_period_desc= "SEP", | |
reference_period_desc= "OCT", | |
reference_period_desc= "NOV", | |
reference_period_desc= "DEC", | |
breadcrumb = "reference_period_desc") | |
x2 <- gsub("\"", "", x) | |
y <- getURL(paste("http://quickstats.nass.usda.gov/results/", x2, sep = "")) | |
yparse <- htmlParse(y, asText=T) | |
path <- xpathSApply(yparse, "//a", xmlAttrs)[[1]]["href"] | |
datapath <- gsub(".url", "", strsplit(path, c("/"), fixed = TRUE)[[1]][3]) | |
usdaData <- read.csv(paste("http://quickstats.nass.usda.gov/data/spreadsheet/", datapath, ".csv", sep = "")) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# I turned the year and month lists into, well, lists. | |
# now it's a lot shorter | |
years <- 1970:2010 | |
yearList <- list(year=as.character(years)) | |
months <- c("JAN","FEB","MAR","APR","MAY","JUN","JUL","AUG","SEP","OCT","NOV","DEC") | |
monthList <- list(reference_period_desc= months) | |
myParams <- list(source_desc = "SURVEY", sector_desc="CROPS", | |
group_desc="FIELD CROPS", commodity_desc="CORN", | |
statisticcat_desc="PRICE RECEIVED", | |
short_desc="CORN, GRAIN - PRICE RECEIVED, MEASURED IN $ / BU", agg_level_desc="NATIONAL", | |
state_name = "US TOTAL", | |
freq_desc = "MONTHLY", | |
reference_period_desc= "JAN", | |
breadcrumb = "reference_period_desc") | |
myParams <- c(myParams, yearList, monthList) | |
x <- postForm(uri = "http://quickstats.nass.usda.gov/uuid/encode", | |
.params = myParams) | |
x2 <- gsub("\"", "", x) | |
y <- getURL(paste("http://quickstats.nass.usda.gov/results/", x2, sep = "")) | |
yparse <- htmlParse(y, asText=T) | |
path <- xpathSApply(yparse, "//a", xmlAttrs)[[1]]["href"] | |
datapath <- gsub(".url", "", strsplit(path, c("/"), fixed = TRUE)[[1]][3]) | |
usdaData <- read.csv(paste("http://quickstats.nass.usda.gov/data/spreadsheet/", datapath, ".csv", sep = "")) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This was my original challenge which I accidentally deleted:
What I want to do is pull down the data from this site and parse the resulting data into a CSV.
Here's the URL: http://quickstats.nass.usda.gov/#A8B7B992-BA0C-3ADB-895B-425DC85DB425
The page before submitting should look like this: http://ow.ly/i/5jET/original
When you submit, it sends the following POST to USDA:
16:42:49.188[291ms][total 291ms] Status: 200[OK]
POST http://quickstats.nass.usda.gov/uuid/encode Load Flags[LOAD_BYPASS_CACHE LOAD_BACKGROUND ] Content Size[38] Mime Type[text/html]
Request Headers:
Host[quickstats.nass.usda.gov]
User-Agent[Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.12) Gecko/20101027 Ubuntu/10.10 (maverick) Firefox/3.6.12 GTB7.1]
Accept[text/html,application/xhtml+xml,application/xml;q=0.9,/;q=0.8]
Accept-Language[en-us,en;q=0.5]
Accept-Encoding[gzip,deflate]
Accept-Charset[ISO-8859-1,utf-8;q=0.7,*;q=0.7]
Keep-Alive[115]
Connection[keep-alive]
Content-Type[application/x-www-form-urlencoded; charset=UTF-8]
X-Requested-With[XMLHttpRequest]
Referer[http://quickstats.nass.usda.gov/]
Content-Length[1181]
Cookie[filters={"group_desc":{"visible":true,"name":"group_desc","index":3,"selectedItems":["FIELD CROPS"],"label":"Group","sortOrder":"asc","busy":false},"agg_level_desc":{"visible":true,"name":"agg_level_desc","children":{"watershed_desc":{"index":false,"visible":true,"selectedItems":[],"name":"watershed_desc","sortOrder":"asc","label":"Watershed","children":false,"busy":false},"state_name":{"index":8,"visible":true,"selectedItems":["US TOTAL"],"name":"state_name","sortOrder":"asc","label":"State","children":{"asd_desc":{"index":false,"visible":true,"selectedItems":[],"name":"asd_desc","sortOrder":"asc","label":"Ag. District","children":false,"busy":false},"county_name":{"index":false,"visible":true,"selectedItems":[],"name":"county_name","sortOrder":"asc","label":"County","children":false,"busy":false},"zip_5":{"index":false,"visible":true,"selectedItems":[],"name":"zip_5","sortOrder":"asc","label":"Zip Code","children":false,"busy":false}},"busy":false},"region_desc":{"index":false,"visible":true,"selectedItems":[],"name":"region_desc","sortOrder":"asc","label":"Region","children":false,"busy":false}},"index":5,"selectedItems":["NATIONAL"],"sortOrder":"asc","label":"Locale","busy":false},"sector_desc":{"visible":true,"name":"sector_desc","index":1,"selectedItems":["CROPS"],"label":"Sector","sortOrder":"asc","busy":false},"commodity_desc":{"visible":true,"name":"commodity_desc","children":{"statisticcat_desc":{"index":6,"visible":true,"selectedItems":["PRICE RECEIVED"],"name":"statisticcat_desc","sortOrder":"asc","label":"Category","children":false,"busy":false},"short_desc":{"index":7,"visible":true,"selectedItems":["CORN, GRAIN - PRICE RECEIVED, MEASURED IN $ / BU"],"name":"short_desc","sortOrder":"asc","label":"Data Item","children":{"domain_desc":{"index":false,"visible":true,"selectedItems":[],"name":"domain_desc","sortOrder":"asc","label":"Domain","children":false,"busy":false}},"busy":false}},"index":4,"selectedItems":["CORN"],"sortOrder":"asc","label":"Commodity","busy":false},"source_desc":{"visible":true,"name":"source_desc","index":2,"selectedItems":["SURVEY"],"label":"Program","sortOrder":"asc","busy":false},"year":{"visible":true,"name":"year","children":{"freq_desc":{"index":10,"visible":true,"selectedItems":["MONTHLY"],"name":"freq_desc","sortOrder":"asc","label":"Frequency","children":{"reference_period_desc":{"index":11,"visible":true,"selectedItems":["JAN","FEB","MAR","APR","MAY","JUN","JUL","AUG","SEP","OCT","NOV","DEC"],"name":"reference_period_desc","sortOrder":"asc","label":"Refernece Period","children":false,"busy":false}},"busy":false}},"index":9,"selectedItems":["2010","2009","2008","2007","2006","2005","2004","2003","2002","2001","2000","1999","1998","1997","1996","1995","1994","1993","1992","1991","1990","1989","1988","1987","1986","1985","1984","1983","1982","1981","1980","1979","1978","1977"],"sortOrder":"desc","label":"Year","busy":false}}]
Pragma[no-cache]
Cache-Control[no-cache]
Post Data:
source_desc[SURVEY]
sector_desc[CROPS]
group_desc[FIELD%20CROPS]
commodity_desc[CORN]
statisticcat_desc[PRICE%20RECEIVED]
short_desc[CORN%2C%20GRAIN%20-%20PRICE%20RECEIVED%2C%20MEASURED%20IN%20%24%20%2F%20BU]
agg_level_desc[NATIONAL]
state_name[US%20TOTAL]
year[1977]
freq_desc[MONTHLY]
reference_period_desc[DEC]
breadcrumb[reference_period_desc]
Response Headers:
Date[Mon, 08 Nov 2010 22:41:23 GMT]
Server[Apache]
Content-Length[38]
Keep-Alive[timeout=5, max=99]
Connection[Keep-Alive]
Content-Type[text/html; charset=utf-8]
What I want to capture is the resulting table which looks like this: http://ow.ly/i/5jF0/original
What I want is the CSV that you would get if you clicked in the "spreadsheet" link in the upper right hand corner.