Created
April 12, 2011 16:53
Scraped data on user reported Marijuana Prices
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Packages used below: XML supplies readHTMLTable() for scraping, ggplot2
# supplies qplot()/facet_wrap()/opts() for plotting.
# melt() comes from reshape; per the original author's note, reshape is
# attached as a side effect of loading ggplot2 (true for the ggplot2 release
# this script was written against -- verify on your installation).
library(XML)
library(ggplot2)
# Build one summary-page URL per US state, in the same order as the built-in
# `state.name` vector (50 entries).
# The prefix already percent-encodes the space in "United States"; do the
# same for multi-word state names ("New York", "North Carolina", ...) so no
# URL contains a raw space.
url.prefix <- "http://www.priceofweed.com/prices/United%20States/"
url.suffix <- ".html"
state.uri <- paste0(
  url.prefix,
  gsub(" ", "%20", state.name, fixed = TRUE),
  url.suffix
)
# Scrape the per-state price summary from priceofweed.com.
#
# Quick and dirty: it relies on every state page having the same layout,
# with the first HTML table being the summary (per the original author, a
# trimmed mean of reported prices). No consistency checks are made on the
# parsed cells.
#
# Reads the globals `state.uri` (page URLs) and `state.name` (row labels).
#
# Args:
#   sleep: seconds to pause after each page request.
#
# Returns:
#   A data frame with one row per state (row names = state names) and the
#   columns "High Quality", "Med. Quality", "Low Quality" (prices) followed
#   by "HQ.n", "MQ.n", "LQ.n" (presumably the number of reports per grade).
weed.grab <- function(sleep = 1) {
  price.cols <- c("HQ.price", "MQ.price", "LQ.price")
  count.cols <- c("HQ.n", "MQ.n", "LQ.n")
  n.states <- 50

  scraped <- matrix(
    0,
    nrow = n.states,
    ncol = 6,
    dimnames = list(state.name, c(price.cols, count.cols))
  )

  for (idx in seq_len(n.states)) {
    # Flatten the first table on the page into a plain character vector.
    cells <- unlist(
      readHTMLTable(doc = state.uri[idx], as.data.frame = FALSE)[[1]]
    )
    # Cells 6-8 are the prices; substring(., 2) drops their first character
    # (presumably a currency symbol) before conversion.
    prices <- as.numeric(substring(cells[6:8], 2))
    # Cells 10-12 are converted as-is.
    counts <- as.numeric(cells[10:12])
    scraped[idx, ] <- c(prices, counts)
    # Pause between requests.
    Sys.sleep(sleep)
  }

  result <- as.data.frame(scraped)
  names(result) <- c("High Quality", "Med. Quality", "Low Quality", count.cols)
  result
}
# `weed.prices.df` is only a local variable inside weed.grab(); nothing in
# this script assigns it at top level. Run the scrape here unless the object
# already exists (e.g. from an earlier interactive session), so the rest of
# the script has data to work with.
if (!exists("weed.prices.df")) {
  weed.prices.df <- weed.grab()
}

# Reshape the three price columns to long form: one row per (quality, price).
weed.gg <- melt(weed.prices.df[, 1:3])
names(weed.gg) <- c("Quality", "Price")

# Density of state-level prices, one curve per quality grade.
# NOTE(review): opts() belongs to the older ggplot2 API this script targets;
# newer ggplot2 releases replace it with ggtitle()/theme() -- confirm against
# the installed version.
overall.den.plot <- qplot(Price, data = weed.gg, colour = Quality, geom = "density") +
  opts(title = "Distribution of Marijuana Prices \n Across United States")
# The self-reported quality grades are not trusted, and medium and high
# quality can plausibly be binned together. So compute, per state, the
# average of the high- and medium-quality prices weighted by their report
# counts.
# Columns picked: 1 = "High Quality", 4 = "HQ.n", 2 = "Med. Quality",
# 5 = "MQ.n".
subset.weed <- weed.prices.df[, c(1, 4, 2, 5)]
weighted <- data.frame(row.names = state.name)
weighted$Average <- (subset.weed[, 1] * subset.weed[, 2] +
  subset.weed[, 3] * subset.weed[, 4]) /
  (subset.weed[, 2] + subset.weed[, 4])
# Total number of high + medium quality reports behind each average.
weighted$Reports <- subset.weed[, 2] + subset.weed[, 4]
weighted$Region <- state.region

# Title fixed: the average combines High and Medium quality, not High/Low.
state.hist.plot <- qplot(Average, data = weighted, geom = "histogram", binwidth = 20) +
  opts(title = "Weighted Average of High/Medium Quality")
region.dist.plot <- qplot(Average, data = weighted, geom = "density") +
  facet_wrap(~ Region) +
  opts(title = "Price Distribution by Region")
# State-level covariates come from local CSV files; adjust the paths for your
# machine. The author did not link to an online table because most of them
# carry a lot of extraneous content.
# NOTE(review): the columns are attached by position, so both files are
# assumed to list the states in the same order as `state.name` -- confirm.
density <- read.csv(file = "~/Downloads/statepopdens.csv", as.is = TRUE)
income <- read.csv(file = "~/Downloads/med-income.csv", as.is = TRUE)

weighted$Density <- density$Density
weighted$Income <- income$Income
# Density enters in logs because it is rather overdispersed. Income would
# normally be logged as well, but median income has a tight enough
# distribution to leave it on its original scale.
region.lm <- lm(
  Average ~ Income + log(Density) + Region,
  data = weighted
)

# Same model, weighted by sqrt(Reports).
# Do not do this at home: the sample size is of course not the inverse of
# the variance!
region.wls <- update(region.lm, weights = sqrt(Reports))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment