Skip to content

Instantly share code, notes, and snippets.

@Protonk
Created April 12, 2011 16:53
Scraped data on user-reported marijuana prices
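## Libraries needed for scraping (XML) and graphing (ggplot2). When this was written, loading ggplot2 also attached reshape (which provides melt()); newer ggplot2 releases no longer do so.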
library(XML)
library(ggplot2);
# Build one summary-page URL per US state (state.name is the built-in vector
# of state names). There are probably smarter ways to do this, but this works
# well enough.
url.prefix <- "http://www.priceofweed.com/prices/United%20States/"
url.suffix <- ".html"
# Several state names contain spaces ("New York", "North Dakota", ...).
# Percent-encode each name so the whole URL is consistently escaped, matching
# the hand-encoded "United%20States" in the prefix.
state.uri <- paste0(
  url.prefix,
  vapply(state.name, URLencode, character(1), reserved = TRUE,
         USE.NAMES = FALSE),
  url.suffix
)
#' Scrape the per-state price summary from priceofweed.com.
#'
#' Quick and dirty (emphasis on dirty). It works as well as it does because
#' priceofweed.com has very structured web pages: the first table on each page
#' is the summary (trimmed mean) of reported prices. Only a minimal length
#' check is made on that table; cells are otherwise taken purely by position.
#'
#' @param sleep Non-negative number of seconds to pause between requests.
#' @param uris Character vector of page URLs to fetch. Defaults to the global
#'   `state.uri`.
#' @param states Row labels for the result, parallel to `uris`. Defaults to
#'   the built-in `state.name`.
#' @return A data frame with one row per entry of `uris` and the columns
#'   "High Quality", "Med. Quality", "Low Quality" (prices) and "HQ.n",
#'   "MQ.n", "LQ.n" (report counts). A page that cannot be fetched or parsed
#'   yields a row of NA and a warning instead of aborting the whole scrape.
weed.grab <- function(sleep = 1, uris = state.uri, states = state.name) {
  stopifnot(
    is.numeric(sleep), length(sleep) == 1, !is.na(sleep), sleep >= 0,
    length(uris) == length(states)
  )
  weed.name <- c("HQ.price", "MQ.price", "LQ.price", "HQ.n", "MQ.n", "LQ.n")
  # NA (not 0) as the fill value, so a row that was never scraped cannot be
  # mistaken for a genuine price of zero.
  weed.mat <- matrix(
    NA_real_, length(uris), length(weed.name),
    dimnames = list(states, weed.name)
  )
  for (i in seq_along(uris)) {
    weed.mat[i, ] <- tryCatch(
      {
        cells <- unlist(
          readHTMLTable(doc = uris[i], as.data.frame = FALSE)[[1]]
        )
        if (length(cells) < 12) {
          stop("summary table has fewer than 12 cells")
        }
        # Cells 6-8 hold the three prices; substring(, 2) drops their first
        # character (presumably the "$" sign) before conversion. Cells 10-12
        # hold the matching report counts.
        c(as.numeric(substring(cells[6:8], 2)), as.numeric(cells[10:12]))
      },
      error = function(e) {
        warning(
          "Could not scrape ", states[i], ": ", conditionMessage(e),
          call. = FALSE
        )
        rep(NA_real_, length(weed.name))
      }
    )
    # Be polite to the server, but do not wait after the final request.
    if (i < length(uris)) {
      Sys.sleep(sleep)
    }
  }
  weed.prices.df <- as.data.frame(weed.mat)
  names(weed.prices.df) <- c(
    "High Quality", "Med. Quality", "Low Quality", "HQ.n", "MQ.n", "LQ.n"
  )
  weed.prices.df
}
# weed.grab() is only defined above, never run. Scrape once here unless the
# prices are already in the workspace (e.g. from an earlier interactive run).
if (!exists("weed.prices.df")) {
  weed.prices.df <- weed.grab()
}
# Long format for plotting: one row per (state, quality) pair. Built with base
# R so it does not depend on melt() being attached as a side effect of loading
# ggplot2. Factor levels keep the original column order.
price.cols <- weed.prices.df[, 1:3]
weed.gg <- data.frame(
  Quality = factor(
    rep(names(price.cols), each = nrow(price.cols)),
    levels = names(price.cols)
  ),
  Price = unlist(price.cols, use.names = FALSE)
)
# Density of state-level prices, one curve per quality tier. ggtitle()
# replaces opts(title = ...), which is no longer part of ggplot2.
overall.den.plot <- ggplot(weed.gg, aes(x = Price, colour = Quality)) +
  geom_density() +
  ggtitle("Distribution of Marijuana Prices \n Across United States")
# I don't trust the reported quality, and I suspect that medium and high
# quality can be binned into one result. So we compute a per-state average of
# the medium- and high-quality prices, weighted by the number of reports.
# Column order after subsetting: HQ price, HQ count, MQ price, MQ count.
subset.weed <- weed.prices.df[, c(1, 4, 2, 5)]
weighted <- data.frame(row.names = state.name)
weighted$Average <-
  (subset.weed[, 1] * subset.weed[, 2] + subset.weed[, 3] * subset.weed[, 4]) /
  (subset.weed[, 2] + subset.weed[, 4])
weighted$Reports <- subset.weed[, 2] + subset.weed[, 4]
weighted$Region <- state.region
# ggtitle() replaces opts(title = ...), which is no longer part of ggplot2.
# The histogram title now says High/Med., matching the columns actually
# averaged above (the original said High/Low).
state.hist.plot <- ggplot(weighted, aes(x = Average)) +
  geom_histogram(binwidth = 20) +
  ggtitle("Weighted Average of High/Med. Quality")
region.dist.plot <- ggplot(weighted, aes(x = Average)) +
  geom_density() +
  facet_wrap(~ Region) +
  ggtitle("Price Distribution by Region")
# Your file path will vary, of course. I didn't link to an existing online
# table because most of them have lots of silly extraneous stuff.
density <- read.csv(file = "~/Downloads/statepopdens.csv", as.is = TRUE)
income <- read.csv(file = "~/Downloads/med-income.csv", as.is = TRUE)
# The columns are attached to `weighted` purely by position, so each CSV must
# have exactly one row per row of `weighted` (whose row names are state.name)
# and, presumably, be sorted in that same order -- verify against your files.
# Fail loudly here rather than let a short file be recycled or a missing
# column silently become a no-op assignment.
stopifnot(
  nrow(density) == nrow(weighted), !is.null(density$Density),
  nrow(income) == nrow(weighted), !is.null(income$Income)
)
weighted$Density <- density$Density
weighted$Income <- income$Income
# Density enters in logs because it was a bit overdispersed. Income would
# normally be logged too, but median income has a pretty tight distribution,
# so it is left in levels.
region.lm <- lm(
  formula = Average ~ Income + log(Density) + Region,
  data = weighted
)
# Same model, refit with sqrt(Reports) as weights.
# Do not do this at home. The sample size is of course not the inverse of
# variance!
region.wls <- update(region.lm, weights = sqrt(Reports))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment