## Protonk/weedprice.R

## weedprice.R
##Libraries needed for graphing and scraping.  Reshape is loaded when ggplot2 is loaded
library(XML)
library(ggplot2);

# Build the summary-page URL for every state: prefix + state name + ".html".
# state.name is R's built-in vector of US state names.
url.prefix <- "http://www.priceofweed.com/prices/United%20States/"
url.suffix <- ".html"
# Multi-word names such as "New York" contain spaces, which are not valid in a
# URL.  Percent-encode them the same way the prefix encodes "United States".
state.uri <- paste0(url.prefix, gsub(" ", "%20", state.name, fixed = TRUE), url.suffix)


# Scrape the summary price table for every state page listed in state.uri.
#
# This is quick and dirty (emphasis on dirty).  It relies on priceofweed.com
# having very structured pages: the first table on each page is the summary
# (trimmed mean) of reported prices.  Once that table is flattened, elements
# 6:8 are read as the high/medium/low quality prices and elements 10:12 as the
# matching report counts.  NOTE(review): that layout is inferred from the
# indices used below -- confirm against a live page.
#
# Args:
#   sleep: seconds to pause after each request (default 1).
# Returns:
#   A data frame with one row per state (row names are the state names) and
#   the columns "High Quality", "Med. Quality", "Low Quality", "HQ.n", "MQ.n"
#   and "LQ.n".
weed.grab<- function(sleep=1) {
	weed.name<- c("HQ.price", "MQ.price","LQ.price","HQ.n","MQ.n","LQ.n")
	# Size the result from the state list rather than a hard-coded 50.
	n.states<- length(state.name)
	weed.mat<- matrix(NA_real_, n.states, length(weed.name), dimnames=list(state.name, weed.name))
	for (i in seq_len(n.states)) {
		state.int<- unlist(readHTMLTable(doc=state.uri[i], as.data.frame=FALSE)[[1]])
		# substring(., 2) drops the first character of each price
		# (presumably the currency sign) before the numeric conversion.
		weed.mat[i,]<- c(as.numeric(substring(state.int[6:8], 2)), as.numeric(state.int[10:12]))
		# A short or re-arranged table yields NA instead of an error; report
		# which state was affected rather than letting it pass unnoticed.
		if (any(is.na(weed.mat[i,]))) {
			warning("Could not parse every price field for ", state.name[i], call.=FALSE)
		}
		Sys.sleep(sleep)
	}
	weed.prices.df<- as.data.frame(weed.mat)
	names(weed.prices.df)<- c("High Quality","Med. Quality","Low Quality","HQ.n","MQ.n","LQ.n")
	weed.prices.df
}


# weed.grab() only returns the scraped table, so run it here unless an earlier
# (e.g. interactive) call already left weed.prices.df in the workspace.
if (!exists("weed.prices.df")) {
	weed.prices.df <- weed.grab()
}

# Long format for plotting: one row per state/quality pair.  Built with base R
# so the script does not rely on melt() being attached as a side effect of
# loading ggplot2.
price.cols <- weed.prices.df[, 1:3]
weed.gg <- data.frame(
	Quality = factor(rep(names(price.cols), each = nrow(price.cols)), levels = names(price.cols)),
	Price = unlist(price.cols, use.names = FALSE)
)
# ggtitle() is used instead of opts(title = ...), which current ggplot2
# releases no longer provide.
overall.den.plot <- qplot(Price, data = weed.gg, colour = Quality, geom = "density") +
	ggtitle("Distribution of Marijuana Prices \n Across United States")

# The self-reported quality grades are not very trustworthy, and medium and
# high quality can plausibly be binned into one result.  Combine the two into
# a single per-state price, weighting each grade's price by its report count.
# Columns are picked by name (as set by weed.grab) instead of by position.
subset.weed <- weed.prices.df[, c("High Quality", "HQ.n", "Med. Quality", "MQ.n")]
weighted <- data.frame(row.names = state.name)
weighted$Average <- (subset.weed[["High Quality"]] * subset.weed[["HQ.n"]] +
	subset.weed[["Med. Quality"]] * subset.weed[["MQ.n"]]) /
	(subset.weed[["HQ.n"]] + subset.weed[["MQ.n"]])
weighted$Reports <- subset.weed[["HQ.n"]] + subset.weed[["MQ.n"]]
# state.region is the built-in factor of census regions, in state.name order.
weighted$Region <- state.region
# The average mixes high and medium quality, so the title says so (it used to
# read "High/Low").  ggtitle() is used instead of opts(title = ...), which
# current ggplot2 releases no longer provide.
state.hist.plot <- qplot(Average, data = weighted, geom = "histogram", binwidth = 20) +
	ggtitle("Weighted Average of High/Medium Quality")
region.dist.plot <- qplot(Average, data = weighted, geom = "density") +
	facet_wrap(~ Region) +
	ggtitle("Price Distribution by Region")

# State-level covariates read from local CSV files.  Your file paths will vary
# of course; no online table is linked because most of them carry lots of
# extraneous columns.
# NOTE(review): the values are attached by position, so both files are assumed
# to list the states in the same order as state.name -- confirm.
density <- read.csv(file = "~/Downloads/statepopdens.csv", as.is = TRUE)
income <- read.csv(file = "~/Downloads/med-income.csv", as.is = TRUE)
# Assigning a missing column (NULL) or one whose length divides the row count
# does not fail, so check the sizes explicitly before attaching.
stopifnot(
	length(density$Density) == nrow(weighted),
	length(income$Income) == nrow(weighted)
)
weighted$Density <- density$Density
weighted$Income <- income$Income

# Baseline OLS fit of the weighted price on income, log density and region.
# Density enters in logs because it was a bit overdispersed.  Income would
# normally be logged too, but median income has a pretty tight distribution.
region.lm <- lm(
	Average ~ Income + log(Density) + Region,
	data = weighted
)

# Same specification, re-fit with sqrt(Reports) as case weights.
# Do not do this at home: the sample size is of course not the inverse of
# the variance!
region.wls <- update(region.lm, weights = sqrt(Reports))
	##Libraries needed for graphing and scraping. Reshape is loaded when ggplot2 is loaded
	library(XML)
	library(ggplot2);

	# One summary-page URL per state: prefix + state name + ".html".
	url.prefix <- "http://www.priceofweed.com/prices/United%20States/"
	url.suffix <- ".html"
	# Spaces in multi-word state names are not valid in a URL, so encode them
	# as %20, matching how the prefix encodes "United States".
	state.uri <- paste0(url.prefix, gsub(" ", "%20", state.name, fixed = TRUE), url.suffix)


	# Scrape the summary price table for every state page listed in state.uri.
	#
	# Quick and dirty: it works because priceofweed.com has very structured
	# pages, where the first table on each page is the summary (trimmed mean)
	# of reported prices.  Elements 6:8 of the flattened table are read as the
	# high/medium/low quality prices and elements 10:12 as the report counts.
	# NOTE(review): that layout is inferred from the indices used below --
	# confirm against a live page.
	#
	# Args:
	#   sleep: seconds to pause after each request (default 1).
	# Returns:
	#   A data frame with one row per state (row names are the state names)
	#   and the columns "High Quality", "Med. Quality", "Low Quality", "HQ.n",
	#   "MQ.n" and "LQ.n".
	weed.grab<- function(sleep=1) {
		weed.name<- c("HQ.price", "MQ.price","LQ.price","HQ.n","MQ.n","LQ.n")
		# Size the result from the state list rather than a hard-coded 50.
		n.states<- length(state.name)
		weed.mat<- matrix(NA_real_, n.states, length(weed.name), dimnames=list(state.name, weed.name))
		for (i in seq_len(n.states)) {
			state.int<- unlist(readHTMLTable(doc=state.uri[i], as.data.frame=FALSE)[[1]])
			# substring(., 2) drops the first character of each price
			# (presumably the currency sign) before converting to numeric.
			weed.mat[i,]<- c(as.numeric(substring(state.int[6:8], 2)), as.numeric(state.int[10:12]))
			# A short or re-arranged table gives NA rather than an error;
			# name the affected state instead of failing silently.
			if (any(is.na(weed.mat[i,]))) {
				warning("Could not parse every price field for ", state.name[i], call.=FALSE)
			}
			Sys.sleep(sleep)
		}
		weed.prices.df<- as.data.frame(weed.mat)
		names(weed.prices.df)<- c("High Quality","Med. Quality","Low Quality","HQ.n","MQ.n","LQ.n")
		weed.prices.df
	}


	# weed.grab() only returns the scraped table; fetch it here unless
	# weed.prices.df is already present in the workspace.
	if (!exists("weed.prices.df")) {
		weed.prices.df <- weed.grab()
	}

	# Long format for plotting (one row per state/quality pair), built with
	# base R so melt() does not have to be attached.
	price.cols <- weed.prices.df[, 1:3]
	weed.gg <- data.frame(
		Quality = factor(rep(names(price.cols), each = nrow(price.cols)), levels = names(price.cols)),
		Price = unlist(price.cols, use.names = FALSE)
	)
	# ggtitle() is used instead of opts(title = ...), which current ggplot2
	# releases no longer provide.
	overall.den.plot <- qplot(Price, data = weed.gg, colour = Quality, geom = "density") +
		ggtitle("Distribution of Marijuana Prices \n Across United States")

	# The reported quality grades are not very trustworthy, and medium and
	# high quality can plausibly be binned into one result, so take a
	# report-weighted average of the medium and high quality prices.
	subset.weed <- weed.prices.df[, c(1, 4, 2, 5)]
	weighted <- data.frame(row.names = state.name)
	# Columns of subset.weed: 1 = HQ price, 2 = HQ reports, 3 = MQ price,
	# 4 = MQ reports.  The multiplications had been lost from this line,
	# which left it unparseable.
	weighted$Average <- (subset.weed[, 1] * subset.weed[, 2] + subset.weed[, 3] * subset.weed[, 4]) /
		(subset.weed[, 2] + subset.weed[, 4])
	weighted$Reports <- (subset.weed[, 2] + subset.weed[, 4])
	weighted$Region <- state.region
	# The average mixes high and medium quality, so the title says so (it
	# used to read "High/Low").
	state.hist.plot <- qplot(Average, data = weighted, geom = "histogram", binwidth = 20) +
		ggtitle("Weighted Average of High/Medium Quality")
	region.dist.plot <- qplot(Average, data = weighted, geom = "density") +
		facet_wrap(~ Region) +
		ggtitle("Price Distribution by Region")

	# State-level covariates from local CSV files (your file paths will vary).
	# No online table is linked because most carry lots of extraneous columns.
	# NOTE(review): values are attached by position, so both files are assumed
	# to list the states in state.name order -- confirm.
	density <- read.csv(file = "~/Downloads/statepopdens.csv", as.is = TRUE)
	income <- read.csv(file = "~/Downloads/med-income.csv", as.is = TRUE)
	# A missing column (NULL) or one whose length divides the row count would
	# be assigned without complaint, so check the sizes first.
	stopifnot(
		length(density$Density) == nrow(weighted),
		length(income$Income) == nrow(weighted)
	)
	weighted$Density <- density$Density
	weighted$Income <- income$Income

	# OLS fit of the weighted price on income, log density and region.
	# log(Density) because density was a bit overdispersed; income stays in
	# levels because median income has a pretty tight distribution.
	region.lm <- lm(
		Average ~ Income + log(Density) + Region,
		data = weighted
	)

	# Identical specification with sqrt(Reports) as case weights.
	# Do not do this at home.  The sample size is of course not the inverse
	# of variance!
	region.wls <- update(region.lm, weights = sqrt(Reports))