ColCarroll/rReadTable.R

## rReadTable.R
# Convert the text of an XML/HTML node to an integer when possible.
#
# Thousands separators (",") are stripped before conversion, so "8,848"
# becomes 8848L. When the text cannot be read as an integer, the original,
# unmodified text is returned instead, so callers receive either an integer
# or a character value.
#
# Args:
#   node: a node accepted by xmlValue() (XML package).
#
# Returns:
#   An integer when the comma-stripped text converts; otherwise the node's
#   text exactly as returned by xmlValue(). Note that as.integer() truncates
#   decimal strings such as "3.7" toward zero rather than rejecting them.
tryAsInteger <- function(node) {
  val <- xmlValue(node)
  # Non-numeric cells are expected, so the "NAs introduced by coercion"
  # warning is just noise: the NA itself triggers the text fallback below.
  ans <- suppressWarnings(as.integer(gsub(",", "", val)))
  # The length guard keeps if() from failing when the text is zero-length.
  if (length(ans) != 1L || is.na(ans)) {
    val
  } else {
    ans
  }
}

# Parse a web page and keep its largest HTML table.
#
# Args:
#   theurl: location handed to htmlParse(); defaults to the Wikipedia list of
#     highest mountains.
#
# Side effects (assigned with `<<-`, kept so interactive callers still find
# them after the call):
#   tables   - everything readHTMLTable() returned, before NULL entries are
#              dropped.
#   id       - indices of the non-NULL tables ordered by increasing row count.
#   bigTable - the table with the most rows (the last one wins on ties).
#
# Returns:
#   The largest table, invisibly (it is the value of the final assignment).
scrapeTable <- function(theurl = "http://en.wikipedia.org/wiki/List_of_highest_mountains") {
  # library() stops right away when XML is missing, whereas require() would
  # only return FALSE and let the failure surface later in htmlParse().
  library(XML)

  # NOTE(review): the default URL uses plain http; confirm htmlParse() can
  # still fetch it (the site may redirect to https).
  html <- htmlParse(theurl)
  all_tables <- readHTMLTable(html, stringsAsFactors = FALSE, elFun = tryAsInteger)
  tables <<- all_tables

  # Drop NULL entries so every remaining element has dimensions. Filter()
  # also copes with an empty list, which logical subsetting via sapply() did
  # not.
  kept <- Filter(Negate(is.null), all_tables)
  if (length(kept) == 0L) {
    stop("no HTML tables found at ", theurl, call. = FALSE)
  }

  # vapply() guarantees exactly one integer row count per table.
  row_counts <- vapply(kept, nrow, integer(1))
  row_order <- order(row_counts)
  id <<- row_order
  bigTable <<- kept[[row_order[length(row_order)]]]
}

# Give the scraped mountain table readable column names and numeric columns.
#
# Args:
#   mtnTable: data frame with exactly ten columns, in the order listed in
#     `col_names` below.
#
# Returns:
#   mtnTable with its columns renamed; Rank, Height.meters, Height.feet,
#   Prominence and FirstAscent are converted to numeric. Values that cannot
#   be parsed as numbers become NA (as.numeric() warns when that happens).
fixMountainTable <- function(mtnTable) {
  col_names <- c("Rank", "Mountain", "Height.meters", "Height.feet", "Range",
                 "Coordinates", "Prominence", "ParentMountain", "FirstAscent",
                 "AscentsBefore2004")
  # Renaming a table of a different width would either fail obscurely or
  # leave NA column names, so reject it up front with a clear message.
  if (!is.data.frame(mtnTable) || ncol(mtnTable) != length(col_names)) {
    stop("mtnTable must be a data frame with ", length(col_names), " columns",
         call. = FALSE)
  }
  names(mtnTable) <- col_names

  nums <- c("Rank", "Height.meters", "Height.feet", "Prominence", "FirstAscent")
  # Go through as.character() so factor columns yield their labels rather
  # than their internal integer codes. lapply() always returns one vector per
  # column, whatever the number of rows.
  mtnTable[nums] <- lapply(mtnTable[nums], function(column) {
    as.numeric(as.character(column))
  })
  mtnTable
}


# Replay the commands shown at the meeting: basic data-frame inspection of the
# diamonds data followed by two ggplot2 figures saved as PNG files.
#
# Side effects: attaches scales and ggplot2, assigns `train.data` and `p` with
# `<<-`, and writes "caratsvsprice.png" and "contourplot.png" to the working
# directory.
#
# Returns:
#   The value of the final dev.off() call.
meetingExamples <- function() {
  # Some useful commands from the meeting today. Many of these are meant to be
  # run interactively, so just calling the function won't look that impressive.
  # If you want to just run the function, you can call
  # > source('june18.R')
  # > meetingExamples()

  # library() fails immediately when a package is missing; require() would
  # only return FALSE and let a later call fail with a confusing message.
  library(scales)
  library(ggplot2) # This has the diamonds dataset. ??diamonds will find it in there

  # Print a plot into a 1000x1000 PNG file. The device is closed even when
  # printing fails, so a broken plot does not leave a graphics device open.
  save_png <- function(plot, file) {
    png(file, width = 1000, height = 1000)
    tryCatch(print(plot), error = function(e) {
      dev.off()
      stop(e)
    })
    dev.off()
  }

  head(diamonds) # Print the first 6 rows of data. Use optional second argument to print more (see ?head)

  summary(diamonds) # Summarize the diamond data set

  diamonds$cut # Three ways to access the "cut" column of the diamonds data
  diamonds[, 2]
  diamonds[, "cut"]

  four_cs <- diamonds[, c("carat", "cut", "color", "clarity")] # creating a new data frame with only 4 columns
  four_cs <- diamonds[, c(1, 2, 3, 4)] # same
  four_cs <- diamonds[, 1:4] # same
  four_cs <- diamonds[, c(-5, -6, -7, -8, -9, -10)] # same. Minus means "not this column"

  nsamples <- 10000
  # Select `nsamples` rows at random from diamonds (kept global for interactive use)
  train.data <<- diamonds[sample(nrow(diamonds), nsamples), ]

  # Below is an example of a plot you might produce, using some methods I showed and some I didn't. Remember
  # that ?method.name is your friend. Online documentation is also very good.

  # Base ggplot2 object. I do not include color here because I do not want a
  # line for each color. `<<-` publishes only this bare base plot; the layers
  # below are added to a local copy.
  p <<- ggplot(train.data, aes(x = carat, y = price))
  p <- p + geom_point(alpha = I(0.2), aes(color = color), position = 'jitter') # Color aesthetic is added at the geom level
  p <- p + geom_smooth() # Adds a smoothed trend line (ggplot2 picks the smoothing method)
  p <- p + facet_grid(cut~clarity) # Split the data by categorical variables
  p <- p + scale_x_log10(name = "Carats", breaks = c(0.5, 1, 2)) # Formatting the x-axis with log scaling
  p <- p + scale_y_continuous(name = "Price", labels = dollar) # Using the "scales" package to format the y-axis
  # You can also set legend attributes (no size aesthetic is mapped in this
  # plot, so this scale is shown for illustration only)
  p <- p + scale_size_continuous(name = "Depth (mm)")
  p <- p + ggtitle(sprintf("Carats vs price in a sample of %s diamonds", nsamples)) # String formatting in R
  p <- p + theme_minimal() # Some prebuilt themes to make your plots look great

  save_png(p, "caratsvsprice.png") # This will save the plot

  # Another plot will generate a density plot for the data. It doesn't exactly fit, but it is an interesting
  # way to look at the data and shows off some other geoms. This uses ..level.. as the height, but you could
  # equally well supply a z vector that recorded counts

  p <- ggplot(train.data, aes(x = carat, y = price))
  p <- p + geom_point(position = 'jitter', alpha = I(0.3)) # Underlying data set
  p <- p + geom_density2d(na.rm = TRUE, bins = 20, size = I(0.5), alpha = I(0.5), aes(color = ..level..)) # Minor contours
  p <- p + geom_density2d(na.rm = TRUE, bins = 4, size = I(2), aes(color = ..level..)) # Major contours
  p <- p + scale_x_log10(name = "Carats", breaks = c(0.5, 1, 2))
  p <- p + scale_y_log10(name = "Price", labels = dollar)
  p <- p + scale_color_continuous(name = "Data point density")
  p <- p + ggtitle("Density of carats vs. price")

  save_png(p, "contourplot.png") # This will save the plot
}
	# Convert the text of an XML/HTML node to an integer when possible.
	#
	# Thousands separators (",") are stripped before conversion, so "8,848"
	# becomes 8848L. When the text cannot be read as an integer, the original,
	# unmodified text is returned instead, so callers receive either an integer
	# or a character value.
	#
	# Args:
	#   node: a node accepted by xmlValue() (XML package).
	#
	# Returns:
	#   An integer when the comma-stripped text converts; otherwise the node's
	#   text exactly as returned by xmlValue(). Note that as.integer() truncates
	#   decimal strings such as "3.7" toward zero rather than rejecting them.
	tryAsInteger <- function(node) {
	  val <- xmlValue(node)
	  # Non-numeric cells are expected, so the "NAs introduced by coercion"
	  # warning is just noise: the NA itself triggers the text fallback below.
	  ans <- suppressWarnings(as.integer(gsub(",", "", val)))
	  # The length guard keeps if() from failing when the text is zero-length.
	  if (length(ans) != 1L || is.na(ans)) {
	    val
	  } else {
	    ans
	  }
	}

	# Parse a web page and keep its largest HTML table.
	#
	# Args:
	#   theurl: location handed to htmlParse(); defaults to the Wikipedia list of
	#     highest mountains.
	#
	# Side effects (assigned with `<<-`, kept so interactive callers still find
	# them after the call):
	#   tables   - everything readHTMLTable() returned, before NULL entries are
	#              dropped.
	#   id       - indices of the non-NULL tables ordered by increasing row count.
	#   bigTable - the table with the most rows (the last one wins on ties).
	#
	# Returns:
	#   The largest table, invisibly (it is the value of the final assignment).
	scrapeTable <- function(theurl = "http://en.wikipedia.org/wiki/List_of_highest_mountains") {
	  # library() stops right away when XML is missing, whereas require() would
	  # only return FALSE and let the failure surface later in htmlParse().
	  library(XML)

	  # NOTE(review): the default URL uses plain http; confirm htmlParse() can
	  # still fetch it (the site may redirect to https).
	  html <- htmlParse(theurl)
	  all_tables <- readHTMLTable(html, stringsAsFactors = FALSE, elFun = tryAsInteger)
	  tables <<- all_tables

	  # Drop NULL entries so every remaining element has dimensions. Filter()
	  # also copes with an empty list, which logical subsetting via sapply() did
	  # not.
	  kept <- Filter(Negate(is.null), all_tables)
	  if (length(kept) == 0L) {
	    stop("no HTML tables found at ", theurl, call. = FALSE)
	  }

	  # vapply() guarantees exactly one integer row count per table.
	  row_counts <- vapply(kept, nrow, integer(1))
	  row_order <- order(row_counts)
	  id <<- row_order
	  bigTable <<- kept[[row_order[length(row_order)]]]
	}

	# Give the scraped mountain table readable column names and numeric columns.
	#
	# Args:
	#   mtnTable: data frame with exactly ten columns, in the order listed in
	#     `col_names` below.
	#
	# Returns:
	#   mtnTable with its columns renamed; Rank, Height.meters, Height.feet,
	#   Prominence and FirstAscent are converted to numeric. Values that cannot
	#   be parsed as numbers become NA (as.numeric() warns when that happens).
	fixMountainTable <- function(mtnTable) {
	  col_names <- c("Rank", "Mountain", "Height.meters", "Height.feet", "Range",
	                 "Coordinates", "Prominence", "ParentMountain", "FirstAscent",
	                 "AscentsBefore2004")
	  # Renaming a table of a different width would either fail obscurely or
	  # leave NA column names, so reject it up front with a clear message.
	  if (!is.data.frame(mtnTable) || ncol(mtnTable) != length(col_names)) {
	    stop("mtnTable must be a data frame with ", length(col_names), " columns",
	         call. = FALSE)
	  }
	  names(mtnTable) <- col_names

	  nums <- c("Rank", "Height.meters", "Height.feet", "Prominence", "FirstAscent")
	  # Go through as.character() so factor columns yield their labels rather
	  # than their internal integer codes. lapply() always returns one vector per
	  # column, whatever the number of rows.
	  mtnTable[nums] <- lapply(mtnTable[nums], function(column) {
	    as.numeric(as.character(column))
	  })
	  mtnTable
	}


	# Replay the commands shown at the meeting: basic data-frame inspection of the
	# diamonds data followed by two ggplot2 figures saved as PNG files.
	#
	# Side effects: attaches scales and ggplot2, assigns `train.data` and `p` with
	# `<<-`, and writes "caratsvsprice.png" and "contourplot.png" to the working
	# directory.
	#
	# Returns:
	#   The value of the final dev.off() call.
	meetingExamples <- function() {
	  # Some useful commands from the meeting today. Many of these are meant to be
	  # run interactively, so just calling the function won't look that impressive.
	  # If you want to just run the function, you can call
	  # > source('june18.R')
	  # > meetingExamples()

	  # library() fails immediately when a package is missing; require() would
	  # only return FALSE and let a later call fail with a confusing message.
	  library(scales)
	  library(ggplot2) # This has the diamonds dataset. ??diamonds will find it in there

	  # Print a plot into a 1000x1000 PNG file. The device is closed even when
	  # printing fails, so a broken plot does not leave a graphics device open.
	  save_png <- function(plot, file) {
	    png(file, width = 1000, height = 1000)
	    tryCatch(print(plot), error = function(e) {
	      dev.off()
	      stop(e)
	    })
	    dev.off()
	  }

	  head(diamonds) # Print the first 6 rows of data. Use optional second argument to print more (see ?head)

	  summary(diamonds) # Summarize the diamond data set

	  diamonds$cut # Three ways to access the "cut" column of the diamonds data
	  diamonds[, 2]
	  diamonds[, "cut"]

	  four_cs <- diamonds[, c("carat", "cut", "color", "clarity")] # creating a new data frame with only 4 columns
	  four_cs <- diamonds[, c(1, 2, 3, 4)] # same
	  four_cs <- diamonds[, 1:4] # same
	  four_cs <- diamonds[, c(-5, -6, -7, -8, -9, -10)] # same. Minus means "not this column"

	  nsamples <- 10000
	  # Select `nsamples` rows at random from diamonds (kept global for interactive use)
	  train.data <<- diamonds[sample(nrow(diamonds), nsamples), ]

	  # Below is an example of a plot you might produce, using some methods I showed and some I didn't. Remember
	  # that ?method.name is your friend. Online documentation is also very good.

	  # Base ggplot2 object. I do not include color here because I do not want a
	  # line for each color. `<<-` publishes only this bare base plot; the layers
	  # below are added to a local copy.
	  p <<- ggplot(train.data, aes(x = carat, y = price))
	  p <- p + geom_point(alpha = I(0.2), aes(color = color), position = 'jitter') # Color aesthetic is added at the geom level
	  p <- p + geom_smooth() # Adds a smoothed trend line (ggplot2 picks the smoothing method)
	  p <- p + facet_grid(cut~clarity) # Split the data by categorical variables
	  p <- p + scale_x_log10(name = "Carats", breaks = c(0.5, 1, 2)) # Formatting the x-axis with log scaling
	  p <- p + scale_y_continuous(name = "Price", labels = dollar) # Using the "scales" package to format the y-axis
	  # You can also set legend attributes (no size aesthetic is mapped in this
	  # plot, so this scale is shown for illustration only)
	  p <- p + scale_size_continuous(name = "Depth (mm)")
	  p <- p + ggtitle(sprintf("Carats vs price in a sample of %s diamonds", nsamples)) # String formatting in R
	  p <- p + theme_minimal() # Some prebuilt themes to make your plots look great

	  save_png(p, "caratsvsprice.png") # This will save the plot

	  # Another plot will generate a density plot for the data. It doesn't exactly fit, but it is an interesting
	  # way to look at the data and shows off some other geoms. This uses ..level.. as the height, but you could
	  # equally well supply a z vector that recorded counts

	  p <- ggplot(train.data, aes(x = carat, y = price))
	  p <- p + geom_point(position = 'jitter', alpha = I(0.3)) # Underlying data set
	  p <- p + geom_density2d(na.rm = TRUE, bins = 20, size = I(0.5), alpha = I(0.5), aes(color = ..level..)) # Minor contours
	  p <- p + geom_density2d(na.rm = TRUE, bins = 4, size = I(2), aes(color = ..level..)) # Major contours
	  p <- p + scale_x_log10(name = "Carats", breaks = c(0.5, 1, 2))
	  p <- p + scale_y_log10(name = "Price", labels = dollar)
	  p <- p + scale_color_continuous(name = "Data point density")
	  p <- p + ggtitle("Density of carats vs. price")

	  save_png(p, "contourplot.png") # This will save the plot
	}