Skip to content

Instantly share code, notes, and snippets.

@ColCarroll
Last active December 18, 2015 15:58
Show Gist options
  • Save ColCarroll/5807955 to your computer and use it in GitHub Desktop.
Save ColCarroll/5807955 to your computer and use it in GitHub Desktop.
A demonstration of pulling data from a Wikipedia table and cleaning up one specific table, the list of highest mountains.
# Cell converter for readHTMLTable(): returns the text of `node` as an integer
# when it represents a whole number (thousands separators such as "8,848" are
# tolerated), and as the untouched string otherwise.
#
# Args:
#   node: An XML node; its text content is read with xmlValue().
#
# Returns:
#   A length-one integer when the text is a whole number that fits in an
#   integer, otherwise the original text exactly as xmlValue() returned it.
tryAsInteger <- function(node) {
  val <- xmlValue(node)
  # Anything that is not a single string cannot be tested with `if` below.
  if (length(val) != 1L) {
    return(val)
  }
  # Parse as a double first: as.integer() would silently truncate "8.5" to 8,
  # and suppressWarnings() avoids one coercion warning per non-numeric cell.
  num <- suppressWarnings(as.numeric(gsub(",", "", val, fixed = TRUE)))
  is_whole <- !is.na(num) &&
    num == trunc(num) &&
    abs(num) <= .Machine$integer.max
  if (is_whole) {
    as.integer(num)
  } else {
    val
  }
}
# Read every HTML table on a web page and pick out the one with the most rows.
#
# Args:
#   theurl: Location of the page to parse. Defaults to Wikipedia's list of
#     highest mountains.
#
# Returns:
#   Invisibly, the table with the most rows (the last one, if several tie).
#
# Side effects (kept for interactive use): assigns three variables in the
# enclosing/global environment:
#   tables   - all tables that could be read (NULL entries removed)
#   id       - indices into `tables`, ordered by ascending row count
#   bigTable - the table with the most rows
scrapeTable <- function(theurl = "http://en.wikipedia.org/wiki/List_of_highest_mountains"){
  # library() stops with a clear error when XML is not installed; require()
  # would only return FALSE and fail later with "could not find function".
  library(XML)
  # NOTE(review): htmlParse() retrieves the URL itself; confirm it can still
  # reach the page if the server redirects http to https.
  html <- htmlParse(theurl)
  parsed <- readHTMLTable(html, stringsAsFactors = FALSE, elFun = tryAsInteger)
  parsed <- Filter(Negate(is.null), parsed)
  if (length(parsed) == 0L) {
    stop("No HTML tables could be read from ", theurl, call. = FALSE)
  }
  rowCounts <- vapply(parsed, NROW, integer(1))
  rowOrder <- order(rowCounts)
  biggest <- parsed[[rowOrder[length(rowOrder)]]]
  # Publish the filtered list so that `id` really indexes the global `tables`.
  tables <<- parsed
  id <<- rowOrder
  bigTable <<- biggest
  invisible(biggest)
}
# Tidy the scraped "highest mountains" table: give the columns readable names
# and convert the numeric columns to numbers.
#
# Args:
#   mtnTable: A data frame with exactly ten columns, in the order listed in
#     `colNames` below.
#
# Returns:
#   The same data frame with renamed columns; Rank, Height.meters,
#   Height.feet, Prominence and FirstAscent are numeric. Entries that cannot
#   be parsed as numbers become NA (as.numeric() warns when that happens).
fixMountainTable <- function(mtnTable){
  colNames <- c("Rank", "Mountain", "Height.meters", "Height.feet", "Range",
                "Coordinates", "Prominence", "ParentMountain", "FirstAscent",
                "AscentsBefore2004")
  # Fail early with a readable message: assigning too few names would
  # silently leave NA column names, and too many gives a cryptic error.
  if (!is.data.frame(mtnTable) || ncol(mtnTable) != length(colNames)) {
    stop(sprintf("mtnTable must be a data frame with %d columns",
                 length(colNames)),
         call. = FALSE)
  }
  names(mtnTable) <- colNames
  nums <- c("Rank", "Height.meters", "Height.feet", "Prominence", "FirstAscent")
  # lapply() always returns one element per column, whatever the row count.
  mtnTable[nums] <- lapply(mtnTable[nums], as.numeric)
  mtnTable
}
# Walk through the examples from the meeting: basic data-frame access on the
# `diamonds` data set, then two ggplot2 figures saved as PNG files.
#
# Takes no arguments. Writes "caratsvsprice.png" and "contourplot.png" to the
# working directory, assigns `train.data` (the random sample) and `p` (the
# base plot object of the first figure) in the enclosing/global environment,
# and returns the value of the final dev.off() call.
meetingExamples <- function(){
  # Some useful commands from the meeting today. Many of these are meant to be
  # run interactively: inside a function their results are not printed, so
  # just calling the function won't look that impressive. If you want to just
  # run the function, you can call
  # > source('june18.R')
  # > meetingExamples()

  # library() stops with a clear error if a package is missing; require()
  # would only return FALSE.
  library(scales)
  library(ggplot2) # This has the diamonds dataset. ??diamonds will find it in there

  # Open a 1000x1000 PNG device, draw the plot, and close the device again.
  # The device is also closed if drawing fails, so a failed plot does not
  # leave a stray graphics device open.
  savePng <- function(plotObj, filename) {
    png(filename, width = 1000, height = 1000)
    device <- dev.cur()
    closed <- FALSE
    on.exit(if (!closed) dev.off(device), add = TRUE)
    print(plotObj)
    closed <- TRUE
    dev.off(device)
  }

  head(diamonds) # The first 6 rows of data. Use the optional second argument for more (see ?head)
  summary(diamonds) # Summarize the diamond data set
  diamonds$cut # Three ways to access the "cut" column of the diamonds data
  diamonds[, 2]
  diamonds[, "cut"]
  four_cs <- diamonds[, c("carat", "cut", "color", "clarity")] # creating a new data frame with only 4 columns
  four_cs <- diamonds[, c(1, 2, 3, 4)] # same
  four_cs <- diamonds[, 1:4] # same
  four_cs <- diamonds[, c(-5, -6, -7, -8, -9, -10)] # same. Minus means "not this column"

  nsamples <- 10000
  # Select `nsamples` rows at random from diamonds. `<<-` publishes the sample
  # globally so it can be explored at the console afterwards.
  train.data <<- diamonds[sample(seq_len(nrow(diamonds)), nsamples), ]

  # Below is an example of a plot you might produce, using some methods I showed and some I didn't. Remember
  # that ?method.name is your friend. Online documentation is also very good.

  # Base ggplot2 object. Color is not included here because we do not want a
  # line for each color. `<<-` publishes this base object globally; the layers
  # below are added to a local copy.
  p <<- ggplot(train.data, aes(x = carat, y = price))
  p <- p + geom_point(alpha = I(0.2), aes(color = color), position = 'jitter') # Color aesthetic is added at the geom level
  p <- p + geom_smooth() # Adds a smoothed trend line (see ?geom_smooth for the method used)
  p <- p + facet_grid(cut~clarity) # Split the data by categorical variables
  p <- p + scale_x_log10(name = "Carats", breaks = c(0.5, 1, 2)) # Formatting the x-axis with log scaling
  p <- p + scale_y_continuous(name = "Price", labels = dollar) # Using the "scales" package to format the y-axis
  # You can also set legend attributes.
  # NOTE(review): no size aesthetic is mapped in this plot, so this scale
  # presumably has no visible effect - confirm before relying on it.
  p <- p + scale_size_continuous(name = "Depth (mm)")
  p <- p + ggtitle(sprintf("Carats vs price in a sample of %s diamonds",nsamples)) # String formatting in R
  p <- p + theme_minimal() # Some prebuilt themes to make your plots look great
  savePng(p, "caratsvsprice.png") # This will save the plot

  # Another plot will generate a density plot for the data. It doesn't exactly fit, but it is an interesting
  # way to look at the data and shows off some other geoms. This uses ..level.. as the height, but you could
  # equally well supply a z vector that recorded counts
  p <- ggplot(train.data, aes(x = carat, y = price))
  p <- p + geom_point(position = 'jitter', alpha = I(0.3)) # Underlying data set
  p <- p + geom_density2d(na.rm = TRUE, bins = 20, size = I(0.5), alpha = I(0.5), aes(color = ..level..)) # Minor contours
  p <- p + geom_density2d(na.rm = TRUE, bins = 4, size = I(2), aes(color = ..level..)) # Major contours
  p <- p + scale_x_log10(name = "Carats", breaks = c(0.5, 1, 2))
  p <- p + scale_y_log10(name = "Price", labels = dollar)
  p <- p + scale_color_continuous(name = "Data point density")
  p <- p + ggtitle("Density of carats vs. price")
  savePng(p, "contourplot.png") # This will save the plot
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment