Last active
July 20, 2018 22:10
-
-
Save shawngraham/a2194ce300b3c5634ce8b7b546cf3060 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library("tabulizer")

# The PDF can be a local file, or it can live on the web.
# This walkthrough uses:
# Lane, K., Pomeroy, E. & Davila, M. (2018). Over Rock and Under Stone:
# Carved Rocks and Subterranean Burials at Kipia, Ancash, AD 1000 – 1532.
# Open Archaeology, 4(1), pp. 299-321. Retrieved 20 Jul. 2018, from
# doi:10.1515/opar-2018-0018
location <- 'https://www.degruyter.com/downloadpdf/j/opar.2018.4.issue-1/opar-2018-0018/opar-2018-0018.pdf'

# Next, grab the data in Table 1, which is on page 6 of the PDF.
# The call below opens page 6 in the viewer pane on the right side of
# RStudio. Click and drag a box over the table to extract it. Be careful
# to select the table, and only the table.
data <- extract_areas(location, 6)

# Examine the first bit of what was extracted.
head(data)

# Keep an untouched copy in case a later step goes wrong.
orig_data <- data

# Clean-up: at this point `data` is a list holding text, not a data frame
# holding numbers.

# Save the header (first row of the first extracted area).
h <- data[[1]][1, ]

# Remove the header rows from the table itself.
# NOTE(review): rows 1 AND 2 are dropped while only row 1 is kept as the
# header -- presumably the header spans two rows in this table; confirm
# against what head(data) printed above.
data[[1]] <- data[[1]][-c(1, 2), ]

# Examine the data again. Notice anything different from the last time
# head() was run?
head(data)

# Bind the extracted pieces into a data frame and use `h` as the column names.
datadf <- setNames(
  as.data.frame(do.call("rbind", data), stringsAsFactors = FALSE),
  h
)
str(datadf)
View(datadf)
# ta da!
## Now let's plot something
library("ggplot2")

# Every column in `datadf` is character, so values must be converted to
# numeric before they can be plotted as numbers.

# First, look in the C14 column for the first run of three digits in each cell.
# `years` holds the match positions (-1 where a cell has no such run).
years <- regexpr("[[:digit:]]{3}", datadf[["C14"]])

# regmatches() returns ONLY the cells that matched, so assigning its result
# straight into a column fails with a row-count mismatch as soon as one cell
# has no three-digit run. Fill an NA vector at the matched positions instead,
# so that unmatched rows simply stay NA.
has_year <- !is.na(years) & years != -1L
year_values <- rep(NA_real_, nrow(datadf))
year_values[has_year] <- as.numeric(regmatches(datadf[["C14"]], years))
datadf$years <- year_values

# Numeric version of the Pit column with any "%" sign stripped.
# NOTE(review): this column is not used by the plot below; cells that are not
# purely numeric after stripping "%" become NA (with a coercion warning).
datadf$RegPerc <- as.numeric(gsub("%", "", datadf$Pit))

# Now make a scatter plot. The y axis uses the numeric `years` column built
# above rather than the raw C14 text, so it is drawn on a continuous scale.
ggplot(datadf, aes(x = Pit, y = years)) +
  geom_point(shape = 1)
# Tabulizer can also try to extract every table it finds automatically.
# If the tables are very clean, adding the `output = "data.frame"` argument
# goes straight to data frames. Otherwise, leave that argument off.
tab1 <- extract_tables(location, output = "data.frame")

# Automatic extraction doesn't always work, so check the output.
str(tab1)

# In this walkthrough the 1st data.frame has only one observation, so that
# table wasn't laid out the way the automatic extraction expected.
# The second data.frame looks much more promising, so take a look at it:
View(tab1[[2]])
# Excellent!

# For more of what this package can do,
# see https://ropensci.org/blog/2017/04/18/tabulizer/
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment