Created
June 11, 2012 12:24
-
-
Save tts/2909853 to your computer and use it in GitHub Desktop.
Library statistics from Excel files and visualizing with Google Motion Chart
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
######################################################## | |
# | |
# Extracting library statistics | |
# from tilastot.kirjastot.fi Excel files | |
# and visualizing them with a Google Motion Chart | |
# | |
# Data: | |
# Attribution-ShareAlike 3.0 Unported (CC BY-SA 3.0) | |
# | |
# Tuija Sonkkila 2012-06-10 | |
######################################################### | |
library(XML) | |
library(utils) | |
library(XLConnect) | |
library(sqldf) | |
library(gtools) | |
library(googleVis) | |
baseurl <- "http://tilastot.kirjastot.fi" | |
url <- "http://tilastot.kirjastot.fi/fi-FI/vuositilastot.aspx" | |
stats.doc <- htmlParse(url) | |
# From the HTML source code, grab the relative URLs of the Excel files, | |
# and save them to a data frame | |
stats <- data.frame(xpathSApply(stats.doc, "//ul/li[contains(a/@href, 'LibraryStat')]/a/@href[contains(., '.xls?')]")) | |
# Rename column | |
names(stats) <- "file" | |
# From the URL, take the year, and save it for later use | |
stats$y <- gsub(".*Report_Y([0-9]+)N.*", "\\1", stats$file) | |
# Prepend the URLs with the base, ie make them absolut | |
stats$file <- paste(baseurl, stats$file, sep = "") | |
# a small subset for debugging | |
# two <- stats[1:2, ] | |
################## | |
# | |
# Download files | |
# | |
################## | |
f <- function(x) { | |
remoteFile <- x[1] | |
year <- x[2] | |
localFile <- paste("temp", year, ".xsl" sep = "") | |
if (file.exists(localFile)) { | |
print(paste("File", localFile, "is already here", sep = " ")) | |
} else { | |
download.file(url = remoteFile, destfile = localFile) | |
} | |
} | |
# go through all file names in the data frame, and apply the f function to each of them | |
apply(stats, 1, f) | |
# debug | |
# apply(two, 1, f) | |
# As its last row, all Excel files have a row that includes cells with sum formulas. | |
# I'd like to skip over this row, but the row number varies. | |
# AFAIK it's not possible with the XLConnect library to calculate the last row number | |
# programmatically, and select all rows before it. | |
# | |
# From the library manual I understood that the default behaviour of the | |
# readWorksheetFromFile function was to ignore formulas and replace these cells with NA. | |
# This did not happen though. So - I deleted manually sum rows from the local files... | |
# | |
# I'm sure there are ways to get this done without manual intervention. | |
# | |
# Disclaimer: I may have misunderstood XLConnect. Maybe the onErrorCell | |
# function needs to be explicitly set? If yes, how? | |
# | |
# Enhancement request for the statistics provider: | |
# put the sum row at the top, and let data rows always start from, say, row 5. | |
############################### | |
# | |
# Import all data to a list | |
# | |
############################### | |
e <- function(x) { | |
tempFile <- x[3] | |
if (file.exists(tempFile)) { | |
readWorksheetFromFile(tempFile, sheet = 1, startRow = 2, endCol = 99) | |
} else { | |
print(paste("File", tempFile, "does not exist", sep = " ")) | |
} | |
} | |
libstats <- apply(stats, 1, e) | |
# debug | |
# libstats <- apply(two, 1, e) | |
# With my baby R skills I didn't know yet | |
# how to append every list element (data frame) | |
# with a year from the sequence of 2011:1999. | |
# Therefore, here one by one :) | |
libstats[[1]]$Vuosi <- "2011" | |
libstats[[2]]$Vuosi <- "2010" | |
libstats[[3]]$Vuosi <- "2009" | |
libstats[[4]]$Vuosi <- "2008" | |
libstats[[5]]$Vuosi <- "2007" | |
libstats[[6]]$Vuosi <- "2006" | |
libstats[[7]]$Vuosi <- "2005" | |
libstats[[8]]$Vuosi <- "2004" | |
libstats[[9]]$Vuosi <- "2003" | |
libstats[[10]]$Vuosi <- "2002" | |
libstats[[11]]$Vuosi <- "2001" | |
libstats[[12]]$Vuosi <- "2000" | |
libstats[[13]]$Vuosi <- "1999" | |
########################### | |
# | |
# Construct a data frame | |
# from the list | |
# | |
########################### | |
# The cbind function throws an error: | |
# "Error in match.names(clabs, names(xi)): | |
# names do not match previous names" | |
# | |
# AFAIK this happens because column names | |
# have not been constant over the years. | |
# | |
# The smartbind function from gtools library | |
# merges all rows by force. | |
# | |
# However, the result is a number of extra columns, ie | |
# all column name variants are there. | |
# | |
# eg. | |
# 'Kokoelmat__Muu_aineisto' (older) | |
# 'Kokoelmat__Muut_aineistot' (newer) | |
# | |
# Data in these columns need curation later on | |
allstats <- do.call(smartbind, libstats) | |
################# | |
# | |
# Data curation | |
# | |
################# | |
# First, tidy long column names for the update | |
# | |
# To refer to absolute column numbers cannot be wise. | |
# Maybe I could live with the original names after all... | |
names(allstats)[30] <- "KokoelmatMuutAineistot" | |
names(allstats)[46] <- "HankinnatMuutAineistot" | |
names(allstats)[63] <- "LainausMuutAineistot" | |
names(allstats)[104] <- "Laani" | |
names(allstats)[105] <- "Tyo" | |
names(allstats)[106] <- "Sq" | |
names(allstats)[107] <- "KokMuu" | |
names(allstats)[108] <- "HankMuu" | |
names(allstats)[109] <- "LaiMuu" | |
# Copy values from old column variants to new ones | |
# with the help of the sqldf library. I'm coming | |
# from the SQL world you see :) | |
# | |
# Note that SQLite tables need to be selected | |
# from the main database after update, see | |
# http://code.google.com/p/sqldf/#8._Why_am_I_having_problems_with_update? | |
# | |
# KokMuu -> KokoelmatMuutAineistot | |
allstats <- sqldf(c("update allstats set KokoelmatMuutAineistot = KokMuu where (KokoelmatMuutAineistot is null or KokoelmatMuutAineistot=0) and KokMuu is not null","select * from main.allstats")) | |
# HankMuu -> HankinnatMuutAineistot | |
allstats <- sqldf(c("update allstats set HankinnatMuutAineistot = HankMuu where (HankinnatMuutAineistot is null or HankinnatMuutAineistot=0) and HankMuu is not null","select * from main.allstats")) | |
# LaiMuu -> LainausMuutAineistot | |
allstats <- sqldf(c("update allstats set LainausMuutAineistot = LaiMuu where (LainausMuutAineistot is null or LainausMuutAineistot=0) and LaiMuu is not null","select * from main.allstats")) | |
# Kokoelmat__DVD_levyt -> Kokoelmat__DVD_ja_Blu_ray__levyt | |
allstats <- sqldf(c("update allstats set Kokoelmat__DVD_ja_Blu_ray__levyt = Kokoelmat__DVD_levyt where (Kokoelmat__DVD_ja_Blu_ray__levyt is null or Kokoelmat__DVD_ja_Blu_ray__levyt=0) and Kokoelmat__DVD_levyt is not null","select * from main.allstats")) | |
# Hankinnat__DVD_levyt -> Hankinnat__DVD_ja_Blu_ray__levyt | |
allstats <- sqldf(c("update allstats set Hankinnat__DVD_ja_Blu_ray__levyt = Hankinnat__DVD_levyt where (Hankinnat__DVD_ja_Blu_ray__levyt is null or Hankinnat__DVD_ja_Blu_ray__levyt=0) and Hankinnat__DVD_levyt is not null","select * from main.allstats")) | |
# Lainaus__DVD_levyt -> Lainaus__DVD_ja_Blu_ray__levyt | |
allstats <- sqldf(c("update allstats set Lainaus__DVD_ja_Blu_ray__levyt = Lainaus__DVD_levyt where (Lainaus__DVD_ja_Blu_ray__levyt is null or Lainaus__DVD_ja_Blu_ray__levyt=0) and Lainaus__DVD_levyt is not null","select * from main.allstats")) | |
# Delete all "extra" columns | |
dropcols <- c("Laani", "Tyo", "Sq", "KokMuu", "HankMuu", "LaiMuu", "Kokoelmat__DVD_levyt", "Hankinnat__DVD_levyt", "Lainaus__DVD_levyt", "year") | |
allstats <- allstats[ ,!(names(allstats) %in% dropcols)] | |
# Change the data type of Vuosi from character to Date | |
allstats$Vuosi <- as.Date(allstats$Vuosi, "%Y") | |
allstats$Vuosi <- as.numeric(format(allstats$Vuosi, "%Y")) | |
# Delete rows with 'See'-references | |
allstats <- allstats[grep(" Ks. ", allstats$Kunta, invert = TRUE), ] | |
# Google Motion Charts requires the first columns to be in a specific order | |
# | |
# 1 idvar =Kunta (1) | |
# 2 timevar =Vuosi (100) | |
# 3 bubble colour =Suuralue (94) | |
# 4 x var =Lainaus (48) | |
# 5 y var =Hankinnat (31) | |
# 6 bubble size =Asukasluku (2) | |
# | |
# http://stackoverflow.com/questions/10258970/default-variables-for-a-googlevis-motionchart?rq=1 | |
finalstats <- allstats[c(1, 100, 94, 48, 31, 2, 3:30, 32:47, 49:93, 95:99, 101)] | |
############### | |
# | |
# MotionChart | |
# | |
############### | |
MC <- gvisMotionChart(finalstats, idvar = "Kunta", timevar = "Vuosi", | |
options = list(height = 400, width = 550), | |
chartid = "Kirjastotilastot") | |
plot(MC) | |
# My intention is to include the chart to a Google Blogger blog post. | |
# The chart cannot be embedded there as such though, so I'll follow | |
# the advice given in the URL below, and fetch the whole thing from my Dropbox account | |
# to an iframe element | |
# http://lamages.blogspot.fi/2011/09/including-googlevis-output-into-blogger.html | |
print(MC, file = "libstat.html") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment