Skip to content

Instantly share code, notes, and snippets.

@tts
Created June 11, 2012 12:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tts/2909853 to your computer and use it in GitHub Desktop.
Save tts/2909853 to your computer and use it in GitHub Desktop.
Library statistics from Excel files and visualizing with Google Motion Chart
########################################################
#
# Extracting library statistics
# from tilastot.kirjastot.fi Excel files
# and visualizing them with a Google Motion Chart
#
# Data:
# Attribution-ShareAlike 3.0 Unported (CC BY-SA 3.0)
#
# Tuija Sonkkila 2012-06-10
#########################################################
library(XML)
library(utils)
library(XLConnect)
library(sqldf)
library(gtools)
library(googleVis)
baseurl <- "http://tilastot.kirjastot.fi"
url <- "http://tilastot.kirjastot.fi/fi-FI/vuositilastot.aspx"
stats.doc <- htmlParse(url)
# From the HTML source code, grab the relative URLs of the Excel files,
# and save them to a data frame
stats <- data.frame(xpathSApply(stats.doc, "//ul/li[contains(a/@href, 'LibraryStat')]/a/@href[contains(., '.xls?')]"))
# Rename column
names(stats) <- "file"
# From the URL, take the year, and save it for later use
stats$y <- gsub(".*Report_Y([0-9]+)N.*", "\\1", stats$file)
# Prepend the URLs with the base, ie make them absolut
stats$file <- paste(baseurl, stats$file, sep = "")
# a small subset for debugging
# two <- stats[1:2, ]
##################
#
# Download files
#
##################
f <- function(x) {
remoteFile <- x[1]
year <- x[2]
localFile <- paste("temp", year, ".xsl" sep = "")
if (file.exists(localFile)) {
print(paste("File", localFile, "is already here", sep = " "))
} else {
download.file(url = remoteFile, destfile = localFile)
}
}
# go through all file names in the data frame, and apply the f function to each of them
apply(stats, 1, f)
# debug
# apply(two, 1, f)
# As its last row, all Excel files have a row that includes cells with sum formulas.
# I'd like to skip over this row, but the row number varies.
# AFAIK it's not possible with the XLConnect library to calculate the last row number
# programmatically, and select all rows before it.
#
# From the library manual I understood that the default behaviour of the
# readWorksheetFromFile function was to ignore formulas and replace these cells with NA.
# This did not happen though. So - I deleted manually sum rows from the local files...
#
# I'm sure there are ways to get this done without manual intervention.
#
# Disclaimer: I may have misunderstood XLConnect. Maybe the onErrorCell
# function needs to be explicitly set? If yes, how?
#
# Enhancement request for the statistics provider:
# put the sum row at the top, and let data rows always start from, say, row 5.
###############################
#
# Import all data to a list
#
###############################
e <- function(x) {
tempFile <- x[3]
if (file.exists(tempFile)) {
readWorksheetFromFile(tempFile, sheet = 1, startRow = 2, endCol = 99)
} else {
print(paste("File", tempFile, "does not exist", sep = " "))
}
}
libstats <- apply(stats, 1, e)
# debug
# libstats <- apply(two, 1, e)
# With my baby R skills I didn't know yet
# how to append every list element (data frame)
# with a year from the sequence of 2011:1999.
# Therefore, here one by one :)
libstats[[1]]$Vuosi <- "2011"
libstats[[2]]$Vuosi <- "2010"
libstats[[3]]$Vuosi <- "2009"
libstats[[4]]$Vuosi <- "2008"
libstats[[5]]$Vuosi <- "2007"
libstats[[6]]$Vuosi <- "2006"
libstats[[7]]$Vuosi <- "2005"
libstats[[8]]$Vuosi <- "2004"
libstats[[9]]$Vuosi <- "2003"
libstats[[10]]$Vuosi <- "2002"
libstats[[11]]$Vuosi <- "2001"
libstats[[12]]$Vuosi <- "2000"
libstats[[13]]$Vuosi <- "1999"
###########################
#
# Construct a data frame
# from the list
#
###########################
# The cbind function throws an error:
# "Error in match.names(clabs, names(xi)):
# names do not match previous names"
#
# AFAIK this happens because column names
# have not been constant over the years.
#
# The smartbind function from gtools library
# merges all rows by force.
#
# However, the result is a number of extra columns, ie
# all column name variants are there.
#
# eg.
# 'Kokoelmat__Muu_aineisto' (older)
# 'Kokoelmat__Muut_aineistot' (newer)
#
# Data in these columns need curation later on
allstats <- do.call(smartbind, libstats)
#################
#
# Data curation
#
#################
# First, tidy long column names for the update
#
# To refer to absolute column numbers cannot be wise.
# Maybe I could live with the original names after all...
names(allstats)[30] <- "KokoelmatMuutAineistot"
names(allstats)[46] <- "HankinnatMuutAineistot"
names(allstats)[63] <- "LainausMuutAineistot"
names(allstats)[104] <- "Laani"
names(allstats)[105] <- "Tyo"
names(allstats)[106] <- "Sq"
names(allstats)[107] <- "KokMuu"
names(allstats)[108] <- "HankMuu"
names(allstats)[109] <- "LaiMuu"
# Copy values from old column variants to new ones
# with the help of the sqldf library. I'm coming
# from the SQL world you see :)
#
# Note that SQLite tables need to be selected
# from the main database after update, see
# http://code.google.com/p/sqldf/#8._Why_am_I_having_problems_with_update?
#
# KokMuu -> KokoelmatMuutAineistot
allstats <- sqldf(c("update allstats set KokoelmatMuutAineistot = KokMuu where (KokoelmatMuutAineistot is null or KokoelmatMuutAineistot=0) and KokMuu is not null","select * from main.allstats"))
# HankMuu -> HankinnatMuutAineistot
allstats <- sqldf(c("update allstats set HankinnatMuutAineistot = HankMuu where (HankinnatMuutAineistot is null or HankinnatMuutAineistot=0) and HankMuu is not null","select * from main.allstats"))
# LaiMuu -> LainausMuutAineistot
allstats <- sqldf(c("update allstats set LainausMuutAineistot = LaiMuu where (LainausMuutAineistot is null or LainausMuutAineistot=0) and LaiMuu is not null","select * from main.allstats"))
# Kokoelmat__DVD_levyt -> Kokoelmat__DVD_ja_Blu_ray__levyt
allstats <- sqldf(c("update allstats set Kokoelmat__DVD_ja_Blu_ray__levyt = Kokoelmat__DVD_levyt where (Kokoelmat__DVD_ja_Blu_ray__levyt is null or Kokoelmat__DVD_ja_Blu_ray__levyt=0) and Kokoelmat__DVD_levyt is not null","select * from main.allstats"))
# Hankinnat__DVD_levyt -> Hankinnat__DVD_ja_Blu_ray__levyt
allstats <- sqldf(c("update allstats set Hankinnat__DVD_ja_Blu_ray__levyt = Hankinnat__DVD_levyt where (Hankinnat__DVD_ja_Blu_ray__levyt is null or Hankinnat__DVD_ja_Blu_ray__levyt=0) and Hankinnat__DVD_levyt is not null","select * from main.allstats"))
# Lainaus__DVD_levyt -> Lainaus__DVD_ja_Blu_ray__levyt
allstats <- sqldf(c("update allstats set Lainaus__DVD_ja_Blu_ray__levyt = Lainaus__DVD_levyt where (Lainaus__DVD_ja_Blu_ray__levyt is null or Lainaus__DVD_ja_Blu_ray__levyt=0) and Lainaus__DVD_levyt is not null","select * from main.allstats"))
# Delete all "extra" columns
dropcols <- c("Laani", "Tyo", "Sq", "KokMuu", "HankMuu", "LaiMuu", "Kokoelmat__DVD_levyt", "Hankinnat__DVD_levyt", "Lainaus__DVD_levyt", "year")
allstats <- allstats[ ,!(names(allstats) %in% dropcols)]
# Change the data type of Vuosi from character to Date
allstats$Vuosi <- as.Date(allstats$Vuosi, "%Y")
allstats$Vuosi <- as.numeric(format(allstats$Vuosi, "%Y"))
# Delete rows with 'See'-references
allstats <- allstats[grep(" Ks. ", allstats$Kunta, invert = TRUE), ]
# Google Motion Charts requires the first columns to be in a specific order
#
# 1 idvar =Kunta (1)
# 2 timevar =Vuosi (100)
# 3 bubble colour =Suuralue (94)
# 4 x var =Lainaus (48)
# 5 y var =Hankinnat (31)
# 6 bubble size =Asukasluku (2)
#
# http://stackoverflow.com/questions/10258970/default-variables-for-a-googlevis-motionchart?rq=1
finalstats <- allstats[c(1, 100, 94, 48, 31, 2, 3:30, 32:47, 49:93, 95:99, 101)]
###############
#
# MotionChart
#
###############
MC <- gvisMotionChart(finalstats, idvar = "Kunta", timevar = "Vuosi",
options = list(height = 400, width = 550),
chartid = "Kirjastotilastot")
plot(MC)
# My intention is to include the chart to a Google Blogger blog post.
# The chart cannot be embedded there as such though, so I'll follow
# the advice given in the URL below, and fetch the whole thing from my Dropbox account
# to an iframe element
# http://lamages.blogspot.fi/2011/09/including-googlevis-output-into-blogger.html
print(MC, file = "libstat.html")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment