# tts/gist:2867937

## gistfile1.r
################################################################
#
#  Two-by-two checkerboard dentograph of library classifications.
#  Example: Vaski
#
#  Tuija Sonkkila
#  2012-06-04
#
#  based on
#
#  On Dentographs, A New Method of Visualizing Library Collections
#  by William Denton
#
#  http://journal.code4lib.org/articles/6300
#
#  R version 2.12.1 (2010-12-16)
#
#  http://data.kirjastot.fi/data.html
#  CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
################################################################

library(stringr) #v 0.5
library(lattice)

file.location <- "/home/projektit/libdata/books_vaski_ready_tidy_2.csv"

# Load the semicolon-separated export. Its header row and a few sample
# records look like this:
#
# isbn;lang;year;pages;size;c
# NA;fin;1971;NA;NA;26
# NA;fin;1970;NA;NA;25.3
# NA;fin;1962;NA;NA;99.11
v <- read.csv(
  file = file.location,
  header = TRUE,
  sep = ";",
  stringsAsFactors = FALSE
)

# Collapse records that are identical in every column.
v.uniq <- unique(v)

# Carry only the classification code forward, as a one-column data frame
# whose column is named "c".
vaski <- setNames(as.data.frame(v.uniq$c), "c")

# Split the code by position: characters 1-2 go to c1 and characters 4-5
# go to c2. Character 3 is skipped (it is the "." in the sample rows above).
vaski[["c1"]] <- substr(as.character(vaski[["c"]]), start = 1, stop = 2)
vaski[["c2"]] <- substr(as.character(vaski[["c"]]), start = 4, stop = 5)

# The unsplit code has served its purpose; keep every other column.
drop.col <- "c"
vaski <- vaski[setdiff(names(vaski), drop.col)]

# Discard rows where either part contains a letter ...
vaski <- vaski[!(grepl("[a-zA-Z]", vaski$c1) | grepl("[a-zA-Z]", vaski$c2)), ]

# ... and rows where either part is missing.
vaski <- vaski[!is.na(vaski$c1) & !is.na(vaski$c2), ]

# Turn both parts into factors over the complete "00".."99" range so the
# cross-tabulation below is always 100 x 100, empty cells included. Any
# value that is not one of these levels becomes NA.
# NOTE(review): a code with fewer than two decimal digits (such as "26" or
# "25.3" in the sample rows) gives a c2 of "" or "3", which is not a level,
# so that record turns into NA here and is left out of the table. Confirm
# that this is intended.
vaski[] <- lapply(vaski, factor, levels = sprintf("%02d", 0:99))

# Count the records that fall on each (c1, c2) combination.
vaski.table <- table(vaski)

# Render the dentograph into a PNG file in the current working directory.
png("vaski2by2.png")

# Colour ramp running from light grey to purple. colorRampPalette() returns
# a function, which levelplot() accepts directly as col.regions.
# Note: as a global named `palette`, it masks grDevices::palette().
palette <- colorRampPalette(c("#eeeeee", "purple"))

# lattice functions only build a "trellis" object; the drawing happens in
# its print method. At top level that method is called automatically in an
# interactive session or under Rscript, but not when the file is run with
# source(), which would leave the PNG empty. Hence the explicit print().
print(
  levelplot(vaski.table,
    col.regions = palette,
    xlab = "Hundreds and tens",
    ylab = "Two decimals",
    main = "Two-by-two dentograph of Vaski library classification",
    # Tick every tenth level, labelled "00", "10", ..., "90". Given as a
    # plain list (no x/y sub-lists), the same ticks apply to both axes.
    scales = list(
      at = seq(1, 100, by = 10),
      labels = paste(seq(0, 9), "0", sep = "")
    ),
    panel = function(...) {
      panel.levelplot(...)
      # Dashed guide lines every ten levels give the checkerboard look.
      grid.at <- seq(11, 99, by = 10)
      panel.abline(h = grid.at, lty = "dashed", col = "light grey")
      panel.abline(v = grid.at, lty = "dashed", col = "light grey")
    }
  )
)

# Close the device so the file is written out.
dev.off()

# Where are the most items? The answer is given as (row, col) positions in
# the table; with levels "00".."99" on both axes, position i is level i - 1.
# print() is explicit so the result is also shown when the file is run with
# source(), which does not echo top-level values by default.
print(which(vaski.table == max(vaski.table), arr.ind = TRUE))
# 85, 32

# And how many items are there in that cell?
print(max(vaski.table))
# 18834
	################################################################
	#
	# Two-by-two checkerboard dentograph of library classifications.
	# Example: Vaski
	#
	# Tuija Sonkkila
	# 2012-06-04
	#
	# based on
	#
	# On Dentographs, A New Method of Visualizing Library Collections
	# by William Denton
	#
	# http://journal.code4lib.org/articles/6300
	#
	# R version 2.12.1 (2010-12-16)
	#
	# http://data.kirjastot.fi/data.html
	# CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
	################################################################

	library(stringr) #v 0.5
	library(lattice)

	file.location <- "/home/projektit/libdata/books_vaski_ready_tidy_2.csv"

	# Load the semicolon-separated export. Its header row and a few sample
	# records look like this:
	#
	# isbn;lang;year;pages;size;c
	# NA;fin;1971;NA;NA;26
	# NA;fin;1970;NA;NA;25.3
	# NA;fin;1962;NA;NA;99.11
	v <- read.csv(
	  file = file.location,
	  header = TRUE,
	  sep = ";",
	  stringsAsFactors = FALSE
	)

	# Collapse records that are identical in every column.
	v.uniq <- unique(v)

	# Carry only the classification code forward, as a one-column data frame
	# whose column is named "c".
	vaski <- setNames(as.data.frame(v.uniq$c), "c")

	# Split the code by position: characters 1-2 go to c1 and characters 4-5
	# go to c2. Character 3 is skipped (it is the "." in the sample rows above).
	vaski[["c1"]] <- substr(as.character(vaski[["c"]]), start = 1, stop = 2)
	vaski[["c2"]] <- substr(as.character(vaski[["c"]]), start = 4, stop = 5)

	# The unsplit code has served its purpose; keep every other column.
	drop.col <- "c"
	vaski <- vaski[setdiff(names(vaski), drop.col)]

	# Discard rows where either part contains a letter ...
	vaski <- vaski[!(grepl("[a-zA-Z]", vaski$c1) | grepl("[a-zA-Z]", vaski$c2)), ]

	# ... and rows where either part is missing.
	vaski <- vaski[!is.na(vaski$c1) & !is.na(vaski$c2), ]

	# Turn both parts into factors over the complete "00".."99" range so the
	# cross-tabulation below is always 100 x 100, empty cells included. Any
	# value that is not one of these levels becomes NA.
	# NOTE(review): a code with fewer than two decimal digits (such as "26" or
	# "25.3" in the sample rows) gives a c2 of "" or "3", which is not a level,
	# so that record turns into NA here and is left out of the table. Confirm
	# that this is intended.
	vaski[] <- lapply(vaski, factor, levels = sprintf("%02d", 0:99))

	# Count the records that fall on each (c1, c2) combination.
	vaski.table <- table(vaski)

	# Render the dentograph into a PNG file in the current working directory.
	png("vaski2by2.png")

	# Colour ramp running from light grey to purple. colorRampPalette() returns
	# a function, which levelplot() accepts directly as col.regions.
	# Note: as a global named `palette`, it masks grDevices::palette().
	palette <- colorRampPalette(c("#eeeeee", "purple"))

	# lattice functions only build a "trellis" object; the drawing happens in
	# its print method. At top level that method is called automatically in an
	# interactive session or under Rscript, but not when the file is run with
	# source(), which would leave the PNG empty. Hence the explicit print().
	print(
	  levelplot(vaski.table,
	    col.regions = palette,
	    xlab = "Hundreds and tens",
	    ylab = "Two decimals",
	    main = "Two-by-two dentograph of Vaski library classification",
	    # Tick every tenth level, labelled "00", "10", ..., "90". Given as a
	    # plain list (no x/y sub-lists), the same ticks apply to both axes.
	    scales = list(
	      at = seq(1, 100, by = 10),
	      labels = paste(seq(0, 9), "0", sep = "")
	    ),
	    panel = function(...) {
	      panel.levelplot(...)
	      # Dashed guide lines every ten levels give the checkerboard look.
	      grid.at <- seq(11, 99, by = 10)
	      panel.abline(h = grid.at, lty = "dashed", col = "light grey")
	      panel.abline(v = grid.at, lty = "dashed", col = "light grey")
	    }
	  )
	)

	# Close the device so the file is written out.
	dev.off()

	# Where are the most items? The answer is given as (row, col) positions in
	# the table; with levels "00".."99" on both axes, position i is level i - 1.
	# print() is explicit so the result is also shown when the file is run with
	# source(), which does not echo top-level values by default.
	print(which(vaski.table == max(vaski.table), arr.ind = TRUE))
	# 85, 32

	# And how many items are there in that cell?
	print(max(vaski.table))
	# 18834