tts/gist:4737987

## gistfile1.r
###############################################################################
#
# Coauthoring in Aalto University publications, case professors.
#
# Exporting .graphml from R, visualizing in Gephi.
#
# Data: data.aalto.fi
#
# Tuija Sonkkila 11.2.2013
#                12.6.2013 (added note about querying a local RDF file, downloaded from data.aalto.fi)
#
# Most of the code is adapted from
# http://linkedscience.org/tools/sparql-package-for-r/
# sparql-package-for-r-gephi-movie-star-graph-visualization-tutorial/
#
# Comments in quotes are from the above URL as well. I have just replaced
# actor->author and movie->title
#
# Plot parameter hints in R see e.g.
# http://rdatamining.wordpress.com/2012/05/17/
# an-example-of-social-network-analysis-with-r-using-package-igraph/
#
########################################################################
library(SPARQL)
library(igraph)
library(network)
library(ergm)

endpoint <- "http://data.aalto.fi/sparql"

baseq <- "SELECT ?author ?title ?year
WHERE {
GRAPH <http://data.aalto.fi/id/publications/tkkjulkaisee/> {
?publication <http://purl.org/dc/terms/title> ?title ;
             <http://purl.org/dc/terms/date> ?year ;
             <http://purl.org/ontology/bibo/authorList> ?list .
?list <http://www.w3.org/1999/02/22-rdf-syntax-ns#rest> */
      <http://www.w3.org/1999/02/22-rdf-syntax-ns#first> ?member .
?member <http://xmlns.com/foaf/0.1/name> ?author .
"

##########################################################################################
#
#  12.6.2013
#
#  If you have downloaded all publication data as RDF/XML to the working directory
#  (which makes sense because at the moment, the content is not updated), then:
#
#  library(rrdf)
#
#  m1 = load.rdf("publ.rdf")
#  summarize.rdf(m1)
#  #"Number of triples: 1076965"
#
#  # This makes a matrix. The query is of course the same
#  all <- sparql.rdf(m1, "SELECT ...  ")
#
#  alldf <- as.data.frame(all, stringsAsFactors=FALSE)
#
#  and continue with this data frame as below with 'all'
#
###########################################################################################

# Querying in chunks so as not to overload the service
q1 <- paste(baseq, "FILTER (regex(?year, '19.*'))}}", sep = "")
q2 <- paste(baseq, "FILTER (regex(?year, '200[012345]'))}}", sep = "")
q3 <- paste(baseq, "FILTER (regex(?year, '200[6789]'))}}", sep = "")
q4 <- paste(baseq, "FILTER (regex(?year, '201[012]'))}}", sep = "")

all.19  <- SPARQL(url = endpoint, q1)$results
all.20a <- SPARQL(url = endpoint, q2)$results
all.20b <- SPARQL(url = endpoint, q3)$results
all.20c <- SPARQL(url = endpoint, q4)$results

all <- rbind(all.19, all.20a, all.20b, all.20c)


####################################
#
# Selection of prominent professors
# from different fields of STM
#
####################################

temp <- all[all$author == 'Hari, Riitta', ]
temp2 <- all[all$author == 'Smeds, Riitta', ]
temp3 <- all[all$author == 'Karppinen, Maarit', ]
temp4 <- all[all$author == 'Varis, Olli', ]
temp5 <- all[all$author == 'Hallikainen, Martti', ]

hari.u <- unique(temp[ ,c("title")]) # some duplicates
smeds.u <- unique(temp2[, c("title")])
karppinen.u <- unique(temp3[, c("title")])
varis.u <- unique(temp4[, c("title")])
hallikainen.u <- unique(temp5[, c("title")])

# All authors, titles, and years of publications where she/he is an author
hari <- all[all$title %in% hari.u, ]
smeds <- all[all$title %in% smeds.u, ]
karppinen <- all[all$title %in% karppinen.u, ]
varis <- all[all$title %in% varis.u, ]
hallikainen <- all[all$title %in% hallikainen.u, ]


###########################################################
#
# Function coauth
#
# Representing coauthor relations
#
###########################################################
coauth <- function(df, col1, col2) {
  author <- col1
  title <- col2

  # "We want a matrix from authors to titles, containing
  # 1 when a person is among the authors of the publication, and
  # 0 whe he does not"
  author_title_matrix <- as.matrix(ifelse(table(df$author, df$title) > 0, 1, 0))

  # "The first step is to construct a adjacency graph of coauthoring relations.
  # We can accomplish this by taking the author-title matrix,
  # transposing it to a title-author matrix and then multiplying it with itself.
  # This way we can traverse from authors to titles and then from
  # titles to authors again, producing author-author relations."
  coauthor_matrix <- author_title_matrix %*% t(author_title_matrix)

  # "We are not interested in reflexive links, because it is obvious
  # that everybody coauthors with himself.
  # So let’s remove the numbers on the diagonal of the author-author matrix."
  #
  # This is VERY memory-intensive with big matrixes - not this one though.
  # If in trouble, remove all unnecessary objects, and run gc()
  diag(coauthor_matrix) <- 0

  # "We are dealing with an adjacency (or transition) matrix,
  # which is a 1-mode graph"
  a_a <- graph.adjacency(coauthor_matrix,weighted = TRUE, mode = "undirected")

  # "Computing some SNA properties, like betweenness,
  # edge betweenness and eigenvector centrality (Google PageRank is a kind of eigenvector centrality),
  # and assign them to the nodes with a suitably named attributes."
  V(a_a)$betweenness <- betweenness(a_a, directed = FALSE, normalized = TRUE)
  E(a_a)$betweenness <- edge.betweenness(a_a, directed = FALSE)
  V(a_a)$centrality <- evcent(a_a, weights = V(a_a)$weight)$vector

  # "Let’s calculate modularity using Leading Eigenvector communities.
  # We store the number of the community each vertex belongs to in the vertex attribute “lec_community”.
  # Then we can use this attribute in Gephi to color the nodes."
  V(a_a)$lec_community <- as.character(leading.eigenvector.community(a_a)$membership)
  E(a_a)$Weight <- E(a_a)$weight

  # Gephi doesn't handle non-ASCII so some very rough and dirty cleaning below.
  # Note that some visible names may need to be manually edited in the Gephi
  # Data Laboratory view.
  #
  # EDIT 13.2.2013: seems that the culprit was my Linux installation, not Gephi.
  # In Windows7 environment, UTF-8 works fine all the way through, so there is no need
  # to venture into these ugly character modifications below.
  V(a_a)$name <- sub("Ã¶", "o", V(a_a)$name)
  V(a_a)$name <- sub("Ã¤", "a", V(a_a)$name)
  V(a_a)$name <- sub("Ã©", "e", V(a_a)$name)
  V(a_a)$name <- iconv(V(a_a)$name, to = 'ASCII', sub = "")
  V(a_a)$Label <- V(a_a)$name

  f <- paste("coauthoring_aalto_", deparse(substitute(df)), ".graphml", sep = "")
  write.graph(a_a, file = f, format = "graphml")
}

coauth(hari, c("author"), c("title"))
coauth(smeds, c("author"), c("title"))
coauth(karppinen, c("author"), c("title"))
coauth(varis, c("author"), c("title"))
coauth(hallikainen, c("author"), c("title"))


########################################
#
# Author statistics
#
########################################
library(plyr)

# Number of authors per publication
agg <- ddply(all, .(year, title), summarise, n = length(author))

# Mean number of authors (per publication) per year
agg2 <- ddply(agg, .(year), summarise, mean = round(mean(n), digits = 1))
agg2 <- agg2[!is.na(agg2$year), ]

######################
#
#  Case professors
#
######################

hari.agg <- ddply(hari, .(year, title), summarise, n = length(author))
hari.agg2 <- ddply(hari.agg, .(year), summarise, mean = round(mean(n), digits = 1))

smeds.agg <- ddply(smeds, .(year, title), summarise, n = length(author))
smeds.agg2 <- ddply(smeds.agg, .(year), summarise, mean = round(mean(n), digits = 1))

karppinen.agg <- ddply(karppinen, .(year, title), summarise, n = length(author))
karppinen.agg2 <- ddply(karppinen.agg, .(year), summarise, mean = round(mean(n), digits = 1))

varis.agg <- ddply(varis, .(year, title), summarise, n = length(author))
varis.agg2 <- ddply(varis.agg, .(year), summarise, mean = round(mean(n), digits = 1))

hallikainen.agg <- ddply(hallikainen, .(year, title), summarise, n = length(author))
hallikainen.agg2 <- ddply(hallikainen.agg, .(year), summarise, mean = round(mean(n), digits = 1))

png("aalto.auth.publ.png", width = 1024, height = 768, res = 72)

plot(agg2,
     main = "Mean number of authors per publication",
     xlab = "Year",
     ylab = "Authors",
     col = "magenta",
     ylim = c(1, 11),
     type = "l",
     lwd = 1,
     lty = "dashed")


lines(x = hallikainen.agg2$year,
      y = hallikainen.agg2$mean,
      col = "red",
      lwd = 2,
      lty = "solid")

lines(x = hari.agg2$year,
      y = hari.agg2$mean,
      col = "blue",
      lwd = 2,
      lty = "solid")

lines(x = karppinen.agg2$year,
      y = karppinen.agg2$mean,
      col = "grey",
      lwd = 2,
      lty = "solid")

lines(x = smeds.agg2$year,
      y = smeds.agg2$mean,
      col = "darkolivegreen",
      lwd = 2,
      lty = "solid")

lines(x = varis.agg2$year,
      y = varis.agg2$mean,
      col = "black",
      lwd = 2,
      lty = "solid")


legend("topleft", inset = c(.05,.08),
       c("Aalto Univ", "Hallikainen", "Hari", "Karppinen", "Smeds",  "Varis"),
       lwd = c(1, 2, 2, 2, 2, 2),
       lty = c("dashed", "solid", "solid", "solid", "solid", "solid"),
       col = c("magenta", "red", "blue", "grey", "darkolivegreen", "black"))

dev.off()
	###############################################################################
	#
	# Coauthoring in Aalto University publications, case professors.
	#
	# Exporting .graphml from R, visualizing in Gephi.
	#
	# Data: data.aalto.fi
	#
	# Tuija Sonkkila 11.2.2013
	# 12.6.2013 (added note about querying a local RDF file, downloaded from data.aalto.fi)
	#
	# Most of the code is adapted from
	# http://linkedscience.org/tools/sparql-package-for-r/
	# sparql-package-for-r-gephi-movie-star-graph-visualization-tutorial/
	#
	# Comments in quotes are from the above URL as well. I have just replaced
	# actor->author and movie->title
	#
	# Plot parameter hints in R see e.g.
	# http://rdatamining.wordpress.com/2012/05/17/
	# an-example-of-social-network-analysis-with-r-using-package-igraph/
	#
	########################################################################
	library(SPARQL)
	library(igraph)
	library(network)
	library(ergm)

	endpoint <- "http://data.aalto.fi/sparql"

	baseq <- "SELECT ?author ?title ?year
	WHERE {
	GRAPH <http://data.aalto.fi/id/publications/tkkjulkaisee/> {
	?publication <http://purl.org/dc/terms/title> ?title ;
	<http://purl.org/dc/terms/date> ?year ;
	<http://purl.org/ontology/bibo/authorList> ?list .
	?list <http://www.w3.org/1999/02/22-rdf-syntax-ns#rest> */
	<http://www.w3.org/1999/02/22-rdf-syntax-ns#first> ?member .
	?member <http://xmlns.com/foaf/0.1/name> ?author .
	"

	##########################################################################################
	#
	# 12.6.2013
	#
	# If you have downloaded all publication data as RDF/XML to the working directory
	# (which makes sense because at the moment, the content is not updated), then:
	#
	# library(rrdf)
	#
	# m1 = load.rdf("publ.rdf")
	# summarize.rdf(m1)
	# #"Number of triples: 1076965"
	#
	# # This makes a matrix. The query is of course the same
	# all <- sparql.rdf(m1, "SELECT ... ")
	#
	# alldf <- as.data.frame(all, stringsAsFactors=FALSE)
	#
	# and continue with this data frame as below with 'all'
	#
	###########################################################################################

	# Querying in chunks so as not to overload the service
	q1 <- paste(baseq, "FILTER (regex(?year, '19.*'))}}", sep = "")
	q2 <- paste(baseq, "FILTER (regex(?year, '200[012345]'))}}", sep = "")
	q3 <- paste(baseq, "FILTER (regex(?year, '200[6789]'))}}", sep = "")
	q4 <- paste(baseq, "FILTER (regex(?year, '201[012]'))}}", sep = "")

	all.19 <- SPARQL(url = endpoint, q1)$results
	all.20a <- SPARQL(url = endpoint, q2)$results
	all.20b <- SPARQL(url = endpoint, q3)$results
	all.20c <- SPARQL(url = endpoint, q4)$results

	all <- rbind(all.19, all.20a, all.20b, all.20c)


	####################################
	#
	# Selection of prominent professors
	# from different fields of STM
	#
	####################################

	temp <- all[all$author == 'Hari, Riitta', ]
	temp2 <- all[all$author == 'Smeds, Riitta', ]
	temp3 <- all[all$author == 'Karppinen, Maarit', ]
	temp4 <- all[all$author == 'Varis, Olli', ]
	temp5 <- all[all$author == 'Hallikainen, Martti', ]

	hari.u <- unique(temp[ ,c("title")]) # some duplicates
	smeds.u <- unique(temp2[, c("title")])
	karppinen.u <- unique(temp3[, c("title")])
	varis.u <- unique(temp4[, c("title")])
	hallikainen.u <- unique(temp5[, c("title")])

	# All authors, titles, and years of publications where she/he is an author
	hari <- all[all$title %in% hari.u, ]
	smeds <- all[all$title %in% smeds.u, ]
	karppinen <- all[all$title %in% karppinen.u, ]
	varis <- all[all$title %in% varis.u, ]
	hallikainen <- all[all$title %in% hallikainen.u, ]


	###########################################################
	#
	# Function coauth
	#
	# Representing coauthor relations
	#
	###########################################################
	coauth <- function(df, col1, col2) {
	author <- col1
	title <- col2

	# "We want a matrix from authors to titles, containing
	# 1 when a person is among the authors of the publication, and
	# 0 whe he does not"
	author_title_matrix <- as.matrix(ifelse(table(df$author, df$title) > 0, 1, 0))

	# "The first step is to construct a adjacency graph of coauthoring relations.
	# We can accomplish this by taking the author-title matrix,
	# transposing it to a title-author matrix and then multiplying it with itself.
	# This way we can traverse from authors to titles and then from
	# titles to authors again, producing author-author relations."
	coauthor_matrix <- author_title_matrix %*% t(author_title_matrix)

	# "We are not interested in reflexive links, because it is obvious
	# that everybody coauthors with himself.
	# So let’s remove the numbers on the diagonal of the author-author matrix."
	#
	# This is VERY memory-intensive with big matrixes - not this one though.
	# If in trouble, remove all unnecessary objects, and run gc()
	diag(coauthor_matrix) <- 0

	# "We are dealing with an adjacency (or transition) matrix,
	# which is a 1-mode graph"
	a_a <- graph.adjacency(coauthor_matrix,weighted = TRUE, mode = "undirected")

	# "Computing some SNA properties, like betweenness,
	# edge betweenness and eigenvector centrality (Google PageRank is a kind of eigenvector centrality),
	# and assign them to the nodes with a suitably named attributes."
	V(a_a)$betweenness <- betweenness(a_a, directed = FALSE, normalized = TRUE)
	E(a_a)$betweenness <- edge.betweenness(a_a, directed = FALSE)
	V(a_a)$centrality <- evcent(a_a, weights = V(a_a)$weight)$vector

	# "Let’s calculate modularity using Leading Eigenvector communities.
	# We store the number of the community each vertex belongs to in the vertex attribute “lec_community”.
	# Then we can use this attribute in Gephi to color the nodes."
	V(a_a)$lec_community <- as.character(leading.eigenvector.community(a_a)$membership)
	E(a_a)$Weight <- E(a_a)$weight

	# Gephi doesn't handle non-ASCII so some very rough and dirty cleaning below.
	# Note that some visible names may need to be manually edited in the Gephi
	# Data Laboratory view.
	#
	# EDIT 13.2.2013: seems that the culprit was my Linux installation, not Gephi.
	# In Windows7 environment, UTF-8 works fine all the way through, so there is no need
	# to venture into these ugly character modifications below.
	V(a_a)$name <- sub("Ã¶", "o", V(a_a)$name)
	V(a_a)$name <- sub("Ã¤", "a", V(a_a)$name)
	V(a_a)$name <- sub("Ã©", "e", V(a_a)$name)
	V(a_a)$name <- iconv(V(a_a)$name, to = 'ASCII', sub = "")
	V(a_a)$Label <- V(a_a)$name

	f <- paste("coauthoring_aalto_", deparse(substitute(df)), ".graphml", sep = "")
	write.graph(a_a, file = f, format = "graphml")
	}

	coauth(hari, c("author"), c("title"))
	coauth(smeds, c("author"), c("title"))
	coauth(karppinen, c("author"), c("title"))
	coauth(varis, c("author"), c("title"))
	coauth(hallikainen, c("author"), c("title"))


	########################################
	#
	# Author statistics
	#
	########################################
	library(plyr)

	# Number of authors per publication
	agg <- ddply(all, .(year, title), summarise, n = length(author))

	# Mean number of authors (per publication) per year
	agg2 <- ddply(agg, .(year), summarise, mean = round(mean(n), digits = 1))
	agg2 <- agg2[!is.na(agg2$year), ]

	######################
	#
	# Case professors
	#
	######################

	hari.agg <- ddply(hari, .(year, title), summarise, n = length(author))
	hari.agg2 <- ddply(hari.agg, .(year), summarise, mean = round(mean(n), digits = 1))

	smeds.agg <- ddply(smeds, .(year, title), summarise, n = length(author))
	smeds.agg2 <- ddply(smeds.agg, .(year), summarise, mean = round(mean(n), digits = 1))

	karppinen.agg <- ddply(karppinen, .(year, title), summarise, n = length(author))
	karppinen.agg2 <- ddply(karppinen.agg, .(year), summarise, mean = round(mean(n), digits = 1))

	varis.agg <- ddply(varis, .(year, title), summarise, n = length(author))
	varis.agg2 <- ddply(varis.agg, .(year), summarise, mean = round(mean(n), digits = 1))

	hallikainen.agg <- ddply(hallikainen, .(year, title), summarise, n = length(author))
	hallikainen.agg2 <- ddply(hallikainen.agg, .(year), summarise, mean = round(mean(n), digits = 1))

	png("aalto.auth.publ.png", width = 1024, height = 768, res = 72)

	plot(agg2,
	main = "Mean number of authors per publication",
	xlab = "Year",
	ylab = "Authors",
	col = "magenta",
	ylim = c(1, 11),
	type = "l",
	lwd = 1,
	lty = "dashed")


	lines(x = hallikainen.agg2$year,
	y = hallikainen.agg2$mean,
	col = "red",
	lwd = 2,
	lty = "solid")

	lines(x = hari.agg2$year,
	y = hari.agg2$mean,
	col = "blue",
	lwd = 2,
	lty = "solid")

	lines(x = karppinen.agg2$year,
	y = karppinen.agg2$mean,
	col = "grey",
	lwd = 2,
	lty = "solid")

	lines(x = smeds.agg2$year,
	y = smeds.agg2$mean,
	col = "darkolivegreen",
	lwd = 2,
	lty = "solid")

	lines(x = varis.agg2$year,
	y = varis.agg2$mean,
	col = "black",
	lwd = 2,
	lty = "solid")


	legend("topleft", inset = c(.05,.08),
	c("Aalto Univ", "Hallikainen", "Hari", "Karppinen", "Smeds", "Varis"),
	lwd = c(1, 2, 2, 2, 2, 2),
	lty = c("dashed", "solid", "solid", "solid", "solid", "solid"),
	col = c("magenta", "red", "blue", "grey", "darkolivegreen", "black"))

	dev.off()