Skip to content

Instantly share code, notes, and snippets.

@tts
Last active December 12, 2015 07:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tts/4737987 to your computer and use it in GitHub Desktop.
Save tts/4737987 to your computer and use it in GitHub Desktop.
Coauthoring in Aalto University publications, case some professors.
###############################################################################
#
# Coauthoring in Aalto University publications, case professors.
#
# Exporting .graphml from R, visualizing in Gephi.
#
# Data: data.aalto.fi
#
# Tuija Sonkkila 11.2.2013
# 12.6.2013 (added note about querying a local RDF file, downloaded from data.aalto.fi)
#
# Most of the code is adapted from
# http://linkedscience.org/tools/sparql-package-for-r/
# sparql-package-for-r-gephi-movie-star-graph-visualization-tutorial/
#
# Comments in quotes are from the above URL as well. I have just replaced
# actor->author and movie->title
#
# Plot parameter hints in R see e.g.
# http://rdatamining.wordpress.com/2012/05/17/
# an-example-of-social-network-analysis-with-r-using-package-igraph/
#
########################################################################
library(SPARQL)
library(igraph)
library(network)
library(ergm)
endpoint <- "http://data.aalto.fi/sparql"
baseq <- "SELECT ?author ?title ?year
WHERE {
GRAPH <http://data.aalto.fi/id/publications/tkkjulkaisee/> {
?publication <http://purl.org/dc/terms/title> ?title ;
<http://purl.org/dc/terms/date> ?year ;
<http://purl.org/ontology/bibo/authorList> ?list .
?list <http://www.w3.org/1999/02/22-rdf-syntax-ns#rest> */
<http://www.w3.org/1999/02/22-rdf-syntax-ns#first> ?member .
?member <http://xmlns.com/foaf/0.1/name> ?author .
"
##########################################################################################
#
# 12.6.2013
#
# If you have downloaded all publication data as RDF/XML to the working directory
# (which makes sense because at the moment, the content is not updated), then:
#
# library(rrdf)
#
# m1 = load.rdf("publ.rdf")
# summarize.rdf(m1)
# #"Number of triples: 1076965"
#
# # This makes a matrix. The query is of course the same
# all <- sparql.rdf(m1, "SELECT ... ")
#
# alldf <- as.data.frame(all, stringsAsFactors=FALSE)
#
# and continue with this data frame as below with 'all'
#
###########################################################################################
# Querying in chunks so as not to overload the service
q1 <- paste(baseq, "FILTER (regex(?year, '19.*'))}}", sep = "")
q2 <- paste(baseq, "FILTER (regex(?year, '200[012345]'))}}", sep = "")
q3 <- paste(baseq, "FILTER (regex(?year, '200[6789]'))}}", sep = "")
q4 <- paste(baseq, "FILTER (regex(?year, '201[012]'))}}", sep = "")
all.19 <- SPARQL(url = endpoint, q1)$results
all.20a <- SPARQL(url = endpoint, q2)$results
all.20b <- SPARQL(url = endpoint, q3)$results
all.20c <- SPARQL(url = endpoint, q4)$results
all <- rbind(all.19, all.20a, all.20b, all.20c)
####################################
#
# Selection of prominent professors
# from different fields of STM
#
####################################
temp <- all[all$author == 'Hari, Riitta', ]
temp2 <- all[all$author == 'Smeds, Riitta', ]
temp3 <- all[all$author == 'Karppinen, Maarit', ]
temp4 <- all[all$author == 'Varis, Olli', ]
temp5 <- all[all$author == 'Hallikainen, Martti', ]
hari.u <- unique(temp[ ,c("title")]) # some duplicates
smeds.u <- unique(temp2[, c("title")])
karppinen.u <- unique(temp3[, c("title")])
varis.u <- unique(temp4[, c("title")])
hallikainen.u <- unique(temp5[, c("title")])
# All authors, titles, and years of publications where she/he is an author
hari <- all[all$title %in% hari.u, ]
smeds <- all[all$title %in% smeds.u, ]
karppinen <- all[all$title %in% karppinen.u, ]
varis <- all[all$title %in% varis.u, ]
hallikainen <- all[all$title %in% hallikainen.u, ]
###########################################################
#
# Function coauth
#
# Representing coauthor relations
#
###########################################################
coauth <- function(df, col1, col2) {
author <- col1
title <- col2
# "We want a matrix from authors to titles, containing
# 1 when a person is among the authors of the publication, and
# 0 whe he does not"
author_title_matrix <- as.matrix(ifelse(table(df$author, df$title) > 0, 1, 0))
# "The first step is to construct a adjacency graph of coauthoring relations.
# We can accomplish this by taking the author-title matrix,
# transposing it to a title-author matrix and then multiplying it with itself.
# This way we can traverse from authors to titles and then from
# titles to authors again, producing author-author relations."
coauthor_matrix <- author_title_matrix %*% t(author_title_matrix)
# "We are not interested in reflexive links, because it is obvious
# that everybody coauthors with himself.
# So let’s remove the numbers on the diagonal of the author-author matrix."
#
# This is VERY memory-intensive with big matrixes - not this one though.
# If in trouble, remove all unnecessary objects, and run gc()
diag(coauthor_matrix) <- 0
# "We are dealing with an adjacency (or transition) matrix,
# which is a 1-mode graph"
a_a <- graph.adjacency(coauthor_matrix,weighted = TRUE, mode = "undirected")
# "Computing some SNA properties, like betweenness,
# edge betweenness and eigenvector centrality (Google PageRank is a kind of eigenvector centrality),
# and assign them to the nodes with a suitably named attributes."
V(a_a)$betweenness <- betweenness(a_a, directed = FALSE, normalized = TRUE)
E(a_a)$betweenness <- edge.betweenness(a_a, directed = FALSE)
V(a_a)$centrality <- evcent(a_a, weights = V(a_a)$weight)$vector
# "Let’s calculate modularity using Leading Eigenvector communities.
# We store the number of the community each vertex belongs to in the vertex attribute “lec_community”.
# Then we can use this attribute in Gephi to color the nodes."
V(a_a)$lec_community <- as.character(leading.eigenvector.community(a_a)$membership)
E(a_a)$Weight <- E(a_a)$weight
# Gephi doesn't handle non-ASCII so some very rough and dirty cleaning below.
# Note that some visible names may need to be manually edited in the Gephi
# Data Laboratory view.
#
# EDIT 13.2.2013: seems that the culprit was my Linux installation, not Gephi.
# In Windows7 environment, UTF-8 works fine all the way through, so there is no need
# to venture into these ugly character modifications below.
V(a_a)$name <- sub("ö", "o", V(a_a)$name)
V(a_a)$name <- sub("ä", "a", V(a_a)$name)
V(a_a)$name <- sub("é", "e", V(a_a)$name)
V(a_a)$name <- iconv(V(a_a)$name, to = 'ASCII', sub = "")
V(a_a)$Label <- V(a_a)$name
f <- paste("coauthoring_aalto_", deparse(substitute(df)), ".graphml", sep = "")
write.graph(a_a, file = f, format = "graphml")
}
coauth(hari, c("author"), c("title"))
coauth(smeds, c("author"), c("title"))
coauth(karppinen, c("author"), c("title"))
coauth(varis, c("author"), c("title"))
coauth(hallikainen, c("author"), c("title"))
########################################
#
# Author statistics
#
########################################
library(plyr)
# Number of authors per publication
agg <- ddply(all, .(year, title), summarise, n = length(author))
# Mean number of authors (per publication) per year
agg2 <- ddply(agg, .(year), summarise, mean = round(mean(n), digits = 1))
agg2 <- agg2[!is.na(agg2$year), ]
######################
#
# Case professors
#
######################
hari.agg <- ddply(hari, .(year, title), summarise, n = length(author))
hari.agg2 <- ddply(hari.agg, .(year), summarise, mean = round(mean(n), digits = 1))
smeds.agg <- ddply(smeds, .(year, title), summarise, n = length(author))
smeds.agg2 <- ddply(smeds.agg, .(year), summarise, mean = round(mean(n), digits = 1))
karppinen.agg <- ddply(karppinen, .(year, title), summarise, n = length(author))
karppinen.agg2 <- ddply(karppinen.agg, .(year), summarise, mean = round(mean(n), digits = 1))
varis.agg <- ddply(varis, .(year, title), summarise, n = length(author))
varis.agg2 <- ddply(varis.agg, .(year), summarise, mean = round(mean(n), digits = 1))
hallikainen.agg <- ddply(hallikainen, .(year, title), summarise, n = length(author))
hallikainen.agg2 <- ddply(hallikainen.agg, .(year), summarise, mean = round(mean(n), digits = 1))
png("aalto.auth.publ.png", width = 1024, height = 768, res = 72)
plot(agg2,
main = "Mean number of authors per publication",
xlab = "Year",
ylab = "Authors",
col = "magenta",
ylim = c(1, 11),
type = "l",
lwd = 1,
lty = "dashed")
lines(x = hallikainen.agg2$year,
y = hallikainen.agg2$mean,
col = "red",
lwd = 2,
lty = "solid")
lines(x = hari.agg2$year,
y = hari.agg2$mean,
col = "blue",
lwd = 2,
lty = "solid")
lines(x = karppinen.agg2$year,
y = karppinen.agg2$mean,
col = "grey",
lwd = 2,
lty = "solid")
lines(x = smeds.agg2$year,
y = smeds.agg2$mean,
col = "darkolivegreen",
lwd = 2,
lty = "solid")
lines(x = varis.agg2$year,
y = varis.agg2$mean,
col = "black",
lwd = 2,
lty = "solid")
legend("topleft", inset = c(.05,.08),
c("Aalto Univ", "Hallikainen", "Hari", "Karppinen", "Smeds", "Varis"),
lwd = c(1, 2, 2, 2, 2, 2),
lty = c("dashed", "solid", "solid", "solid", "solid", "solid"),
col = c("magenta", "red", "blue", "grey", "darkolivegreen", "black"))
dev.off()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment