Last active
December 12, 2015 07:38
-
-
Save tts/4737987 to your computer and use it in GitHub Desktop.
Coauthoring in Aalto University publications, case some professors.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
############################################################################### | |
# | |
# Coauthoring in Aalto University publications, case professors. | |
# | |
# Exporting .graphml from R, visualizing in Gephi. | |
# | |
# Data: data.aalto.fi | |
# | |
# Tuija Sonkkila 11.2.2013 | |
# 12.6.2013 (added note about querying a local RDF file, downloaded from data.aalto.fi) | |
# | |
# Most of the code is adapted from | |
# http://linkedscience.org/tools/sparql-package-for-r/ | |
# sparql-package-for-r-gephi-movie-star-graph-visualization-tutorial/ | |
# | |
# Comments in quotes are from the above URL as well. I have just replaced | |
# actor->author and movie->title | |
# | |
# Plot parameter hints in R see e.g. | |
# http://rdatamining.wordpress.com/2012/05/17/ | |
# an-example-of-social-network-analysis-with-r-using-package-igraph/ | |
# | |
######################################################################## | |
library(SPARQL) | |
library(igraph) | |
library(network) | |
library(ergm) | |
endpoint <- "http://data.aalto.fi/sparql" | |
baseq <- "SELECT ?author ?title ?year | |
WHERE { | |
GRAPH <http://data.aalto.fi/id/publications/tkkjulkaisee/> { | |
?publication <http://purl.org/dc/terms/title> ?title ; | |
<http://purl.org/dc/terms/date> ?year ; | |
<http://purl.org/ontology/bibo/authorList> ?list . | |
?list <http://www.w3.org/1999/02/22-rdf-syntax-ns#rest> */ | |
<http://www.w3.org/1999/02/22-rdf-syntax-ns#first> ?member . | |
?member <http://xmlns.com/foaf/0.1/name> ?author . | |
" | |
########################################################################################## | |
# | |
# 12.6.2013 | |
# | |
# If you have downloaded all publication data as RDF/XML to the working directory | |
# (which makes sense because at the moment, the content is not updated), then: | |
# | |
# library(rrdf) | |
# | |
# m1 = load.rdf("publ.rdf") | |
# summarize.rdf(m1) | |
# #"Number of triples: 1076965" | |
# | |
# # This makes a matrix. The query is of course the same | |
# all <- sparql.rdf(m1, "SELECT ... ") | |
# | |
# alldf <- as.data.frame(all, stringsAsFactors=FALSE) | |
# | |
# and continue with this data frame as below with 'all' | |
# | |
########################################################################################### | |
# Querying in chunks so as not to overload the service | |
q1 <- paste(baseq, "FILTER (regex(?year, '19.*'))}}", sep = "") | |
q2 <- paste(baseq, "FILTER (regex(?year, '200[012345]'))}}", sep = "") | |
q3 <- paste(baseq, "FILTER (regex(?year, '200[6789]'))}}", sep = "") | |
q4 <- paste(baseq, "FILTER (regex(?year, '201[012]'))}}", sep = "") | |
all.19 <- SPARQL(url = endpoint, q1)$results | |
all.20a <- SPARQL(url = endpoint, q2)$results | |
all.20b <- SPARQL(url = endpoint, q3)$results | |
all.20c <- SPARQL(url = endpoint, q4)$results | |
all <- rbind(all.19, all.20a, all.20b, all.20c) | |
#################################### | |
# | |
# Selection of prominent professors | |
# from different fields of STM | |
# | |
#################################### | |
temp <- all[all$author == 'Hari, Riitta', ] | |
temp2 <- all[all$author == 'Smeds, Riitta', ] | |
temp3 <- all[all$author == 'Karppinen, Maarit', ] | |
temp4 <- all[all$author == 'Varis, Olli', ] | |
temp5 <- all[all$author == 'Hallikainen, Martti', ] | |
hari.u <- unique(temp[ ,c("title")]) # some duplicates | |
smeds.u <- unique(temp2[, c("title")]) | |
karppinen.u <- unique(temp3[, c("title")]) | |
varis.u <- unique(temp4[, c("title")]) | |
hallikainen.u <- unique(temp5[, c("title")]) | |
# All authors, titles, and years of publications where she/he is an author | |
hari <- all[all$title %in% hari.u, ] | |
smeds <- all[all$title %in% smeds.u, ] | |
karppinen <- all[all$title %in% karppinen.u, ] | |
varis <- all[all$title %in% varis.u, ] | |
hallikainen <- all[all$title %in% hallikainen.u, ] | |
########################################################### | |
# | |
# Function coauth | |
# | |
# Representing coauthor relations | |
# | |
########################################################### | |
coauth <- function(df, col1, col2) { | |
author <- col1 | |
title <- col2 | |
# "We want a matrix from authors to titles, containing | |
# 1 when a person is among the authors of the publication, and | |
# 0 whe he does not" | |
author_title_matrix <- as.matrix(ifelse(table(df$author, df$title) > 0, 1, 0)) | |
# "The first step is to construct a adjacency graph of coauthoring relations. | |
# We can accomplish this by taking the author-title matrix, | |
# transposing it to a title-author matrix and then multiplying it with itself. | |
# This way we can traverse from authors to titles and then from | |
# titles to authors again, producing author-author relations." | |
coauthor_matrix <- author_title_matrix %*% t(author_title_matrix) | |
# "We are not interested in reflexive links, because it is obvious | |
# that everybody coauthors with himself. | |
# So let’s remove the numbers on the diagonal of the author-author matrix." | |
# | |
# This is VERY memory-intensive with big matrixes - not this one though. | |
# If in trouble, remove all unnecessary objects, and run gc() | |
diag(coauthor_matrix) <- 0 | |
# "We are dealing with an adjacency (or transition) matrix, | |
# which is a 1-mode graph" | |
a_a <- graph.adjacency(coauthor_matrix,weighted = TRUE, mode = "undirected") | |
# "Computing some SNA properties, like betweenness, | |
# edge betweenness and eigenvector centrality (Google PageRank is a kind of eigenvector centrality), | |
# and assign them to the nodes with a suitably named attributes." | |
V(a_a)$betweenness <- betweenness(a_a, directed = FALSE, normalized = TRUE) | |
E(a_a)$betweenness <- edge.betweenness(a_a, directed = FALSE) | |
V(a_a)$centrality <- evcent(a_a, weights = V(a_a)$weight)$vector | |
# "Let’s calculate modularity using Leading Eigenvector communities. | |
# We store the number of the community each vertex belongs to in the vertex attribute “lec_community”. | |
# Then we can use this attribute in Gephi to color the nodes." | |
V(a_a)$lec_community <- as.character(leading.eigenvector.community(a_a)$membership) | |
E(a_a)$Weight <- E(a_a)$weight | |
# Gephi doesn't handle non-ASCII so some very rough and dirty cleaning below. | |
# Note that some visible names may need to be manually edited in the Gephi | |
# Data Laboratory view. | |
# | |
# EDIT 13.2.2013: seems that the culprit was my Linux installation, not Gephi. | |
# In Windows7 environment, UTF-8 works fine all the way through, so there is no need | |
# to venture into these ugly character modifications below. | |
V(a_a)$name <- sub("ö", "o", V(a_a)$name) | |
V(a_a)$name <- sub("ä", "a", V(a_a)$name) | |
V(a_a)$name <- sub("é", "e", V(a_a)$name) | |
V(a_a)$name <- iconv(V(a_a)$name, to = 'ASCII', sub = "") | |
V(a_a)$Label <- V(a_a)$name | |
f <- paste("coauthoring_aalto_", deparse(substitute(df)), ".graphml", sep = "") | |
write.graph(a_a, file = f, format = "graphml") | |
} | |
coauth(hari, c("author"), c("title")) | |
coauth(smeds, c("author"), c("title")) | |
coauth(karppinen, c("author"), c("title")) | |
coauth(varis, c("author"), c("title")) | |
coauth(hallikainen, c("author"), c("title")) | |
######################################## | |
# | |
# Author statistics | |
# | |
######################################## | |
library(plyr) | |
# Number of authors per publication | |
agg <- ddply(all, .(year, title), summarise, n = length(author)) | |
# Mean number of authors (per publication) per year | |
agg2 <- ddply(agg, .(year), summarise, mean = round(mean(n), digits = 1)) | |
agg2 <- agg2[!is.na(agg2$year), ] | |
###################### | |
# | |
# Case professors | |
# | |
###################### | |
hari.agg <- ddply(hari, .(year, title), summarise, n = length(author)) | |
hari.agg2 <- ddply(hari.agg, .(year), summarise, mean = round(mean(n), digits = 1)) | |
smeds.agg <- ddply(smeds, .(year, title), summarise, n = length(author)) | |
smeds.agg2 <- ddply(smeds.agg, .(year), summarise, mean = round(mean(n), digits = 1)) | |
karppinen.agg <- ddply(karppinen, .(year, title), summarise, n = length(author)) | |
karppinen.agg2 <- ddply(karppinen.agg, .(year), summarise, mean = round(mean(n), digits = 1)) | |
varis.agg <- ddply(varis, .(year, title), summarise, n = length(author)) | |
varis.agg2 <- ddply(varis.agg, .(year), summarise, mean = round(mean(n), digits = 1)) | |
hallikainen.agg <- ddply(hallikainen, .(year, title), summarise, n = length(author)) | |
hallikainen.agg2 <- ddply(hallikainen.agg, .(year), summarise, mean = round(mean(n), digits = 1)) | |
png("aalto.auth.publ.png", width = 1024, height = 768, res = 72) | |
plot(agg2, | |
main = "Mean number of authors per publication", | |
xlab = "Year", | |
ylab = "Authors", | |
col = "magenta", | |
ylim = c(1, 11), | |
type = "l", | |
lwd = 1, | |
lty = "dashed") | |
lines(x = hallikainen.agg2$year, | |
y = hallikainen.agg2$mean, | |
col = "red", | |
lwd = 2, | |
lty = "solid") | |
lines(x = hari.agg2$year, | |
y = hari.agg2$mean, | |
col = "blue", | |
lwd = 2, | |
lty = "solid") | |
lines(x = karppinen.agg2$year, | |
y = karppinen.agg2$mean, | |
col = "grey", | |
lwd = 2, | |
lty = "solid") | |
lines(x = smeds.agg2$year, | |
y = smeds.agg2$mean, | |
col = "darkolivegreen", | |
lwd = 2, | |
lty = "solid") | |
lines(x = varis.agg2$year, | |
y = varis.agg2$mean, | |
col = "black", | |
lwd = 2, | |
lty = "solid") | |
legend("topleft", inset = c(.05,.08), | |
c("Aalto Univ", "Hallikainen", "Hari", "Karppinen", "Smeds", "Varis"), | |
lwd = c(1, 2, 2, 2, 2, 2), | |
lty = c("dashed", "solid", "solid", "solid", "solid", "solid"), | |
col = c("magenta", "red", "blue", "grey", "darkolivegreen", "black")) | |
dev.off() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment