# Tuija Sonkkila 3.3.2015
#
# Aalto University articles cited in Wikipedia, matched by DOI.
# This code makes node and node attribute files for a network graph visualization.
#
# Wikipedia data CC0:
# Halfaker, Aaron; Taraborelli, Dario (2015): Scholarly article citations in Wikipedia. figshare.
# http://dx.doi.org/10.6084/m9.figshare.1299540
# Retrieved 15:17, Mar 02, 2015 (GMT)
#
# Web of Science data by Thomson Reuters
#
# More on blog post https://blogs.aalto.fi/suoritin/2015/03/03/wikipedia-outreach-by-field-of-science/
library(dplyr)
library(tidyr)
wikidatadoi <- read.delim("doi_and_pubmed_citations.enwiki_20150112.tsv",
                          quote = "",
                          stringsAsFactors = FALSE)
# aalto.doi gathered from local sources, originating from Web of Science
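# NOTE: aalto.doi itself is not built in this script; it is assumed to be
# in the workspace already. A minimal sketch of constructing it from a
# tab-delimited WoS export (the file name below is hypothetical) could be:
#
#   aalto.raw <- read.delim("aalto_wos_export.tsv", quote = "", stringsAsFactors = FALSE)
#   aalto.doi <- data.frame(di = aalto.raw$DI,  # DI = DOI
#                           tc = aalto.raw$TC,  # TC = times cited
#                           ti = aalto.raw$TI,  # TI = document title
#                           wc = aalto.raw$WC,  # WC = WoS categories
#                           stringsAsFactors = FALSE)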
aalto.doi <- aalto.doi %>%
  mutate(id = di) %>%
  select(id, tc, ti, wc)
joined <- inner_join(wikidatadoi, aalto.doi, by = c("id" = "id"))
# Extract first field of science out of possibly many
joined.kw.split <- joined %>%
  extract("wc", "kw", "([^;]+)\\;*")
# Aggregate fields to WoS research areas.
# wosaggr.csv made partly manually out of
# http://images.webofknowledge.com/WOKRS57B4/help/WOS/hp_research_areas_easca.html
wosareas <- read.table("wosaggr.csv", header=FALSE, sep = ";", quote = "", stringsAsFactors = FALSE)
names(wosareas) <- c("field", "agg")
wosareas$field <- tolower(wosareas$field)
# http://r.789695.n4.nabble.com/how-to-transform-string-to-Camel-Case-tp4664222p4664261.html
wosareas$field <- gsub("(^|\\s+)([a-z])","\\1\\U\\2",wosareas$field, perl=TRUE)
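# A quick check of the substitution above (for example):
#   gsub("(^|\\s+)([a-z])", "\\1\\U\\2", "life sciences & biomedicine", perl = TRUE)
# returns "Life Sciences & Biomedicine", so that the field names from
# wosaggr.csv match the capitalisation of the WoS category strings in kw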
all <- left_join(joined.kw.split, wosareas, by=c("kw" = "field"))
by.field <- all %>%
  group_by(kw) %>%
  mutate(Count = n(),
         WoSCitesAvg = round(sum(tc) / n(), digits = 2),
         DOI = id) %>%
  filter(!duplicated(kw)) %>%
  select(kw, agg, Count, WoSCitesAvg)
# Make nodes
nodes <- by.field %>%
  ungroup() %>%
  mutate(Label = kw) %>%
  mutate(Id = seq(1, nrow(by.field), by = 1),
         Type = "Undirected") %>%
  select(Label, Id, Type)
write.csv(nodes, file="nodes.csv", row.names=FALSE)
# Make node attributes
nodes.attr <- by.field %>%
  ungroup() %>%
  mutate(Id = seq(1, nrow(by.field), by = 1),
         Field = kw) %>%
  select(Count, Field, agg, WoSCitesAvg, Id)
# Add US state coordinates to categories
download.file("http://econym.org.uk/gmap/states.xml", destfile = "states.xml", method="curl")
command <- "java -jar ~/saxonee/saxon9ee.jar states.xml parsexml.xsl >states.csv"
system(command)
statecoord <- read.table("states.csv", header = TRUE, sep = ";", stringsAsFactors = FALSE)
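# parsexml.xsl is not included in this gist. If Saxon is not at hand, the same
# state;lat;lon table could be built directly in R with the XML package -- a
# sketch only, assuming states.xml holds one <state name="..."> element per
# state with <point lat="..." lng="..."/> children:
#
#   library(XML)
#   doc <- xmlParse("states.xml")
#   statecoord <- do.call(rbind, lapply(getNodeSet(doc, "//state"), function(s) {
#     data.frame(state = xmlGetAttr(s, "name"),
#                lat   = as.numeric(xpathSApply(s, "point", xmlGetAttr, "lat")),
#                lon   = as.numeric(xpathSApply(s, "point", xmlGetAttr, "lng")),
#                stringsAsFactors = FALSE)
#   }))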
nodes.attr$state <- sapply(nodes.attr$agg, function(x) {
  if (x == 'Life Sciences & Biomedicine') "Washington"
  else if (x == 'Physical Sciences') "Florida"
  else if (x == 'Technology') "Alaska"
  else if (x == 'Arts & Humanities') "North Dakota"
  else "Maine"
})
washington  <- statecoord[statecoord$state == 'Washington', ]
florida     <- statecoord[statecoord$state == 'Florida', ]
maine       <- statecoord[statecoord$state == 'Maine', ]
northdakota <- statecoord[statecoord$state == 'North Dakota', ]
alaska      <- statecoord[statecoord$state == 'Alaska', ]
nodes.attr$latitude <- ""
nodes.attr$longitude <- ""
for ( i in 1:nrow(nodes.attr[nodes.attr$state=="Washington",]) ) {
nodes.attr[nodes.attr$state=="Washington", ]$latitude[i] <- washington$lat[i]
nodes.attr[nodes.attr$state=="Washington", ]$longitude[i] <- washington$lon[i]
}
for ( i in 1:nrow(nodes.attr[nodes.attr$state=="Florida",]) ) {
nodes.attr[nodes.attr$state=="Florida", ]$latitude[i] <- florida$lat[i]
nodes.attr[nodes.attr$state=="Florida", ]$longitude[i] <- florida$lon[i]
}
for ( i in 1:nrow(nodes.attr[nodes.attr$state=="Maine",]) ) {
nodes.attr[nodes.attr$state=="Maine", ]$latitude[i] <- maine$lat[i]
nodes.attr[nodes.attr$state=="Maine", ]$longitude[i] <- maine$lon[i]
}
for ( i in 1:nrow(nodes.attr[nodes.attr$state=="North Dakota",]) ) {
nodes.attr[nodes.attr$state=="North Dakota", ]$latitude[i] <- northdakota$lat[i]
nodes.attr[nodes.attr$state=="North Dakota", ]$longitude[i] <- northdakota$lon[i]
}
for ( i in 1:nrow(nodes.attr[nodes.attr$state=="Alaska",]) ) {
nodes.attr[nodes.attr$state=="Alaska", ]$latitude[i] <- alaska$lat[i]
nodes.attr[nodes.attr$state=="Alaska", ]$longitude[i] <- alaska$lon[i]
}
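# The five loops above could also be written as one loop over the states --
# a sketch of an equivalent formulation, kept here only as a comment:
#
#   for (s in unique(nodes.attr$state)) {
#     coords <- statecoord[statecoord$state == s, ]
#     idx <- which(nodes.attr$state == s)
#     nodes.attr$latitude[idx]  <- coords$lat[seq_along(idx)]
#     nodes.attr$longitude[idx] <- coords$lon[seq_along(idx)]
#   }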
nodes.attr$Color <- sapply(nodes.attr$WoSCitesAvg, function(x) {
  if (x <= 10) "#a50026"
  else if (x > 10 && x <= 50) "#fdae61"
  else if (x > 50 && x <= 100) "#ffffbf"
  else if (x > 100 && x <= 200) "#a6d96a"
  else "#006837"
})
nodes.attr <- nodes.attr %>%
  mutate(Latitude = latitude, Longitude = longitude) %>%
  select(Id, Count, WoSCitesAvg, Latitude, Longitude, Color)
write.csv(nodes.attr, file="nodes_attr.csv", row.names=FALSE)