# Tuija Sonkkila 3.3.2015
#
# Aalto University articles cited in Wikipedia, identified by DOI.
# This code makes node and node attribute files for a network graph visualization.
#
# Wikipedia data CC0:
# Halfaker, Aaron; Taraborelli, Dario (2015): Scholarly article citations in Wikipedia. figshare.
# http://dx.doi.org/10.6084/m9.figshare.1299540
# Retrieved 15:17, Mar 02, 2015 (GMT)
#
# Web of Science data by Thomson Reuters
#
# More in the blog post: https://blogs.aalto.fi/suoritin/2015/03/03/wikipedia-outreach-by-field-of-science/
library(dplyr)
library(tidyr)

wikidatadoi <- read.delim("doi_and_pubmed_citations.enwiki_20150112.tsv",
                          quote = "",
                          stringsAsFactors = FALSE)
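# The TSV is expected to hold the cited DOI in a column named 'id';
# the inner join below relies on that name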
# aalto.doi gathered from local sources, originating from Web of Science.
# WoS field tags: di = DOI, tc = times cited, ti = title, wc = WoS categories
aalto.doi <- aalto.doi %>%
  mutate(id = di) %>%
  select(id, tc, ti, wc)
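# Optional safeguard (not part of the original run): DOIs are case-insensitive,
# so matches can be lost if the two sources differ in casing. Uncomment to
# normalize both sides before joining:
# wikidatadoi$id <- tolower(wikidatadoi$id)
# aalto.doi$id   <- tolower(aalto.doi$id)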
joined <- inner_join(wikidatadoi, aalto.doi, by = "id")

# Extract the first field of science out of possibly many
joined.kw.split <- joined %>%
  extract("wc", "kw", "([^;]+)\\;*")
# Aggregate fields to WoS research areas.
# wosaggr.csv made partly manually out of
# http://images.webofknowledge.com/WOKRS57B4/help/WOS/hp_research_areas_easca.html
wosareas <- read.table("wosaggr.csv", header = FALSE, sep = ";", quote = "",
                       stringsAsFactors = FALSE)
names(wosareas) <- c("field", "agg")
wosareas$field <- tolower(wosareas$field)

# http://r.789695.n4.nabble.com/how-to-transform-string-to-Camel-Case-tp4664222p4664261.html
wosareas$field <- gsub("(^|\\s+)([a-z])", "\\1\\U\\2", wosareas$field, perl = TRUE)
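# The \\U in the replacement upper-cases every letter that follows the start
# of the string or whitespace, e.g. "materials science" -> "Materials Science",
# restoring the casing used in the 'kw' column so the join below matches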
all <- left_join(joined.kw.split, wosareas, by = c("kw" = "field"))
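# Added sanity check: fields absent from wosaggr.csv come out of the
# left join with agg = NA
if (any(is.na(all$agg))) {
  warning(sum(is.na(all$agg)), " rows lack an aggregated research area")
}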
by.field <- all %>%
  group_by(kw) %>%
  mutate(Count = n(),
         WoSCitesAvg = round(sum(tc) / n(), digits = 2),
         DOI = id) %>%
  filter(!duplicated(kw)) %>%   # keep one row per field
  select(kw, agg, Count, WoSCitesAvg)
# Make nodes
nodes <- by.field %>%
  ungroup() %>%
  mutate(Label = kw,
         Id = seq_len(nrow(by.field)),
         Type = "Undirected") %>%
  select(Label, Id, Type)

write.csv(nodes, file = "nodes.csv", row.names = FALSE)
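# 'Id' and 'Label' are the column names a node-table import in a graph tool
# such as Gephi expects; 'Type' is carried along as an extra attribute
# (the target tool is not named in this script)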
# Make node attributes
nodes.attr <- by.field %>%
  ungroup() %>%
  mutate(Id = seq_len(nrow(by.field)),
         Field = kw) %>%
  select(Count, Field, agg, WoSCitesAvg, Id)
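# Ids are generated with the same sequence as in 'nodes' above, so
# nodes.csv and nodes_attr.csv line up row by row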
# Add US state coordinates to categories
download.file("http://econym.org.uk/gmap/states.xml", destfile = "states.xml", method = "curl")
command <- "java -jar ~/saxonee/saxon9ee.jar states.xml parsexml.xsl >states.csv"
system(command)
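# Assumed local setup: Saxon EE in ~/saxonee/ and a stylesheet parsexml.xsl
# (not included in this gist) that flattens states.xml into a
# semicolon-separated states.csv with columns state, lat and lon,
# which the code below relies on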
statecoord <- read.table("states.csv", header = TRUE, sep = ";", stringsAsFactors = FALSE)

nodes.attr$state <- sapply(nodes.attr$agg, function(x) {
  if (x == "Life Sciences & Biomedicine") "Washington"
  else if (x == "Physical Sciences") "Florida"
  else if (x == "Technology") "Alaska"
  else if (x == "Arts & Humanities") "North Dakota"
  else "Maine"
})
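# Each research area thus borrows one US state's boundary points as node
# coordinates: judging from the per-state coordinate rows used below,
# states.xml holds a sequence of lat/lon points per state, so every area
# ends up drawn over its own recognizable region of the map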
nodes.attr$latitude <- ""
nodes.attr$longitude <- ""

# Assign coordinates in document order: the i-th node of a state gets that
# state's i-th boundary point (NA if a state has fewer points than nodes)
for (st in c("Washington", "Florida", "Maine", "North Dakota", "Alaska")) {
  rows <- which(nodes.attr$state == st)
  coords <- statecoord[statecoord$state == st, ]
  nodes.attr$latitude[rows] <- coords$lat[seq_along(rows)]
  nodes.attr$longitude[rows] <- coords$lon[seq_along(rows)]
}
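# Bin nodes into five colors by average WoS citation count; the hex values
# appear to be picked from the ColorBrewer RdYlGn palette (red = low cites,
# green = high)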
nodes.attr$Color <- sapply(nodes.attr$WoSCitesAvg, function(x) {
  if (x <= 10) "#a50026"
  else if (x > 10 && x <= 50) "#fdae61"
  else if (x > 50 && x <= 100) "#ffffbf"
  else if (x > 100 && x <= 200) "#a6d96a"
  else "#006837"
})
nodes.attr <- nodes.attr %>%
  mutate(Latitude = latitude, Longitude = longitude) %>%
  select(Id, Count, WoSCitesAvg, Latitude, Longitude, Color)

write.csv(nodes.attr, file = "nodes_attr.csv", row.names = FALSE)