# Tuija Sonkkila 3.3.2015
#
# Aalto University articles cited in Wikipedia, identified by DOI.
# This code makes node and node attribute files for a network graph visualization.
#
# Wikipedia data CC0:
# Halfaker, Aaron; Taraborelli, Dario (2015): Scholarly article citations in Wikipedia. figshare.
# http://dx.doi.org/10.6084/m9.figshare.1299540
# Retrieved 15:17, Mar 02, 2015 (GMT)
#
# Web of Science data by Thomson Reuters
#
# More in the blog post: https://blogs.aalto.fi/suoritin/2015/03/03/wikipedia-outreach-by-field-of-science/
library(dplyr)
library(tidyr)

wikidatadoi <- read.delim("doi_and_pubmed_citations.enwiki_20150112.tsv",
                          quote = "",
                          stringsAsFactors = FALSE)
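# The TSV is expected to hold the cited DOI in a column named 'id';
# the inner join below relies on that name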
# aalto.doi gathered from local sources, originating from Web of Science.
# WoS field tags: di = DOI, tc = times cited, ti = title, wc = WoS categories
aalto.doi <- aalto.doi %>%
  mutate(id = di) %>%
  select(id, tc, ti, wc)
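# Optional safeguard (not part of the original run): DOIs are case-insensitive,
# so matches can be lost if the two sources differ in casing. Uncomment to
# normalize both sides before joining:
# wikidatadoi$id <- tolower(wikidatadoi$id)
# aalto.doi$id   <- tolower(aalto.doi$id)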
joined <- inner_join(wikidatadoi, aalto.doi, by = "id")

# Extract the first field of science out of possibly many
joined.kw.split <- joined %>%
  extract("wc", "kw", "([^;]+)\\;*")
# Aggregate fields to WoS research areas.
# wosaggr.csv made partly manually out of
# http://images.webofknowledge.com/WOKRS57B4/help/WOS/hp_research_areas_easca.html
wosareas <- read.table("wosaggr.csv", header = FALSE, sep = ";", quote = "",
                       stringsAsFactors = FALSE)
names(wosareas) <- c("field", "agg")
wosareas$field <- tolower(wosareas$field)

# http://r.789695.n4.nabble.com/how-to-transform-string-to-Camel-Case-tp4664222p4664261.html
wosareas$field <- gsub("(^|\\s+)([a-z])", "\\1\\U\\2", wosareas$field, perl = TRUE)
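# The \\U in the replacement upper-cases every letter that follows the start
# of the string or whitespace, e.g. "materials science" -> "Materials Science",
# restoring the casing used in the 'kw' column so the join below matches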
all <- left_join(joined.kw.split, wosareas, by = c("kw" = "field"))
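# Added sanity check: fields absent from wosaggr.csv come out of the
# left join with agg = NA
if (any(is.na(all$agg))) {
  warning(sum(is.na(all$agg)), " rows lack an aggregated research area")
}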
by.field <- all %>%
  group_by(kw) %>%
  mutate(Count = n(),
         WoSCitesAvg = round(sum(tc) / n(), digits = 2),
         DOI = id) %>%
  filter(!duplicated(kw)) %>%   # keep one row per field
  select(kw, agg, Count, WoSCitesAvg)
# Make nodes
nodes <- by.field %>%
  ungroup() %>%
  mutate(Label = kw,
         Id = seq_len(nrow(by.field)),
         Type = "Undirected") %>%
  select(Label, Id, Type)

write.csv(nodes, file = "nodes.csv", row.names = FALSE)
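# 'Id' and 'Label' are the column names a node-table import in a graph tool
# such as Gephi expects; 'Type' is carried along as an extra attribute
# (the target tool is not named in this script)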
# Make node attributes
nodes.attr <- by.field %>%
  ungroup() %>%
  mutate(Id = seq_len(nrow(by.field)),
         Field = kw) %>%
  select(Count, Field, agg, WoSCitesAvg, Id)
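# Ids are generated with the same sequence as in 'nodes' above, so
# nodes.csv and nodes_attr.csv line up row by row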
# Add US state coordinates to categories
download.file("http://econym.org.uk/gmap/states.xml", destfile = "states.xml", method = "curl")
command <- "java -jar ~/saxonee/saxon9ee.jar states.xml parsexml.xsl >states.csv"
system(command)
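# Assumed local setup: Saxon EE in ~/saxonee/ and a stylesheet parsexml.xsl
# (not included in this gist) that flattens states.xml into a
# semicolon-separated states.csv with columns state, lat and lon,
# which the code below relies on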
statecoord <- read.table("states.csv", header = TRUE, sep = ";", stringsAsFactors = FALSE)

nodes.attr$state <- sapply(nodes.attr$agg, function(x) {
  if (x == "Life Sciences & Biomedicine") "Washington"
  else if (x == "Physical Sciences") "Florida"
  else if (x == "Technology") "Alaska"
  else if (x == "Arts & Humanities") "North Dakota"
  else "Maine"
})
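# Each research area thus borrows one US state's boundary points as node
# coordinates: judging from the per-state coordinate rows used below,
# states.xml holds a sequence of lat/lon points per state, so every area
# ends up drawn over its own recognizable region of the map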
nodes.attr$latitude <- ""
nodes.attr$longitude <- ""

# Assign coordinates in document order: the i-th node of a state gets that
# state's i-th boundary point (NA if a state has fewer points than nodes)
for (st in c("Washington", "Florida", "Maine", "North Dakota", "Alaska")) {
  rows <- which(nodes.attr$state == st)
  coords <- statecoord[statecoord$state == st, ]
  nodes.attr$latitude[rows] <- coords$lat[seq_along(rows)]
  nodes.attr$longitude[rows] <- coords$lon[seq_along(rows)]
}
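# Bin nodes into five colors by average WoS citation count; the hex values
# appear to be picked from the ColorBrewer RdYlGn palette (red = low cites,
# green = high)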
nodes.attr$Color <- sapply(nodes.attr$WoSCitesAvg, function(x) {
  if (x <= 10) "#a50026"
  else if (x > 10 && x <= 50) "#fdae61"
  else if (x > 50 && x <= 100) "#ffffbf"
  else if (x > 100 && x <= 200) "#a6d96a"
  else "#006837"
})
nodes.attr <- nodes.attr %>%
  mutate(Latitude = latitude, Longitude = longitude) %>%
  select(Id, Count, WoSCitesAvg, Latitude, Longitude, Color)

write.csv(nodes.attr, file = "nodes_attr.csv", row.names = FALSE)