Web mining example: Scraping web data to look for a new job.
require(XML)
require(RCurl)
require(stringr)
require(rentrez)
require(rjson)
require(reshape2)
require(ggmap)
require(mapproj)
require(devtools)
install_github("rOpenHealth/rpubmed") # install rpubmed from GitHub (only needed once)
require(rpubmed)
## Get IDs of papers containing "electronic medical records"
paper_ids <- entrez_search("pubmed", "electronic medical records", retmax = 5000)$ids
records <- fetch_in_chunks(paper_ids) # download the full PubMed records in batches
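## Optional sanity check (not in the original gist): peek at the nested structure of the
## first downloaded record to see where the affiliation field sits.
str(records[[1]]$MedlineCitation$Article, max.level = 2)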
## Pull the article affiliations out of the nested records
## (typically the first listed author's Affiliation field):
addresses <- as.character(sapply(records, function(x) x$MedlineCitation$Article$AuthorList$Author$Affiliation))
# Clean up with a regex: strip the "Electronic address:" label and any email addresses
addresses <- str_replace_all(addresses, "(Electronic [Aa]ddress:)|[[:alnum:][:punct:]]+@+[[:alnum:][:punct:]]+", "")
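## Illustration only (made-up affiliation string, not real data): the regex above should
## strip both the "Electronic address:" label and the email itself.
str_replace_all("Institute of X, University of Y. Electronic address: jane.doe@uni.ac.uk.",
                "(Electronic [Aa]ddress:)|[[:alnum:][:punct:]]+@+[[:alnum:][:punct:]]+", "")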
journals <- as.character(sapply(records, function(x) x$MedlineCitation$MedlineJournalInfo$ISSNLinking))
job_data <- data.frame(address = addresses, ISSN = journals)
# Get data on impact factors of major scientific journals
impact_url <- "http://www.citefactor.org/impact-factor-list-2012.html"
# The data we want is in an HTML table on the webpage
impacts <- readHTMLTable(impact_url)[[1]]
impacts <- impacts[, c(1,2,4)] # keep only these columns
## article data and impact factor share the ISSN column so merge:
job_data <- merge(job_data, impacts, all.x = TRUE)
job_data <- job_data[complete.cases(job_data), ]  # drop rows with missing values
job_data <- job_data[job_data$address != "NULL", ] # drop records with no affiliation
save(job_data, file = "data/job_data.RData")
## This website provides ranking data for the top 100 European universities in a table:
rankings_url <- "http://www.researchranking.org/?action=ranking"
# No header in the table so we will make our own:
rankings <- readHTMLTable(rankings_url, header = FALSE, stringsAsFactors = FALSE)[[1]]
names(rankings) <- c("rank", "institution", "type", "country", "score")
rankings$institution <- str_replace(rankings$institution, "^THE ", "") # cleanup
## Iterate over the top 100 institutions, check whether each appears in an article address and, if so, attach its ranking data
job_data$institution_score <- NA
job_data$institutions <- NA
job_data$countries <- NA
for (institution in 1:nrow(rankings)) {
  # ignore.case() is the old stringr helper; stringr >= 1.0 uses regex(pattern, ignore_case = TRUE)
  new_score <- str_detect(job_data$address, ignore.case(rankings[institution, "institution"]))
  if (sum(new_score)) {
    job_data$institution_score[new_score] <- rankings[institution, "score"]
    job_data$institutions[new_score] <- rankings[institution, "institution"]
    job_data$countries[new_score] <- rankings[institution, "country"]
  }
}
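## Optional check (not in the original gist): how many articles matched a ranked institution?
sum(!is.na(job_data$institutions))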
save(job_data, file = "data/job_data.RData")
## Subset by universities with rankings:
selected <- job_data[!is.na(job_data$institution_score),]
selected$impact_factor <- as.numeric(as.character(selected[["Impact Factor"]])) # factor/text -> numeric
selected[["Impact Factor"]] <- NULL
selected$institution_score <- as.numeric(selected$institution_score)
## Use ggmap to get longitudes and latitudes:
coords <- geocode(paste(selected$institution, selected$country))
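# Note: geocode() queries an external geocoding web service (one request per row), so this
# step needs an internet connection and may be slow or rate-limited depending on your ggmap setup.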
# Bind on to the side of the article data
selected <- cbind(selected, coords)
save(selected, file = "selected.RData")
## Use reshape2 to aggregate the data down to one row per institution
molten <- melt(selected)
# Mean impact factor (and other numeric columns) for articles from each university:
my_data <- dcast(molten, country + institution ~ variable, mean)
# Publication counts per university: dcast with length just counts rows per group,
# so any value column serves as the counter (institution_score is used here):
publications <- dcast(molten, country + institution ~ variable, length)
my_data$publications <- publications$institution_score
# Composite ranking: sum of the ranks for institution score, mean impact factor and publication count
my_data$ranks <- rank(my_data$institution_score) + rank(my_data$impact_factor) + rank(my_data$publications)
my_data <- my_data[order(my_data$ranks, decreasing = TRUE),]
# Capitalise the university names for display
capitalise <- function(x) {
  s <- tolower(strsplit(x, " ")[[1]])
  paste(toupper(substring(s, 1, 1)), substring(s, 2),
        sep = "", collapse = " ")
}
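# e.g. capitalise("UNIVERSITY OF MANCHESTER") returns "University Of Manchester"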
my_data$institution <- sapply(my_data$institution, capitalise)
# Only label the higher-ranked institutions to keep the map readable:
my_data$labels <- my_data$institution
my_data$labels[my_data$ranks < 35] <- ""
# Plot the map:
p <- qmap(c(-5, 60, 19, 48), zoom = 4)
p + geom_point(aes(lon, lat, colour = ranks, size = publications), data = my_data) +
  scale_color_continuous(low = "red", high = "blue") +
  geom_text(aes(x = lon, y = lat, label = labels, size = 6,
                # hand-tuned hjust values, one per plotted institution, to reduce label overlap
                hjust = c(1, rep(0, 5), 1, 0, 0, 1, rep(0, 11)), vjust = 0),
            data = my_data)
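## Optional (not in the original gist): save the finished map to disk; the filename is arbitrary.
# ggsave("emr_publications_map.png", width = 10, height = 8)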