# tts/query.R

## query.R
# Query ACRIS on ORCID IDs. Based on these, fetch works and bio via ORCID Public API
#
# SQL query in Pure and save in CSV: SELECT firstname, lastname, orcid FROM PERSON WHERE orcid IS NOT NULL
# Change character set: iconv -f ISO-8859-1 -t UTF-8 orcidinpure.csv > orcidinpure_utf8.csv
#
# Tuija Sonkkila 7.7.2016

library(rorcid)


# Load the Pure export (no header row; read.csv2 expects ';' as separator)
# and label its three columns in one step.
orcidinpure <- setNames(
  read.csv2(
    file = "orcidinpure_utf8.csv",
    header = FALSE,
    stringsAsFactors = FALSE
  ),
  c("first", "last", "orcid")
)

# Testing my own data
#
# out <- orcid_id(orcid = "0000-0002-6892-9305")
# as.POSIXct(out$`0000-0002-6892-9305`$`orcid-history`$`submission-date`$value/1000, origin="1970-01-01")
# "2013-08-26 16:47:05 EEST" > OK
#
# works <- works(orcid_id("0000-0002-6892-9305"))
# myworks <- works$data # to data frame

# All ORCID IDs as a vector
idsinpure <- orcidinpure$orcid

# Column layout of the works table. It is also the final result when none of
# the ORCID IDs has any works.
emptyworks <- data.frame(
  orcid = character(0),
  title = character(0),
  type = character(0),
  year = character(0),
  created = numeric(0),
  stringsAsFactors = FALSE
)

# Collect works: build one data frame per ORCID ID and bind them together once
# after the loop, instead of growing the result with rbind() for every work.
worksperid <- vector("list", length(idsinpure))
for (k in seq_along(idsinpure)) {
  this <- works(orcid_id(idsinpure[k]))
  found <- this$data
  # The original check compared this$data against 'None', so presumably works()
  # returns that string for a profile without works -- verify against the
  # installed rorcid version. Testing the class keeps the condition a single
  # TRUE/FALSE: `found != 'None'` on a data frame yields one logical per cell,
  # which if() rejects in R >= 4.2. It also skips zero-row tables, where
  # 1:nrow() would have iterated over c(1, 0).
  if (is.data.frame(found) && nrow(found) > 0) {
    worksperid[[k]] <- data.frame(
      orcid = attributes(this)$orcid,
      title = found[, "work-title.title.value"],
      type = found[, "work-type"],
      year = found[, "publication-date.year.value"],
      created = found[, "created-date.value"],
      stringsAsFactors = FALSE
    )
  }
}

# IDs without works left a NULL slot; drop those before binding.
worksinorcid <- do.call(
  rbind,
  c(list(emptyworks), Filter(Negate(is.null), worksperid))
)

# UNIX timestamp to date
# `created` is divided by 1000 before conversion, i.e. it is treated as
# milliseconds since the Unix epoch.
worksinorcid[["created"]] <- as.POSIXct(
  worksinorcid[["created"]] / 1000,
  origin = "1970-01-01"
)

# Save the works table as CSV without row names.
write.csv(
  worksinorcid,
  file = "worksinorcid.csv",
  row.names = FALSE
)

# Collect bio
# Fetch the bio profile for all ORCID IDs in a single call.
bioinpure <- orcid_id(orcid = idsinpure, profile = "bio")

# Column layout of the bio table. `biocreated` is numeric so that the division
# below also works when no profile came back.
emptybio <- data.frame(
  orcid = character(0),
  biocreated = numeric(0),
  first = character(0),
  last = character(0),
  stringsAsFactors = FALSE
)

# One row per returned profile. seq_along() keeps the loop from running on an
# empty result, where 1:length() would iterate over c(1, 0).
bioperid <- vector("list", length(bioinpure))
for (k in seq_along(bioinpure)) {
  this <- names(bioinpure)[k]
  submitted <- bioinpure[[this]]$`orcid-history`$`submission-date`$value
  # Parsing info from ORCID for testing purposes
  details <- bioinpure[[this]]$`orcid-bio`$`personal-details`
  given <- details$`given-names`$value
  family <- details$`family-name`$value
  # A missing field comes back as NULL. Assigning NULL to a data frame column
  # deletes the column and breaks the later rbind(), so substitute a
  # placeholder instead. "N/A" is the placeholder the original already used
  # for a missing family name (one profile had none).
  bioperid[[k]] <- data.frame(
    orcid = this,
    biocreated = if (is.null(submitted)) NA_real_ else submitted,
    first = if (is.null(given)) "N/A" else given,
    last = if (is.null(family)) "N/A" else family,
    stringsAsFactors = FALSE
  )
}
bioinorcid <- do.call(rbind, c(list(emptybio), bioperid))

# `biocreated` is divided by 1000 before conversion, i.e. it is treated as
# milliseconds since the Unix epoch.
bioinorcid$biocreated <- as.POSIXct(
  bioinorcid$biocreated / 1000,
  origin = "1970-01-01"
)


# Join with works data
library(dplyr)
# Key the join explicitly on the ORCID ID, the only column the two tables
# share, instead of relying on dplyr's natural-join guess (and its message).
# Bio rows without any works are kept, with NA in the works columns.
bioandworksinorcid <- left_join(bioinorcid, worksinorcid, by = "orcid")

# Write to file
write.csv(
  bioandworksinorcid,
  file = "bioandworksinorcid.csv",
  row.names = FALSE
)
	# Query ACRIS on ORCID IDs. Based on these, fetch works and bio via ORCID Public API
	#
	# SQL query in Pure and save in CSV: SELECT firstname, lastname, orcid FROM PERSON WHERE orcid IS NOT NULL
	# Change character set: iconv -f ISO-8859-1 -t UTF-8 orcidinpure.csv > orcidinpure_utf8.csv
	#
	# Tuija Sonkkila 7.7.2016

	library(rorcid)


	# Load the Pure export (no header row; read.csv2 expects ';' as separator)
	# and label its three columns in one step.
	orcidinpure <- setNames(
	  read.csv2(
	    file = "orcidinpure_utf8.csv",
	    header = FALSE,
	    stringsAsFactors = FALSE
	  ),
	  c("first", "last", "orcid")
	)

	# Testing my own data
	#
	# out <- orcid_id(orcid = "0000-0002-6892-9305")
	# as.POSIXct(out$`0000-0002-6892-9305`$`orcid-history`$`submission-date`$value/1000, origin="1970-01-01")
	# "2013-08-26 16:47:05 EEST" > OK
	#
	# works <- works(orcid_id("0000-0002-6892-9305"))
	# myworks <- works$data # to data frame

	# All ORCID IDs as a vector
	idsinpure <- orcidinpure$orcid

	# Column layout of the works table. It is also the final result when none of
	# the ORCID IDs has any works.
	emptyworks <- data.frame(
	  orcid = character(0),
	  title = character(0),
	  type = character(0),
	  year = character(0),
	  created = numeric(0),
	  stringsAsFactors = FALSE
	)

	# Collect works: build one data frame per ORCID ID and bind them together once
	# after the loop, instead of growing the result with rbind() for every work.
	worksperid <- vector("list", length(idsinpure))
	for (k in seq_along(idsinpure)) {
	  this <- works(orcid_id(idsinpure[k]))
	  found <- this$data
	  # The original check compared this$data against 'None', so presumably works()
	  # returns that string for a profile without works -- verify against the
	  # installed rorcid version. Testing the class keeps the condition a single
	  # TRUE/FALSE: `found != 'None'` on a data frame yields one logical per cell,
	  # which if() rejects in R >= 4.2. It also skips zero-row tables, where
	  # 1:nrow() would have iterated over c(1, 0).
	  if (is.data.frame(found) && nrow(found) > 0) {
	    worksperid[[k]] <- data.frame(
	      orcid = attributes(this)$orcid,
	      title = found[, "work-title.title.value"],
	      type = found[, "work-type"],
	      year = found[, "publication-date.year.value"],
	      created = found[, "created-date.value"],
	      stringsAsFactors = FALSE
	    )
	  }
	}

	# IDs without works left a NULL slot; drop those before binding.
	worksinorcid <- do.call(
	  rbind,
	  c(list(emptyworks), Filter(Negate(is.null), worksperid))
	)

	# UNIX timestamp to date
	# `created` is divided by 1000 before conversion, i.e. it is treated as
	# milliseconds since the Unix epoch.
	worksinorcid[["created"]] <- as.POSIXct(
	  worksinorcid[["created"]] / 1000,
	  origin = "1970-01-01"
	)

	# Save the works table as CSV without row names.
	write.csv(
	  worksinorcid,
	  file = "worksinorcid.csv",
	  row.names = FALSE
	)

	# Collect bio
	# Fetch the bio profile for all ORCID IDs in a single call.
	bioinpure <- orcid_id(orcid = idsinpure, profile = "bio")

	# Column layout of the bio table. `biocreated` is numeric so that the division
	# below also works when no profile came back.
	emptybio <- data.frame(
	  orcid = character(0),
	  biocreated = numeric(0),
	  first = character(0),
	  last = character(0),
	  stringsAsFactors = FALSE
	)

	# One row per returned profile. seq_along() keeps the loop from running on an
	# empty result, where 1:length() would iterate over c(1, 0).
	bioperid <- vector("list", length(bioinpure))
	for (k in seq_along(bioinpure)) {
	  this <- names(bioinpure)[k]
	  submitted <- bioinpure[[this]]$`orcid-history`$`submission-date`$value
	  # Parsing info from ORCID for testing purposes
	  details <- bioinpure[[this]]$`orcid-bio`$`personal-details`
	  given <- details$`given-names`$value
	  family <- details$`family-name`$value
	  # A missing field comes back as NULL. Assigning NULL to a data frame column
	  # deletes the column and breaks the later rbind(), so substitute a
	  # placeholder instead. "N/A" is the placeholder the original already used
	  # for a missing family name (one profile had none).
	  bioperid[[k]] <- data.frame(
	    orcid = this,
	    biocreated = if (is.null(submitted)) NA_real_ else submitted,
	    first = if (is.null(given)) "N/A" else given,
	    last = if (is.null(family)) "N/A" else family,
	    stringsAsFactors = FALSE
	  )
	}
	bioinorcid <- do.call(rbind, c(list(emptybio), bioperid))

	# `biocreated` is divided by 1000 before conversion, i.e. it is treated as
	# milliseconds since the Unix epoch.
	bioinorcid$biocreated <- as.POSIXct(
	  bioinorcid$biocreated / 1000,
	  origin = "1970-01-01"
	)


	# Join with works data
	library(dplyr)
	# Key the join explicitly on the ORCID ID, the only column the two tables
	# share, instead of relying on dplyr's natural-join guess (and its message).
	# Bio rows without any works are kept, with NA in the works columns.
	bioandworksinorcid <- left_join(bioinorcid, worksinorcid, by = "orcid")

	# Write to file
	write.csv(
	  bioandworksinorcid,
	  file = "bioandworksinorcid.csv",
	  row.names = FALSE
	)