Last active
September 6, 2017 15:44
-
-
Save csjx/d10858024ebc53effa5349c2544fa234 to your computer and use it in GitHub Desktop.
An R script that queries the DataONE Node Registry and extracts a list of dates when each MN became operational
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Query the DataONE Node Registry to find the dates on which Member Nodes
# began operation. The registry service returns its results as XML.
library(httr)
library(xml2)

# The base URL for the DataONE Coordinating Node
cn_base_url <- "https://cn.dataone.org/cn"

# The node registry endpoint
cn_node_service <- "/v2/node"
nodes_url <- paste0(cn_base_url, cn_node_service)

# Execute the query. Stop on an HTTP error status so that an error page is
# never handed to the XML parser below.
request <- GET(nodes_url)
stop_for_status(request)
node_list <- read_xml(content(request, as = "text", encoding = "UTF-8"))

# Filter the XML node list:
# - for Member Nodes only (type = mn)
# - with identifiers that don't start with 'urn:node:mn' (D1 replica nodes)
# - with a CN_date_operational property
# The <node> elements are matched without a namespace prefix, as in the
# sample document shown at the bottom of this script.
nodes <- xml_find_all(
  node_list,
  paste0(
    "//node[@type='mn' and not(starts-with(identifier, 'urn:node:mn'))]",
    "/property[@key='CN_date_operational']"
  )
)

# Construct a node date list of just the text values from the filtered XML
dates <- xml_text(nodes)

# FYI: the node_list XML above looks like this (some elements removed for
# brevity):
# <?xml version="1.0" encoding="UTF-8"?>
# <ns3:nodeList xmlns:ns2="http://ns.dataone.org/service/types/v1"
#               xmlns:ns3="http://ns.dataone.org/service/types/v2.0">
#   <node replicate="false" synchronize="true" type="mn" state="up">
#     <identifier>urn:node:RW</identifier>
#     <name>Research Workspace</name>
#     <description>
#       The Research Workspace is a web-based, scientific data management
#       platform that allows researchers to store and share their data.
#     </description>
#     <baseURL>https://dataone.researchworkspace.com/mn</baseURL>
#     <property key="CN_date_operational">2017-07-25T00:00:0.000Z</property>
#   </node>
#   ...
# </ns3:nodeList>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Query the DataONE Index service to find uploads by month.
# The DataONE index server can return results in CSV format,
# but not for faceted (summarized) results, so use the jsonlite
# library to parse the result.
library(jsonlite)

# The base URL for the DataONE Coordinating Node
cn_base_url <- "https://cn.dataone.org/cn"

# The Solr query service endpoint
cn_query_service <- "/v2/query/solr/?"

# Construct a query string that:
# - limits records to METADATA or DATA format types
# - excludes obsoleted records (i.e. previous versions)
# and then return the response with
# - no raw results (0 rows)
# - no query response header (less complex result)
# - JSON format
query <- paste0(
  "q=formatType:(METADATA+OR+DATA)",
  "+-obsoletedBy:*",
  "&rows=0",
  "&omitHeader=true",
  "&wt=json"
)

# Configure how we summarize the data:
# - turn faceting on
# - summarize by range using the dateUploaded field
# - start the range in October, 2001 (the earliest upload date)
# - end the range with the current date
# - bin the results by month (%2B is a URL-encoded plus sign)
# - facet.missing=true: intended to include months with 0 uploads
#   NOTE(review): in Solr this parameter appears to request a count of
#   records that have no value for the faceted field; empty months are
#   presumably returned anyway via the default facet.mincount -- confirm.
# - don't limit the size of the results returned
facet_settings <- paste0(
  "&facet=true",
  "&facet.range=dateUploaded",
  "&facet.range.start=2001-10-01T00:00:00Z",
  "&facet.range.end=NOW",
  "&facet.range.gap=%2B1MONTH",
  "&facet.missing=true",
  "&facet.limit=-1"
)

# Concatenate the above variables to create a query URL
uploads_by_month_url <- paste0(
  cn_base_url,
  cn_query_service,
  query,
  facet_settings
)

# Execute the query; fromJSON() fetches the URL and parses the JSON body
response <- fromJSON(uploads_by_month_url)

# And drill down into the JSON data structure to get uploads by month.
# This results in an array of (month followed by count) pairs.
# `[[` is used instead of `$` so element names are matched exactly.
facet_ranges <- response[["facet_counts"]][["facet_ranges"]]
uploads_by_month <- facet_ranges[["dateUploaded"]][["counts"]]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment