Skip to content

Instantly share code, notes, and snippets.

@csjx
Last active September 6, 2017 15:44
Show Gist options
  • Save csjx/d10858024ebc53effa5349c2544fa234 to your computer and use it in GitHub Desktop.
Save csjx/d10858024ebc53effa5349c2544fa234 to your computer and use it in GitHub Desktop.
An R script that queries the DataONE Node Registry and extracts a list of dates when each MN became operational
# Query the DataONE Node Registry to find dates that Member Nodes began operation
# The DataONE Node registry service returns results as XML
library(httr)
library(xml2)
# Root URL of the DataONE Coordinating Node (CN)
cn_base_url <- "https://cn.dataone.org/cn"
# Path of the node registry endpoint, relative to the CN root
cn_node_service <- "/v2/node"
# Full node registry URL: the root immediately followed by the endpoint path
nodes_url <- paste0(cn_base_url, cn_node_service)
# Execute the query against the node registry
request <- GET(nodes_url)
# Stop with an informative error on an HTTP failure (4xx/5xx) instead of
# handing an error page to the XML parser below
stop_for_status(request)
# Decode the response body as UTF-8 text and parse it as XML
node_list <- read_xml(content(request, as = "text", encoding = "UTF-8"))
# Filter the XML node list down to the CN_date_operational <property>
# elements of nodes that are:
# - Member Nodes only (type = mn)
# - with identifiers that don't start with 'urn:node:mn' (D1 replica nodes)
# Nodes lacking a CN_date_operational property contribute nothing to the result
nodes <- xml_find_all(
  node_list,
  "//node[@type='mn' and not(starts-with(identifier, 'urn:node:mn'))]/property[@key='CN_date_operational']"
)
# Construct a node date list of just the text values from the filtered XML
dates <- xml_text(nodes)
# FYI: the node_list XML above looks like this (some elements removed for brevity):
# <?xml version="1.0" encoding="UTF-8"?>
# <ns3:nodeList xmlns:ns2="http://ns.dataone.org/service/types/v1"
# xmlns:ns3="http://ns.dataone.org/service/types/v2.0">
# <node replicate="false" synchronize="true" type="mn" state="up">
# <identifier>urn:node:RW</identifier>
# <name>Research Workspace</name>
# <description>
# The Research Workspace is a web-based, scientific data management
# platform that allows researchers to store and share their data.
# </description>
# <baseURL>https://dataone.researchworkspace.com/mn</baseURL>
# <property key="CN_date_operational">2017-07-25T00:00:0.000Z</property>
# </node>
# ...
# </ns3:nodeList>
# Query the DataONE Index service to find uploads by month
# The DataONE index server can return results in CSV format,
# but not for faceted (summarized) results, so use the jsonlite
# library to parse the result
library(jsonlite)
# Root URL of the DataONE Coordinating Node (CN)
cn_base_url <- "https://cn.dataone.org/cn"
# Path of the Solr query service endpoint; the trailing "?" opens the
# query string that the pieces below are appended to
cn_query_service <- "/v2/query/solr/?"
# Search criteria and response format, one URL fragment per line.
# The fragments are joined with no separator, so each one carries its own
# leading "+" or "&".
query <- paste0(
  "q=formatType:(METADATA+OR+DATA)", # only METADATA or DATA format types
  "+-obsoletedBy:*",                 # exclude obsoleted records (previous versions)
  "&rows=0",                         # no raw results, only the summary
  "&omitHeader=true",                # no query response header (less complex result)
  "&wt=json"                         # JSON format
)
# Facet (summary) settings, again one URL fragment per line.
# NOTE(review): facet.missing and facet.limit are documented by Solr as
# field-facet parameters; whether they affect this range facet, and whether
# empty months are instead governed by facet.mincount, should be confirmed.
facet_settings <- paste0(
  "&facet=true",                             # turn faceting on
  "&facet.range=dateUploaded",               # summarize by range on dateUploaded
  "&facet.range.start=2001-10-01T00:00:00Z", # October 2001, the earliest upload date
  "&facet.range.end=NOW",                    # end the range with the current date
  "&facet.range.gap=%2B1MONTH",              # bin by month (%2B is a plus sign)
  "&facet.missing=true",                     # intended to include months with 0 uploads
  "&facet.limit=-1"                          # intended to leave the result size unlimited
)
# Assemble the complete query URL: CN root, service path, query, facet settings
uploads_by_month_url <- paste0(
  cn_base_url,
  cn_query_service,
  query,
  facet_settings
)
# Run the query by passing the URL to fromJSON() and keep the parsed response
response <- fromJSON(uploads_by_month_url)
# Descend through the parsed JSON, one exact-named level at a time, to the
# counts of the dateUploaded range facet.
# This results in an array of (month followed by count) pairs
uploads_by_month <-
  response[["facet_counts"]][["facet_ranges"]][["dateUploaded"]][["counts"]]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment