# csjx/dataone-node-operation-dates.R

## dataone-node-operation-dates.R
# Query the DataONE Node Registry to find dates that Member Nodes began operation

# The DataONE Node registry service returns results as XML
library(httr)
library(xml2)

# The base URL for the DataONE Coordinating Node
cn_base_url <- "https://cn.dataone.org/cn"

# The node registry endpoint
cn_node_service <- "/v2/node"

nodes_url <- paste0(cn_base_url, cn_node_service)

# Execute the query. stop_for_status() turns an HTTP error response
# (4xx/5xx) into an R error, so an error body is never handed to the
# XML parser as if it were a node list.
request <- GET(nodes_url)
stop_for_status(request)
node_list <- read_xml(content(request, as = "text", encoding = "UTF-8"))

# Filter the XML node list:
# - for Member Nodes only (type = mn)
# - with identifiers that don't start with 'urn:node:mn' (D1 replica nodes)
# - with a CN_date_operational property
# The <node> elements in the sample response shown below carry no namespace
# prefix, so the XPath addresses them by their plain names.
nodes <- xml_find_all(
  node_list,
  paste0(
    "//node[@type='mn' and not(starts-with(identifier, 'urn:node:mn'))]",
    "/property[@key='CN_date_operational']"
  )
)

# Construct a node date list of just the text values from the filtered XML
# (one character string per matching <property> element)
dates <- xml_text(nodes)


# FYI: the node_list XML above looks like this (some elements removed for brevity):
# <?xml version="1.0" encoding="UTF-8"?>
# <ns3:nodeList xmlns:ns2="http://ns.dataone.org/service/types/v1"
#               xmlns:ns3="http://ns.dataone.org/service/types/v2.0">
#     <node replicate="false" synchronize="true" type="mn" state="up">
#       <identifier>urn:node:RW</identifier>
#       <name>Research Workspace</name>
#       <description>
#         The Research Workspace is a web-based, scientific data management
#         platform that allows researchers to store and share their data.
#       </description>
#       <baseURL>https://dataone.researchworkspace.com/mn</baseURL>
#       <property key="CN_date_operational">2017-07-25T00:00:0.000Z</property>
#     </node>
#     ...
# </ns3:nodeList>

## dataone-upload-counts-by-month.R
# Query the DataONE Index service to find uploads by month

# The DataONE index server can return results in CSV format,
# but not for faceted (summarized) results, so use the jsonlite
# library to parse the result
library(jsonlite)

# Root URL of the DataONE Coordinating Node
cn_base_url <- "https://cn.dataone.org/cn"

# Path of the Solr query service; it ends in "?" so the query
# parameters below can be appended directly
cn_query_service <- "/v2/query/solr/?"

# Search criteria and response format:
# - q: keep only METADATA or DATA format types, and drop records that
#   have an obsoletedBy value (i.e. previous versions)
# - rows=0: return no raw result documents
# - omitHeader=true: leave out the query response header (simpler result)
# - wt=json: respond in JSON
query <- paste0(
  "q=formatType:(METADATA+OR+DATA)",
  "+-obsoletedBy:*",
  "&rows=0",
  "&omitHeader=true",
  "&wt=json"
)

# Summarization (faceting) settings:
# - facet=true: switch faceting on
# - facet.range: build the ranges over the dateUploaded field
# - facet.range.start: October, 2001 (the earliest upload date)
# - facet.range.end: the current date
# - facet.range.gap: one bin per month (%2B is a URL-encoded plus sign)
# - facet.missing=true: originally described as "include months with 0
#   uploads". NOTE(review): in Solr this flag asks for a count of records
#   with no value for the field; empty months are presumably governed by
#   facet.mincount instead -- confirm against the Solr faceting docs
# - facet.limit=-1: don't limit the size of the results returned
facet_settings <- paste0(
  "&facet=true",
  "&facet.range=dateUploaded",
  "&facet.range.start=2001-10-01T00:00:00Z",
  "&facet.range.end=NOW",
  "&facet.range.gap=%2B1MONTH",
  "&facet.missing=true",
  "&facet.limit=-1"
)

# Assemble the full request URL from the pieces above
uploads_by_month_url <- paste0(
  cn_base_url,
  cn_query_service,
  query,
  facet_settings
)

# Fetch the URL and parse the JSON response in one step
response <- fromJSON(uploads_by_month_url)

# Walk down the parsed structure to the range-facet counts for dateUploaded.
# This results in an array of (month followed by count) pairs
uploads_by_month <- response$facet_counts$facet_ranges$dateUploaded$counts
	# Query the DataONE Node Registry to find dates that Member Nodes began operation

	# The DataONE Node registry service returns results as XML
	library(httr)
	library(xml2)

	# NOTE(review): this tab-indented section repeats the node registry query
	# that appears earlier in this file; running both issues the request twice.

	# The base URL for the DataONE Coordinating Node
	cn_base_url <- "https://cn.dataone.org/cn"

	# The node registry endpoint
	cn_node_service <- "/v2/node"

	nodes_url <- paste0(cn_base_url, cn_node_service)

	# Execute the query. stop_for_status() turns an HTTP error response
	# (4xx/5xx) into an R error, so an error body is never handed to the
	# XML parser as if it were a node list.
	request <- GET(nodes_url)
	stop_for_status(request)
	node_list <- read_xml(content(request, as = "text", encoding = "UTF-8"))

	# Filter the XML node list:
	# - for Member Nodes only (type = mn)
	# - with identifiers that don't start with 'urn:node:mn' (D1 replica nodes)
	# - with a CN_date_operational property
	# The <node> elements in the sample response shown below carry no namespace
	# prefix, so the XPath addresses them by their plain names.
	nodes <- xml_find_all(
	  node_list,
	  paste0(
	    "//node[@type='mn' and not(starts-with(identifier, 'urn:node:mn'))]",
	    "/property[@key='CN_date_operational']"
	  )
	)

	# Construct a node date list of just the text values from the filtered XML
	# (one character string per matching <property> element)
	dates <- xml_text(nodes)



	# FYI: the node_list XML above looks like this (some elements removed for brevity):
	# <?xml version="1.0" encoding="UTF-8"?>
	# <ns3:nodeList xmlns:ns2="http://ns.dataone.org/service/types/v1"
	# xmlns:ns3="http://ns.dataone.org/service/types/v2.0">
	# <node replicate="false" synchronize="true" type="mn" state="up">
	# <identifier>urn:node:RW</identifier>
	# <name>Research Workspace</name>
	# <description>
	# The Research Workspace is a web-based, scientific data management
	# platform that allows researchers to store and share their data.
	# </description>
	# <baseURL>https://dataone.researchworkspace.com/mn</baseURL>
	# <property key="CN_date_operational">2017-07-25T00:00:0.000Z</property>
	# </node>
	# ...
	# </ns3:nodeList>
	# Query the DataONE Index service to find uploads by month

	# The DataONE index server can return results in CSV format,
	# but not for faceted (summarized) results, so use the jsonlite
	# library to parse the result
	library(jsonlite)

	# NOTE(review): this tab-indented section repeats the upload-count query
	# that appears earlier in this file.

	# Root URL of the DataONE Coordinating Node
	cn_base_url <- "https://cn.dataone.org/cn"

	# Path of the Solr query service; it ends in "?" so the query
	# parameters below can be appended directly
	cn_query_service <- "/v2/query/solr/?"

	# Search criteria and response format:
	# - q: keep only METADATA or DATA format types, and drop records that
	#   have an obsoletedBy value (i.e. previous versions)
	# - rows=0: return no raw result documents
	# - omitHeader=true: leave out the query response header (simpler result)
	# - wt=json: respond in JSON
	query <- paste0(
	  "q=formatType:(METADATA+OR+DATA)",
	  "+-obsoletedBy:*",
	  "&rows=0",
	  "&omitHeader=true",
	  "&wt=json"
	)

	# Summarization (faceting) settings:
	# - facet=true: switch faceting on
	# - facet.range: build the ranges over the dateUploaded field
	# - facet.range.start: October, 2001 (the earliest upload date)
	# - facet.range.end: the current date
	# - facet.range.gap: one bin per month (%2B is a URL-encoded plus sign)
	# - facet.missing=true: originally described as "include months with 0
	#   uploads". NOTE(review): in Solr this flag asks for a count of records
	#   with no value for the field; empty months are presumably governed by
	#   facet.mincount instead -- confirm against the Solr faceting docs
	# - facet.limit=-1: don't limit the size of the results returned
	facet_settings <- paste0(
	  "&facet=true",
	  "&facet.range=dateUploaded",
	  "&facet.range.start=2001-10-01T00:00:00Z",
	  "&facet.range.end=NOW",
	  "&facet.range.gap=%2B1MONTH",
	  "&facet.missing=true",
	  "&facet.limit=-1"
	)

	# Assemble the full request URL from the pieces above
	uploads_by_month_url <- paste0(
	  cn_base_url,
	  cn_query_service,
	  query,
	  facet_settings
	)

	# Fetch the URL and parse the JSON response in one step
	response <- fromJSON(uploads_by_month_url)

	# Walk down the parsed structure to the range-facet counts for dateUploaded.
	# This results in an array of (month followed by count) pairs
	uploads_by_month <- response$facet_counts$facet_ranges$dateUploaded$counts