alaniwi/get_ids_from_solr.py

## get_ids_from_solr.py
"""
Gets a list of dataset IDs from the index node via the ESGF search.
"""

import requests

class GetSolrIDs(object):
    """
    Collects dataset IDs for one data node from an ESGF search endpoint.

    Records of type ``Dataset`` are requested page by page and the
    ``instance_id`` field of every record is returned.
    """

    def __init__(self,
                 url='https://c3s-models-index.ceda.ac.uk/esg-search/search/',
                 data_node='data.mips.copernicus-climate.eu',
                 verbose=False,
                 timeout=120):
        """
        url: search endpoint that the queries are sent to.
        data_node: value sent as the ``data_node`` query parameter.
        verbose: if true, print the start offset before each query.
        timeout: passed to ``requests.get`` as its timeout (seconds);
            None waits indefinitely.
        """
        self.url = url
        self.data_node = data_node
        self.verbose = verbose
        self.timeout = timeout

    def _get_ids_page(self, start, count):
        """
        Returns the IDs found in a single page of search results.

        start: value sent as the ``offset`` query parameter.
        count: value sent as the ``limit`` query parameter.

        Raises requests.HTTPError if the response has an error status.
        """
        if self.verbose:
            print("querying with start={}".format(start))

        field = 'instance_id'

        params = {'offset': start,
                  'limit': count,
                  'type': 'Dataset',
                  'data_node': self.data_node,
                  'format': 'application/solr+json',
                  'fields': field}

        # Without a timeout an unresponsive server would block forever.
        resp = requests.get(self.url, params=params, timeout=self.timeout)
        # Report an HTTP error status as such, rather than as a confusing
        # JSON decoding or missing-key error further down.
        resp.raise_for_status()
        content = resp.json()
        return [doc[field] for doc in content["response"]["docs"]]

    def get_ids(self, num_per_request=10000):
        """
        Returns a list of all IDs, fetched page by page until a page
        comes back empty.

        num_per_request: number of records asked for in each query.
        """
        all_ids = []
        start = 0
        while True:
            ids = self._get_ids_page(start, num_per_request)
            if not ids:
                return all_ids
            all_ids.extend(ids)
            # Advance by the number of records actually received: if a
            # page is shorter than requested (for example because the
            # server caps the page size), stepping by the requested size
            # would skip over records.
            start += len(ids)


def main():
    """
    Prints every dataset ID obtained with the default settings, one per
    line, in sorted order.
    """
    dataset_ids = GetSolrIDs().get_ids()
    for dataset_id in sorted(dataset_ids):
        print(dataset_id)


# Only run the listing when executed as a script, not when imported.
if __name__ == '__main__':
    main()
	"""
	Gets a list of dataset IDs from the index node via the ESGF search.
	"""

	import requests

	class GetSolrIDs(object):
	    """
	    Collects dataset IDs for one data node from an ESGF search endpoint.

	    Records of type ``Dataset`` are requested page by page and the
	    ``instance_id`` field of every record is returned.
	    """

	    def __init__(self,
	                 url='https://c3s-models-index.ceda.ac.uk/esg-search/search/',
	                 data_node='data.mips.copernicus-climate.eu',
	                 verbose=False,
	                 timeout=120):
	        """
	        url: search endpoint that the queries are sent to.
	        data_node: value sent as the ``data_node`` query parameter.
	        verbose: if true, print the start offset before each query.
	        timeout: passed to ``requests.get`` as its timeout (seconds);
	            None waits indefinitely.
	        """
	        self.url = url
	        self.data_node = data_node
	        self.verbose = verbose
	        self.timeout = timeout

	    def _get_ids_page(self, start, count):
	        """
	        Returns the IDs found in a single page of search results.

	        start: value sent as the ``offset`` query parameter.
	        count: value sent as the ``limit`` query parameter.

	        Raises requests.HTTPError if the response has an error status.
	        """
	        if self.verbose:
	            print("querying with start={}".format(start))

	        field = 'instance_id'

	        params = {'offset': start,
	                  'limit': count,
	                  'type': 'Dataset',
	                  'data_node': self.data_node,
	                  'format': 'application/solr+json',
	                  'fields': field}

	        # Without a timeout an unresponsive server would block forever.
	        resp = requests.get(self.url, params=params, timeout=self.timeout)
	        # Report an HTTP error status as such, rather than as a confusing
	        # JSON decoding or missing-key error further down.
	        resp.raise_for_status()
	        content = resp.json()
	        return [doc[field] for doc in content["response"]["docs"]]

	    def get_ids(self, num_per_request=10000):
	        """
	        Returns a list of all IDs, fetched page by page until a page
	        comes back empty.

	        num_per_request: number of records asked for in each query.
	        """
	        all_ids = []
	        start = 0
	        while True:
	            ids = self._get_ids_page(start, num_per_request)
	            if not ids:
	                return all_ids
	            all_ids.extend(ids)
	            # Advance by the number of records actually received: if a
	            # page is shorter than requested (for example because the
	            # server caps the page size), stepping by the requested size
	            # would skip over records.
	            start += len(ids)


	def main():
	    """
	    Prints every dataset ID obtained with the default settings, one per
	    line, in sorted order.
	    """
	    dataset_ids = GetSolrIDs().get_ids()
	    for dataset_id in sorted(dataset_ids):
	        print(dataset_id)


	# Script entry point: call main() when run directly rather than imported.
	# NOTE(review): the call below appears to have lost its indentation under
	# the `if` -- confirm and restore before relying on this copy.
	if __name__ == '__main__':
	main()