# alaniwi/get_fields_from_solr.py

## get_fields_from_solr.py
"""
Gets a list of dataset IDs and projects from the index node via the ESGF search.
"""

import requests

class GetSolrIDs(object):
    """Fetch selected fields of Dataset records from an ESGF search endpoint.

    Records are requested page by page over HTTP, restricted to one data
    node, and returned as tuples of field values.
    """

    def __init__(self,
                 url='https://c3s-models-index.ceda.ac.uk/esg-search/search/',
                 data_node='data.mips.copernicus-climate.eu',
                 verbose=False,
                 timeout=None):
        """Store the query settings; no request is made here.

        Args:
            url: address of the search endpoint to query.
            data_node: value sent as the ``data_node`` query parameter.
            verbose: if true, print the offset before each page request.
            timeout: forwarded to ``requests.get``; the default of None
                keeps the previous behaviour of waiting indefinitely.
        """
        self.url = url
        self.data_node = data_node
        self.verbose = verbose
        self.timeout = timeout

    def _get_fields_page(self, fields, start, count):
        """Fetch a single page of results.

        Args:
            fields: list of field names to request for each record.
            start: value sent as the ``offset`` query parameter.
            count: value sent as the ``limit`` query parameter.

        Returns:
            A list with one tuple per returned document, holding the values
            of ``fields`` in order (None where a document lacks the field).

        Raises:
            requests.HTTPError: if the server answers with an error status.
        """
        if self.verbose:
            print("querying with start={}".format(start))

        params = {'offset': start,
                  'limit': count,
                  'type': 'Dataset',
                  'data_node': self.data_node,
                  'format': 'application/solr+json',
                  'fields': ','.join(fields)}

        resp = requests.get(self.url, params=params, timeout=self.timeout)
        # Report an HTTP failure as such, instead of letting it surface later
        # as a JSON decoding error or a KeyError on the response body.
        resp.raise_for_status()
        docs = resp.json()["response"]["docs"]
        return [tuple(doc.get(f) for f in fields) for doc in docs]

    def get_fields(self, fields, num_per_request=10000):
        """Fetch the requested fields for every matching record.

        Pages are requested until one comes back empty.

        Args:
            fields: list of field names to request for each record.
            num_per_request: page size asked for in each request.

        Returns:
            A list of tuples, one per record, in the order received.
        """
        all_ids = []
        start = 0
        while True:
            ids = self._get_fields_page(fields, start, num_per_request)
            if not ids:
                return all_ids
            all_ids.extend(ids)
            # Advance by what was actually received: if a page is shorter than
            # requested, stepping by num_per_request would skip records.
            start += len(ids)


def main():
    """Query with the default settings and print the sorted results.

    Each output line holds one record's ``instance_id`` and ``project``.
    """
    records = GetSolrIDs().get_fields(['instance_id', 'project'])
    records.sort()

    for instance_id, project in records:
        print(instance_id, project)


# Run the query only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
	"""
	Gets a list of dataset IDs and projects from the index node via the ESGF search.
	"""

	import requests

	class GetSolrIDs(object):
	    """Fetch selected fields of Dataset records from an ESGF search endpoint.

	    Records are requested page by page over HTTP, restricted to one data
	    node, and returned as tuples of field values.
	    """

	    def __init__(self,
	                 url='https://c3s-models-index.ceda.ac.uk/esg-search/search/',
	                 data_node='data.mips.copernicus-climate.eu',
	                 verbose=False):
	        """Store the query settings; no request is made here.

	        Args:
	            url: address of the search endpoint to query.
	            data_node: value sent as the ``data_node`` query parameter.
	            verbose: if true, print the offset before each page request.
	        """
	        self.url = url
	        self.data_node = data_node
	        self.verbose = verbose

	    def _get_fields_page(self, fields, start, count):
	        """Fetch a single page of results.

	        Args:
	            fields: list of field names to request for each record.
	            start: value sent as the ``offset`` query parameter.
	            count: value sent as the ``limit`` query parameter.

	        Returns:
	            A list with one tuple per returned document, holding the values
	            of ``fields`` in order (None where a document lacks the field).

	        Raises:
	            requests.HTTPError: if the server answers with an error status.
	        """
	        if self.verbose:
	            print("querying with start={}".format(start))

	        params = {'offset': start,
	                  'limit': count,
	                  'type': 'Dataset',
	                  'data_node': self.data_node,
	                  'format': 'application/solr+json',
	                  'fields': ','.join(fields)}

	        resp = requests.get(self.url, params=params)
	        # Report an HTTP failure as such, instead of letting it surface later
	        # as a JSON decoding error or a KeyError on the response body.
	        resp.raise_for_status()
	        docs = resp.json()["response"]["docs"]
	        return [tuple(doc.get(f) for f in fields) for doc in docs]

	    def get_fields(self, fields, num_per_request=10000):
	        """Fetch the requested fields for every matching record.

	        Pages are requested until one comes back empty.

	        Args:
	            fields: list of field names to request for each record.
	            num_per_request: page size asked for in each request.

	        Returns:
	            A list of tuples, one per record, in the order received.
	        """
	        all_ids = []
	        start = 0
	        while True:
	            ids = self._get_fields_page(fields, start, num_per_request)
	            if not ids:
	                return all_ids
	            all_ids.extend(ids)
	            # Advance by what was actually received: if a page is shorter than
	            # requested, stepping by num_per_request would skip records.
	            start += len(ids)


	def main():
	    """Query with the default settings and print the sorted results.

	    Each output line holds one record's ``instance_id`` and ``project``.
	    """
	    getter = GetSolrIDs()

	    fields = getter.get_fields(['instance_id', 'project'])

	    # Named instance_id rather than id to avoid shadowing the builtin.
	    for instance_id, project in sorted(fields):
	        print(instance_id, project)


	# Run the query only when this file is executed as a script, not on import.
	if __name__ == '__main__':
	    main()