Skip to content

Instantly share code, notes, and snippets.

@alaniwi
Created February 12, 2019 06:49
Show Gist options
  • Save alaniwi/6c9354145929f3ab5ef0c9324f29a62e to your computer and use it in GitHub Desktop.
Save alaniwi/6c9354145929f3ab5ef0c9324f29a62e to your computer and use it in GitHub Desktop.
Gets a list of dataset IDs and projects from the index node via the ESGF search.
"""
Gets a list of dataset IDs and projects from the index node via the ESGF search.
"""
import requests
class GetSolrIDs(object):
    """Retrieve selected fields of dataset records from an ESGF search URL.

    Records are fetched page by page with HTTP GET requests, restricted to
    ``type=Dataset`` and to a single data node.
    """

    def __init__(self,
                 url='https://c3s-models-index.ceda.ac.uk/esg-search/search/',
                 data_node='data.mips.copernicus-climate.eu',
                 verbose=False,
                 timeout=None):
        """Store the query settings.

        Args:
            url: Search endpoint that the GET requests are sent to.
            data_node: Value sent as the ``data_node`` query parameter.
            verbose: If true, print the offset before each request.
            timeout: Passed through to ``requests.get``; ``None`` (the
                default) waits indefinitely, as before.
        """
        self.url = url
        self.data_node = data_node
        self.verbose = verbose
        self.timeout = timeout

    def _get_fields_page(self, fields, start, count):
        """Fetch one page of results.

        Args:
            fields: Sequence of field names to request for each document.
            start: Offset of the first document to return.
            count: Maximum number of documents requested.

        Returns:
            A list with one tuple per returned document, holding the values
            of ``fields`` in order. A field absent from a document is
            represented by ``None``.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        if self.verbose:
            print("querying with start={}".format(start))
        params = {'offset': start,
                  'limit': count,
                  'type': 'Dataset',
                  'data_node': self.data_node,
                  'format': 'application/solr+json',
                  'fields': ','.join(fields)}
        resp = requests.get(self.url, params=params, timeout=self.timeout)
        # Fail loudly on 4xx/5xx instead of tripping over an error body below.
        resp.raise_for_status()
        docs = resp.json()["response"]["docs"]
        return [tuple(doc.get(f) for f in fields) for doc in docs]

    def get_fields(self, fields, num_per_request=10000):
        """Fetch the requested fields for every matching document.

        Pages are requested until an empty page comes back.

        Args:
            fields: Sequence of field names to request for each document.
            num_per_request: Page size asked for in each request.

        Returns:
            A list of tuples, one per document, as produced by
            ``_get_fields_page``.
        """
        all_rows = []
        start = 0
        while True:
            rows = self._get_fields_page(fields, start, num_per_request)
            if not rows:
                return all_rows
            all_rows.extend(rows)
            # Advance by what was actually received: the server may return
            # fewer documents than requested (e.g. if it enforces a maximum
            # page size), and stepping by num_per_request would skip records.
            start += len(rows)
def main():
    """Print the ``instance_id`` and ``project`` of every dataset, sorted."""
    getter = GetSolrIDs()
    rows = getter.get_fields(['instance_id', 'project'])

    def sort_key(row):
        # A missing field comes back as None, which cannot be compared with
        # str on Python 3. Pairing each value with an "is None" flag keeps the
        # usual order for present values and sorts missing ones last.
        return tuple((value is None, value) for value in row)

    # Named instance_id rather than id to avoid shadowing the builtin.
    for instance_id, project in sorted(rows, key=sort_key):
        print(instance_id, project)


if __name__ == '__main__':
    main()
@alaniwi
Copy link
Author

alaniwi commented Feb 12, 2019

adapted from https://gist.github.com/alaniwi/f5ab883275ca416fd2bd69ada01e4334

This version retrieves a specified list of fields for each document, rather than just a single field.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment