Created
February 12, 2019 06:49
-
-
Save alaniwi/6c9354145929f3ab5ef0c9324f29a62e to your computer and use it in GitHub Desktop.
Gets a list of dataset IDs and projects from the index node via the ESGF search.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Gets a list of dataset IDs and projects from the index node via the ESGF search.
""" | |
import requests | |
class GetSolrIDs(object):
    """Retrieve selected fields of the Dataset records held for one data node.

    Records are fetched page by page from an ESGF search endpoint, asking
    for the Solr JSON response format.
    """

    def __init__(self,
                 url='https://c3s-models-index.ceda.ac.uk/esg-search/search/',
                 data_node='data.mips.copernicus-climate.eu',
                 verbose=False,
                 timeout=None):
        """Store the query settings; no request is made here.

        Args:
            url: Search endpoint that every page request is sent to.
            data_node: Value sent as the ``data_node`` query parameter.
            verbose: When true, print the offset before each page request.
            timeout: Passed straight to ``requests.get``. The default
                ``None`` keeps the previous behaviour of waiting without
                a limit.
        """
        self.url = url
        self.data_node = data_node
        self.verbose = verbose
        self.timeout = timeout

    def _get_fields_page(self, fields, start, count):
        """Fetch one page of results.

        Args:
            fields: Names of the document fields to extract.
            start: Offset of the first record to request.
            count: Maximum number of records to request.

        Returns:
            A list with one tuple per returned document, holding the
            values of ``fields`` in the given order. A field that is absent
            from a document is represented by ``None``.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        if self.verbose:
            print("querying with start={}".format(start))
        params = {'offset': start,
                  'limit': count,
                  'type': 'Dataset',
                  'data_node': self.data_node,
                  'format': 'application/solr+json',
                  'fields': ','.join(fields)}
        resp = requests.get(self.url, params=params, timeout=self.timeout)
        # Fail with a clear HTTP error rather than a confusing JSON/KeyError
        # further down when the server rejects the query.
        resp.raise_for_status()
        content = resp.json()
        return [tuple(doc.get(f) for f in fields)
                for doc in content["response"]["docs"]]

    def get_fields(self, fields, num_per_request=10000):
        """Collect the requested fields for every matching record.

        Pages are requested until an empty page comes back.

        Args:
            fields: Names of the document fields to extract.
            num_per_request: Page size requested from the server.

        Returns:
            A list of tuples, one per record, in the order received.
        """
        all_ids = []
        start = 0
        while True:
            ids = self._get_fields_page(fields, start, num_per_request)
            if not ids:
                return all_ids
            all_ids.extend(ids)
            # Advance by what was actually received: if the server returns
            # fewer rows than requested, stepping by num_per_request would
            # silently skip the records in between.
            start += len(ids)
def main():
    """Print the instance ID and project of every dataset, sorted."""
    rows = GetSolrIDs().get_fields(['instance_id', 'project'])
    rows.sort()
    for instance_id, project in rows:
        print(instance_id, project)


# Run the listing only when executed as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Adapted from https://gist.github.com/alaniwi/f5ab883275ca416fd2bd69ada01e4334
This version retrieves a caller-specified list of fields per document rather than a single field.