# Using common libraries.
#
# Dependencies:
#   pandas jsonapi-client
# Install them from the command line, with e.g.
#   $ pip install pandas jsonapi-client

from jsonapi_client import Session
import pandas as pd

# See https://www.ebi.ac.uk/metagenomics/api/docs/ for endpoints and API documentation.
endpoint = 'super-studies'

with Session("https://www.ebi.ac.uk/metagenomics/api/v1") as mgnify:
    resources = map(lambda r: r.json, mgnify.iterate(endpoint))
    resources = pd.json_normalize(resources)
    resources.to_csv(f"{endpoint}.csv")
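If you only want a handful of the flattened columns in the CSV, you can select them from the DataFrame before saving. A minimal sketch, assuming pd.json_normalize flattens the JSON:API resource objects into dot-separated column names such as attributes.title (check the API docs for the exact attribute names of each endpoint):

from jsonapi_client import Session
import pandas as pd

endpoint = 'super-studies'
# The column names below are assumptions based on how json_normalize flattens
# JSON:API resources ({"id", "type", "attributes": {...}}); verify them against the API docs.
columns_of_interest = ['id', 'attributes.super-study-id', 'attributes.title']

with Session("https://www.ebi.ac.uk/metagenomics/api/v1") as mgnify:
    resources = pd.json_normalize(map(lambda r: r.json, mgnify.iterate(endpoint)))

resources[columns_of_interest].to_csv(f"{endpoint}-subset.csv", index=False)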
# Using Python 3 standard library only, with no extra python packages needed:
import urllib.request
import json
import csv

# See https://www.ebi.ac.uk/metagenomics/api/docs/ for endpoints and API documentation,
# including attributes you may want as CSV columns.
endpoint = 'super-studies'
attribute_columns = ["super-study-id", "title", "description"]

def get_page(url):
    next_url = url
    while next_url:
        with urllib.request.urlopen(next_url) as page:
            response = json.loads(page.read().decode())
            data = response['data']
            yield data
            next_url = response['links']['next']

with open(f"{endpoint}.csv", "w") as csv_file:
    c = csv.writer(csv_file)
    c.writerow(attribute_columns)
    for page in get_page(f"https://www.ebi.ac.uk/metagenomics/api/v1/{endpoint}"):
        for resource in page:
            c.writerow([resource['attributes'].get(col) for col in attribute_columns])
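While exploring an endpoint, it can help to look at just a page or two before committing to a full download. A small sketch reusing the get_page generator and endpoint defined above; itertools.islice simply stops the iteration after the requested number of pages:

import itertools

# Preview the first two pages of the endpoint without fetching everything.
preview_url = f"https://www.ebi.ac.uk/metagenomics/api/v1/{endpoint}"
for page in itertools.islice(get_page(preview_url), 2):
    for resource in page:
        print(resource['id'], resource['attributes'].get('title'))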
# Using common libraries.
#
# Dependencies:
#   pandas jsonapi-client
# Install them from the command line, with e.g.
#   $ pip install pandas jsonapi-client
#
# This example includes a Filter.
# You can explore the Filters available for each endpoint using the interactive API browser:
#   https://www.ebi.ac.uk/metagenomics/api/v1
#
# If you don't use any Filters, endpoints with many pages of data (like /samples) might fail.

from jsonapi_client import Session, Filter
import pandas as pd


class MGnifyFilter(Filter):
    def format_filter_query(self, **kwargs: 'FilterKeywords') -> str:
        """
        The MGnify API uses a slimmer syntax for filters than the JSON:API default.
        Filter keywords are not wrapped in the word "filter", like filter[foo]=bar,
        but are instead plain, like foo=bar.
        """
        def jsonify_key(key):
            return key.replace('__', '.').replace('_', '-')
        return '&'.join(f'{jsonify_key(key)}={value}'
                        for key, value in kwargs.items())


endpoint = 'samples'
filters = {
    'lineage': 'root:Engineered:Bioremediation'
}

with Session("https://www.ebi.ac.uk/metagenomics/api/v1/") as mgnify:
    resources = map(lambda r: r.json, mgnify.iterate(endpoint, MGnifyFilter(**filters)))
    resources = pd.json_normalize(resources)
    resources.to_csv(f"{endpoint}.csv")
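The same filter can also be applied without jsonapi-client: MGnify filters are plain query parameters, so they can simply be appended to the endpoint URL used in the standard-library example above. A minimal sketch; the resulting url can then be passed to the get_page generator from the second example:

import urllib.parse

endpoint = 'samples'
filters = {'lineage': 'root:Engineered:Bioremediation'}

# MGnify filters are ordinary query parameters, e.g. ...?lineage=root:Engineered:Bioremediation
url = f"https://www.ebi.ac.uk/metagenomics/api/v1/{endpoint}?{urllib.parse.urlencode(filters)}"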
`fetch_paginated_mgnify_data.py` works for me when I used `biomes` or `super-studies`, but fails with this error for endpoint `samples`.

@taylorreiter
This is likely because /samples returns a lot of pages of data. So, you have two strategies to pick from:

- Use Filters to limit the amount of data you're requesting. This is almost always what you want to do in a real use case. I've added a third example to this gist, showing how to do that.
- Throttle your requests to limit how quickly you're trying to pull data, if you really do need a lot of unfiltered data. I haven't included a copy-and-paste example of this here because it's usually not the most efficient approach (compared to limiting the data you request in the first place). You can achieve it using the `jsonapi_client` package, e.g. using `page = mgnify.fetch_document_by_url(f'{mgnify.url_prefix}/{endpoint}')`, and after a `time.sleep`, following `page.links.next` etc. A standard-library sketch of the same idea is shown below.

Please feel free to contact the MGnify helpdesk if you need a large query / dataset and aren't able to access it this way.
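For reference, a minimal sketch of that throttling idea using only the Python standard library (the same pagination pattern as the second example above). The one-second delay is an arbitrary illustrative value, not a figure recommended by MGnify:

import json
import time
import urllib.request

def get_pages_slowly(url, delay_seconds=1.0):
    """Yield one page of JSON:API data at a time, pausing between requests."""
    next_url = url
    while next_url:
        with urllib.request.urlopen(next_url) as page:
            response = json.loads(page.read().decode())
        yield response['data']
        next_url = response['links']['next']
        if next_url:
            time.sleep(delay_seconds)  # wait before requesting the next page

for page in get_pages_slowly("https://www.ebi.ac.uk/metagenomics/api/v1/samples"):
    pass  # process each page of sample resources here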
Thank you @SandyRogers! I did need the whole set of metadata, but your second gist worked to retrieve it. I had tried to use filters to download it piecemeal and couldn't figure it out, so the third example is a wonderful new resource.
This is the notebook I ended up with: https://github.com/taylorreiter/2022-sra-gather/blob/main/notebooks/20220318_fetch_paginated_data_ebi.ipynb
I really appreciate you making these gists available; they were instrumental in figuring out how to use the API!
Brilliant, thanks @taylorreiter for the feedback. I will add some more examples of fetching paginated data to our Jupyter Lab Notebooks as well :)