Skip to content

Instantly share code, notes, and snippets.

@carlosribas
Last active February 21, 2023 12:06
Show Gist options
  • Save carlosribas/b687b5624ba47eb1c68a0c084ea52efa to your computer and use it in GitHub Desktop.
Save carlosribas/b687b5624ba47eb1c68a0c084ea52efa to your computer and use it in GitHub Desktop.
"""
Copyright [2009-present] EMBL-European Bioinformatics Institute
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
Usage: python get_ids.py <database>
Example: python get_ids.py rfam
"""
import itertools
import json
import math
import requests # pip install requests
import sys
def get_query(database, start):
    """
    Function to build the query for one page of job ids.

    The query selects entries whose entry_type is "metadata" and whose
    database matches the given name, asks only for the job_id field,
    requests 100 entries per page and JSON output.

    :param database: name of the Expert DB (string)
    :param start: index of the first entry to fetch
    :return: query string, beginning with "?", to append to the endpoint
    """
    from urllib.parse import quote

    # Percent-encode the database name so characters such as "&", "#", "="
    # or spaces cannot break the URL or inject extra parameters. Plain
    # alphanumeric names (e.g. "rfam") pass through unchanged.
    return (
        "?query=entry_type:metadata%20AND%20database:"
        + quote(database, safe="")
        + "&fields=job_id&size=100&start="
        + str(start)
        + "&format=json"
    )
def _fetch_job_ids(endpoint, database, start, timeout):
    """
    Fetch one page of search results and extract its job ids.

    :param endpoint: base URL of the search service
    :param database: name of the Expert DB
    :param start: index of the first entry of the page
    :param timeout: seconds to wait for the server before giving up
    :return: tuple (hit_count, list of job ids found on this page)
    :raises requests.HTTPError: if the server answers with an error status
    """
    response = requests.get(endpoint + get_query(database, start), timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    payload = response.json()
    job_ids = [
        entry["fields"]["job_id"][0] for entry in payload.get("entries", [])
    ]
    return payload["hitCount"], job_ids


def get_results(database, timeout=60):
    """
    Function to get the list of ids from a given database

    Pages through the search results 100 entries at a time (the page size
    hard-coded in get_query) until every hit has been fetched.

    :param database: name of the Expert DB
    :param timeout: seconds to wait for each HTTP request (default 60);
        without it a stalled connection would block forever
    :return: results, a list of lists of job ids with one inner list per
        fetched page; an empty list when the database has no hits
    :raises requests.HTTPError: if any request returns an error status
    """
    endpoint = "https://wwwdev.ebi.ac.uk/ebisearch/ws/rest/rnacentral-litscan"
    page_size = 100  # must match the "size" value used by get_query

    # The first page also reports how many hits exist in total.
    hit_count, first_page_ids = _fetch_job_ids(endpoint, database, 0, timeout)
    if hit_count <= 0:
        return []

    results = [first_page_ids]
    # fetch other ids if needed: one request per remaining page
    for start in range(page_size, hit_count, page_size):
        _, page_ids = _fetch_job_ids(endpoint, database, start, timeout)
        results.append(page_ids)
    return results
def main():
    """
    Command-line entry point: save the ids of one Expert DB to a file.

    Expects exactly one command-line argument, the database name. The ids
    returned by get_results are de-duplicated, sorted and written one per
    line to "<database>_ids.txt" in the current working directory. When no
    id is found a message is printed and no file is written.

    Exits with a non-zero status and a message on stderr when the
    arguments are missing, empty or too many.
    """
    # sys.exit(<str>) prints the message to stderr and exits with status 1,
    # so shell callers can detect the failure.
    if len(sys.argv) == 1:
        sys.exit("You must specify the database")
    if len(sys.argv) != 2:
        sys.exit("Usage: python get_ids.py rfam")

    database = sys.argv[1]
    if not database:
        # An empty-string argument is as good as no argument at all.
        sys.exit("You must specify the database")

    # get list of ids: get_results yields one list per page, so flatten,
    # drop duplicates and sort before saving.
    pages = get_results(database)
    ids = sorted(set(itertools.chain.from_iterable(pages)))
    if not ids:
        print("No id found for database {}".format(database))
        return

    # save results
    with open(database + '_ids.txt', 'w') as f:
        f.writelines(item + '\n' for item in ids)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment