# carlosribas/get_ids.py

## get_ids.py
"""
Copyright [2009-present] EMBL-European Bioinformatics Institute
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
     http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

Usage: python get_ids.py [database]
Example: python get_ids.py rfam
"""
import itertools
import json
import math
import requests  # pip install requests
import sys


def get_query(database, start):
    """
    Assemble the query string for one page of metadata entries
    :param database: name of the Expert DB
    :param start: index of the first entry to fetch
    :return: query string (starting with "?") to append to the endpoint URL
    """
    # Each page asks for the job_id field of up to 100 metadata entries.
    params = [
        "query=entry_type:metadata%20AND%20database:" + database,
        "fields=job_id",
        "size=100",
        "start=" + str(start),
        "format=json",
    ]
    return "?" + "&".join(params)


def get_results(database):
    """
    Function to get the list of ids from a given database.

    The first request returns the total hit count together with the first
    page of entries; the remaining pages are then requested one by one,
    100 entries at a time (the page size hard-coded in get_query).

    :param database: name of the Expert DB
    :return: list holding one list of job ids per fetched page, or an
        empty list when the search reports no hits
    :raises requests.exceptions.RequestException: if a request times out or
        the service answers with an HTTP error status
    """
    endpoint = "https://wwwdev.ebi.ac.uk/ebisearch/ws/rest/rnacentral-litscan"
    page_size = 100  # keep in sync with the "size" value set by get_query

    def fetch_page(start):
        # A timeout stops the script from hanging forever on a stalled
        # connection; raise_for_status reports HTTP errors directly instead
        # of as a confusing JSON/KeyError failure further down.
        response = requests.get(endpoint + get_query(database, start), timeout=60)
        response.raise_for_status()
        return json.loads(response.text)

    def job_ids(page):
        return [entry["fields"]["job_id"][0] for entry in page["entries"]]

    first_page = fetch_page(0)
    hit_count = first_page["hitCount"]
    if hit_count <= 0:
        # Always hand back a list so callers never have to deal with None.
        return []

    # The first response is parsed once and reused for its entries.
    results = [job_ids(first_page)]

    # fetch other ids if needed
    for start in range(page_size, hit_count, page_size):
        results.append(job_ids(fetch_page(start)))

    return results


def main():
    """
    Command-line entry point.

    Reads the database name from the single command-line argument, fetches
    its job ids with get_results, removes duplicates, sorts them and writes
    them one per line to "<database>_ids.txt". Prints a message instead
    when no id is found.

    Exits with status 1 when the database argument is missing or empty, or
    when too many arguments are given.
    """
    arg_count = len(sys.argv)

    # sys.exit is used rather than the site-provided exit(), which is meant
    # for the interactive shell and is not guaranteed to exist; a non-zero
    # status lets calling scripts detect the usage error.
    if arg_count == 1 or (arg_count == 2 and not sys.argv[1]):
        print("You must specify the database")
        sys.exit(1)
    if arg_count != 2:
        print("Usage: python get_ids.py rfam")
        sys.exit(1)

    database = sys.argv[1]

    # get list of ids
    results = get_results(database)
    if not results:
        print("No id found for database {}".format(database))
        return

    # get_results returns one list per page: flatten, de-duplicate and sort
    unique_ids = sorted(set(itertools.chain.from_iterable(results)))

    # save results
    with open(database + '_ids.txt', 'w') as f:
        f.writelines(item + '\n' for item in unique_ids)


# Run the command-line entry point only when executed as a script, not on import.
if __name__ == "__main__":
    main()
	"""
	Copyright [2009-present] EMBL-European Bioinformatics Institute
	Licensed under the Apache License, Version 2.0 (the "License");
	you may not use this file except in compliance with the License.
	You may obtain a copy of the License at
	http://www.apache.org/licenses/LICENSE-2.0
	Unless required by applicable law or agreed to in writing, software
	distributed under the License is distributed on an "AS IS" BASIS,
	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	See the License for the specific language governing permissions and
	limitations under the License.

	Usage: python get_ids.py [database]
	Example: python get_ids.py rfam
	"""
	import itertools
	import json
	import math
	import requests # pip install requests
	import sys


	def get_query(database, start):
	    """
	    Function to build the query
	    :param database: name of the Expert DB
	    :param start: index of the first entry to fetch
	    :return: query string (starting with "?") to append to the endpoint URL
	    """
	    # Each page asks for the job_id field of up to 100 metadata entries.
	    query = "?query=entry_type:metadata%20AND%20database:" \
	            + database \
	            + "&fields=job_id&size=100&start=" \
	            + str(start) \
	            + "&format=json"
	    return query


	def get_results(database):
	    """
	    Function to get the list of ids from a given database.

	    The first request returns the total hit count together with the first
	    page of entries; the remaining pages are then requested one by one,
	    100 entries at a time (the page size hard-coded in get_query).

	    :param database: name of the Expert DB
	    :return: list holding one list of job ids per fetched page, or an
	        empty list when the search reports no hits
	    :raises requests.exceptions.RequestException: if a request times out or
	        the service answers with an HTTP error status
	    """
	    endpoint = "https://wwwdev.ebi.ac.uk/ebisearch/ws/rest/rnacentral-litscan"
	    page_size = 100  # keep in sync with the "size" value set by get_query

	    def fetch_page(start):
	        # A timeout stops the script from hanging forever on a stalled
	        # connection; raise_for_status reports HTTP errors directly instead
	        # of as a confusing JSON/KeyError failure further down.
	        response = requests.get(endpoint + get_query(database, start), timeout=60)
	        response.raise_for_status()
	        return json.loads(response.text)

	    def job_ids(page):
	        return [entry["fields"]["job_id"][0] for entry in page["entries"]]

	    first_page = fetch_page(0)
	    hit_count = first_page["hitCount"]
	    if hit_count <= 0:
	        # Always hand back a list so callers never have to deal with None.
	        return []

	    # The first response is parsed once and reused for its entries.
	    results = [job_ids(first_page)]

	    # fetch other ids if needed
	    for start in range(page_size, hit_count, page_size):
	        results.append(job_ids(fetch_page(start)))

	    return results


	def main():
	    """
	    Command-line entry point.

	    Reads the database name from the single command-line argument, fetches
	    its job ids with get_results, removes duplicates, sorts them and writes
	    them one per line to "<database>_ids.txt". Prints a message instead
	    when no id is found.

	    Exits with status 1 when the database argument is missing or empty, or
	    when too many arguments are given.
	    """
	    arg_count = len(sys.argv)

	    # sys.exit is used rather than the site-provided exit(), which is meant
	    # for the interactive shell and is not guaranteed to exist; a non-zero
	    # status lets calling scripts detect the usage error.
	    if arg_count == 1 or (arg_count == 2 and not sys.argv[1]):
	        print("You must specify the database")
	        sys.exit(1)
	    if arg_count != 2:
	        print("Usage: python get_ids.py rfam")
	        sys.exit(1)

	    database = sys.argv[1]

	    # get list of ids
	    results = get_results(database)
	    if not results:
	        print("No id found for database {}".format(database))
	        return

	    # get_results returns one list per page: flatten, de-duplicate and sort
	    unique_ids = sorted(set(itertools.chain.from_iterable(results)))

	    # save results
	    with open(database + '_ids.txt', 'w') as f:
	        f.writelines(item + '\n' for item in unique_ids)


	# Run the command-line entry point only when executed as a script, not on import.
	if __name__ == "__main__":
	    main()