Last active
August 21, 2024 11:05
-
-
Save odinokov/89085c9aa71c3f9afc06a1d4bc4fd33a to your computer and use it in GitHub Desktop.
The script fetches and prints genetic variants from the gnomAD database for a specified genomic region
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The script fetches and prints genetic variants from the gnomAD database for a specified genomic region.
# Adapted from https://gist.github.com/ressy/6fd7f6ee6401ac8e703dc2709399869e
import re | |
import sys | |
from pprint import PrettyPrinter | |
from typing import Any, Dict, Tuple | |
import fire | |
import requests | |
import tenacity | |
from loguru import logger | |
class GnomadClient:
    """
    Small client for the gnomAD API.

    Posts GraphQL queries to the configured endpoint (with fixed-delay retries)
    and offers a helper that lists the variants found in a genomic region.
    """

    def __init__(
        self,
        url: str = "https://gnomad.broadinstitute.org/api",
        dataset: str = "gnomad_r4",
        reference_genome: str = "GRCh38",
        max_retries: int = 3,
        retry_delay: int = 2,
        verbose: bool = False,
        timeout: float = 30.0,
    ):
        """
        Stores the connection settings and configures logging.
        Parameters:
        - url: API endpoint that queries are posted to
        - dataset: Dataset identifier interpolated into the query
        - reference_genome: Reference genome identifier interpolated into the query
        - max_retries: Maximum number of attempts per request
        - retry_delay: Delay between attempts in seconds
        - verbose: Log at DEBUG level instead of INFO
        - timeout: Seconds to wait for the server before an attempt fails
        """
        self.url = url
        self.dataset = dataset
        self.reference_genome = reference_genome
        self.max_retries = max_retries
        self.retry_delay = retry_delay
        self.timeout = timeout
        self._configure_logging(verbose)

    def _configure_logging(self, verbose: bool) -> None:
        """
        Configures logging based on verbosity level.
        Note: this replaces every handler registered on the shared loguru logger.
        """
        logger.remove()  # Remove the default logger configuration
        level = "DEBUG" if verbose else "INFO"
        # Add colorful logging output to the console
        logger.add(
            sys.stdout,
            level=level,
            colorize=True,
        )

    def fetch(self, jsondata: Dict[str, Any]) -> Dict[str, Any]:
        """
        Sends a POST request to the configured URL with the provided JSON data, with retry logic.
        Parameters:
        - jsondata: JSON-serialisable request payload
        Returns:
        - Decoded JSON response body.
        Raises:
        - The exception from the last attempt once all attempts have failed
          (HTTP/network errors from requests, or RuntimeError for errors
          reported inside the response body).
        """

        @tenacity.retry(
            wait=tenacity.wait_fixed(self.retry_delay),
            stop=tenacity.stop_after_attempt(self.max_retries),
            reraise=True,
        )
        def _make_request() -> Dict[str, Any]:
            logger.debug(f"Sending request to {self.url} with data: {jsondata}")
            response = requests.post(
                self.url,
                json=jsondata,
                headers={"Content-Type": "application/json"},
                # Without a timeout a stalled connection blocks forever and the
                # retry logic never gets a chance to run.
                timeout=self.timeout,
            )
            response.raise_for_status()
            return self._handle_response(response)

        return _make_request()

    def _handle_response(self, response: "requests.Response") -> Dict[str, Any]:
        """
        Handles the response, checking for errors in the JSON data.
        Returns:
        - Decoded JSON body when it has no "errors" key.
        Raises:
        - RuntimeError carrying the stringified "errors" entry otherwise.
        """
        response_json = response.json()
        if "errors" in response_json:
            raise RuntimeError(str(response_json["errors"]))
        return response_json

    @staticmethod
    def parse_region(region: str) -> Tuple[str, int, int]:
        """
        Parses a genomic region string into chromosome, start, and stop positions using regular expressions.
        Removes a leading "chr" prefix before parsing.
        Parameters:
        - region: Genomic region in the format 'chromosome:start-stop'
        Returns:
        - Tuple of (chromosome, start, stop) positions.
        Raises:
        - ValueError if the region does not match the expected format.
        """
        # str.lstrip("chr") would strip any leading run of the characters
        # 'c', 'h', 'r' (e.g. "rCRS" -> "CRS"); only drop the literal prefix.
        normalized = region[3:] if region.startswith("chr") else region
        match = re.match(
            r"^(?P<chromosome>[^:]+):(?P<start>\d+)-(?P<stop>\d+)$", normalized
        )
        if not match:
            raise ValueError(
                f"Invalid region format: '{region}'. Expected format 'chromosome:start-stop'."
            )
        return (
            match.group("chromosome"),
            int(match.group("start")),
            int(match.group("stop")),
        )

    def get_variant_list(self, region: str) -> Any:
        """
        Fetches the list of variants for a given genomic region.
        Parameters:
        - region: Genomic region in the format 'chromosome:start-stop'
        Returns:
        - The "variants" entry of the response for the specified region.
        Raises:
        - ValueError if the region string is malformed, plus anything raised by fetch().
        """
        chrom, start, stop = self.parse_region(region)
        # dataset and reference_genome are interpolated unquoted on purpose:
        # the query passes them as bare identifiers, not strings.
        query = f"""
        {{
          region(chrom: "{chrom}", start: {start}, stop: {stop}, reference_genome: {self.reference_genome}) {{
            variants(dataset: {self.dataset}) {{
              consequence
              pos
              rsid
              variant_id: variantId
              exome {{
                ac
                an
                af
              }}
              genome {{
                ac
                an
                af
              }}
              allele: alt
              reference: ref
            }}
          }}
        }}
        """
        response = self.fetch({"query": query, "variables": {}})
        return response["data"]["region"]["variants"]
def main(
    region: str,
    dataset: str = "gnomad_r4",
    reference_genome: str = "GRCh38",
    max_retries: int = 3,
    retry_delay: int = 2,
    verbose: bool = False,
):
    """
    Fetches and prints genetic variants from the gnomAD database for a specified genomic region.
    Parameters:
    - region: Genomic region in the format 'chromosome:start-stop'
    - dataset: Dataset name (default: "gnomad_r4")
    - reference_genome: Reference genome (default: "GRCh38")
    - max_retries: Maximum number of retry attempts (default: 3)
    - retry_delay: Delay between retries in seconds (default: 2)
    - verbose: Verbose output for debugging (default: False)
    """
    try:
        client = GnomadClient(
            dataset=dataset,
            reference_genome=reference_genome,
            max_retries=max_retries,
            retry_delay=retry_delay,
            verbose=verbose,
        )
        variants = client.get_variant_list(region)
        if not variants:
            logger.info(f"No variants found in region '{region}'.")
            return
        printer = PrettyPrinter(indent=2, width=40, depth=None, compact=False)
        # pprint() accepts exactly one object; unpacking the list into it only
        # worked when the region held a single variant. Print them one by one.
        for variant in variants:
            printer.pprint(variant)
    except Exception as e:
        # Top-level CLI boundary: report the failure instead of a traceback.
        logger.error(f"An error occurred: {e}")
if __name__ == "__main__":
    # Expose main() as a command-line interface; its parameters become flags.
    fire.Fire(component=main)
# https://gnomad.broadinstitute.org/region/12-67868595-67868595?dataset=gnomad_r4
# fetch_gnomad.py --region chr12:67868595-67868595
# { 'allele': 'T',
#   'consequence': 'intron_variant',
#   'exome': None,
#   'genome': { 'ac': 3,
#               'af': 1.974957538412924e-05,
#               'an': 151902},
#   'pos': 67868595,
#   'reference': 'C',
#   'rsid': 'rs945346526',
#   'variant_id': '12-67868595-C-T'}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment