Last active
February 6, 2020 17:01
-
-
Save petermchale/208b0104510097b9766dd9837d090e13 to your computer and use it in GitHub Desktop.
Download "self chains"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import requests | |
import argparse | |
import sys | |
import gzip | |
import os | |
parser = argparse.ArgumentParser(description='fetch and parse self chain data from UCSC database') | |
parser.add_argument('--genome_build', type=str, help='e.g., "hg19" or "hg38"') | |
args = parser.parse_args() | |
gzip_file = 'self_chains.txt.gz' | |
if not os.path.exists(gzip_file): | |
url = "http://hgdownload.cse.ucsc.edu/goldenPath/" + args.genome_build + "/database/chainSelf.txt.gz" | |
response = requests.get(url) | |
if response.status_code == 200: | |
with open(gzip_file, 'wb') as f: | |
f.write(response.content) | |
else: | |
sys.exit('HTTP request not successful!') | |
for line in gzip.open(gzip_file, 'rt'): | |
_bin, score, tName, tSize, tStart, tEnd, qName, qSize, qStrand, qStart, qEnd, _id, normScore = \ | |
line.rstrip().split("\t") | |
# only consider regions that are very similar to one another | |
if float(normScore) < 90: continue | |
# not all query regions are also target regions and vice versa... | |
# ... so list both queries and targets | |
# ... and later remove duplicates | |
print("{}\t{}\t{}".format(tName, tStart, tEnd)) | |
print("{}\t{}\t{}".format(qName, qStart, qEnd)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
script=download_self_chains.py | |
genome_build=hg38 | |
bed=self_chains.sorted.bed.gz | |
if [[ ! -e $bed ]]; then | |
echo 'downloading, unzipping, bed-ifying self chain data....' | |
# extract relevant columns from database to make a valid bed file | |
./${script} --genome_build ${genome_build} | \ | |
sed 's/^chr//g' | \ | |
sort -k1,1 -k2,2n | \ | |
uniq | \ | |
bgzip --stdout > $bed | |
tabix --force $bed | |
fi |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment