Skip to content

Instantly share code, notes, and snippets.

@dinovski
Last active September 8, 2018 21:40
Show Gist options
  • Save dinovski/cb8539af2d2f3a78b7018dad20ab0ae4 to your computer and use it in GitHub Desktop.
Save dinovski/cb8539af2d2f3a78b7018dad20ab0ae4 to your computer and use it in GitHub Desktop.
get common variants from dbSNP
#/usr/bin/env/ python
usage="""
# get dbSNP file
wget --timestamping 'ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/snp147Common.txt.gz' -O snp147Common.txt.gz
python dbsnp_clean.py <snp147Common.txt> <snp147common_clean.bed>
"""
if len(sys.argv) != 3:
print usage
sys.exit()
import sys
infile = open(sys.argv[1],'r')
outfile = open(sys.argv[2],'w')
for line in infile:
record = line.split("\t")
## exclude indels
if not "single" in record[11]:
continue
chrom = record[1]
new_chrom = chrom.replace('chr','')
end = record[3]
start = int(end) - 1
rsid = record[4]
num_alleles = record[21]
alleles = record[22]
an = record[23]
af = record[24]
#for i in range(len(record)):
# print(i, record[i])
## get major allele based on AF
my_af = af.rstrip(",").split(",")
my_alleles = alleles.rstrip(",").split(",")
## get index for max AF
major_ind = my_af.index(max(my_af))
major_allele = my_alleles[major_ind]
major_freq = my_af[major_ind]
"""
if not len(my_alleles) == len(my_af):
print record[4]
print "ALLELES: %s" % my_alleles
print "allele length: %s" % len(my_alleles)
print "af length %s" % len(my_af)
print "freqs: %s" % my_af
#print "index: %s" % major_ind
#print "major: %s" % major_allele
#print "freq: %s" % major_freq
"""
new_record=new_chrom + "\t" + str(start) + "\t" + end + "\t" + rsid + "\t" + major_allele + "\t" + major_freq + "\n"
outfile.write(new_record)
infile.close()
outfile.close()
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment