-
-
Save brentp/e8b400bf8bfb297c8d49f9673e32d2e4 to your computer and use it in GitHub Desktop.
blog gists
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
from collections import Counter | |
import scipy.stats as ss | |
import cyvcf2 | |
vcf = cyvcf2.VCF(sys.argv[1], gts012=True) | |
o = cyvcf2.Writer("depleted.vcf", vcf) | |
genome = cyvcf2.VCF('gnomad.genomes.r2.0.2.sites.chrX.vcf.bgz') | |
n = 0 | |
nsig = 0 | |
genes = [] | |
for v in vcf("X:2781479-155701382"): | |
if v.FILTER is not None: continue | |
if v.INFO.get("FS", 0) > 10: continue | |
try: | |
gcm = v.INFO["GC_Male"] | |
gcf = v.INFO["GC_Female"] | |
except KeyError: | |
continue | |
if v.INFO.get('OLD_MULTIALLELIC'): continue | |
n += 1 | |
# if any males have hom-alt or no females are het, continue | |
if gcm[-1] > 0: continue | |
if gcf[-1] != 0: continue | |
#print gcm | |
#print gcf, gcm | |
#female_af = gcf[1] | |
# NOTE: these should be removed for unbiased test, but we know that we need these contrainsts for significance for gnomad | |
if gcf[1] < 15: continue | |
if gcf[1] < gcm[1]: continue | |
if gcm[1] > 2: continue | |
success_prob = gcf[1] / float(v.INFO["AN_Female"]) | |
p = ss.binom_test(0, v.INFO["AN_Male"], p=success_prob) | |
if p > 1e-8: | |
continue | |
variant_in_genome = False | |
in_males_in_genome = False | |
filtered_in_genome = False | |
for gv in genome('X:%d-%d' % (v.start, v.end)): | |
if gv.start != v.start or gv.end != v.end or gv.REF != v.REF: continue | |
variant_in_genome = True | |
gvm = gv.INFO["GC_Male"] | |
if gv.FILTER is not None: | |
filtered_in_genome = True | |
if gvm[1] > 0 or gvm[2] > 0: | |
in_males_in_genome = True | |
if not variant_in_genome: | |
print >>sys.stderr, "variant not found in genomes" | |
continue | |
if filtered_in_genome: | |
print >>sys.stderr, "variant was filtered in genomes" | |
continue | |
if in_males_in_genome: | |
print >>sys.stderr, "variant found in males in genomes" | |
continue | |
nsig += 1 | |
o.write_record(v) | |
csqs = v.INFO["CSQ"].split(",") | |
igene = [] | |
for x in csqs: | |
x = x.split("|") | |
if x[3] == "": | |
continue | |
igene.append(x[3]) | |
genes.extend(set(igene)) | |
print "#################", v.CHROM + ":" + str(v.start), " id:", v.ID, " nsig:", nsig, " out of:", n, (" ratio: %.6f" % (nsig / float(n))) | |
print "p-value: %.4g" % p | |
print "gc-male:", v.INFO["GC_Male"], " gc-female:", v.INFO["GC_Female"] | |
print "an-male:", v.INFO["AN_Male"], " an-female:", v.INFO["AN_Female"] | |
print "ac-male:", v.INFO["AC_Male"], " ac-female:", v.INFO["AC_Female"] | |
print "test: %d of %d with expected of %d/%d == %.4f" % (gcm[1], v.INFO["AN_Male"], gcf[1], v.INFO["AN_Female"], success_prob) | |
print "tested %d sites. p-value cutoff is: %.4g" % (n, 0.05 / n) | |
o.close() | |
print "\n".join("%s\t%d" % (gene, count) for gene, count in Counter(genes).most_common()) | |
print len(set(genes)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment