Skip to content

Instantly share code, notes, and snippets.

@brentp
Created March 15, 2018 02:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save brentp/e8b400bf8bfb297c8d49f9673e32d2e4 to your computer and use it in GitHub Desktop.
Save brentp/e8b400bf8bfb297c8d49f9673e32d2e4 to your computer and use it in GitHub Desktop.
blog gists
import sys
from collections import Counter
import scipy.stats as ss
import cyvcf2
vcf = cyvcf2.VCF(sys.argv[1], gts012=True)
o = cyvcf2.Writer("depleted.vcf", vcf)
genome = cyvcf2.VCF('gnomad.genomes.r2.0.2.sites.chrX.vcf.bgz')
n = 0
nsig = 0
genes = []
for v in vcf("X:2781479-155701382"):
if v.FILTER is not None: continue
if v.INFO.get("FS", 0) > 10: continue
try:
gcm = v.INFO["GC_Male"]
gcf = v.INFO["GC_Female"]
except KeyError:
continue
if v.INFO.get('OLD_MULTIALLELIC'): continue
n += 1
# if any males have hom-alt or no females are het, continue
if gcm[-1] > 0: continue
if gcf[-1] != 0: continue
#print gcm
#print gcf, gcm
#female_af = gcf[1]
# NOTE: these should be removed for unbiased test, but we know that we need these contrainsts for significance for gnomad
if gcf[1] < 15: continue
if gcf[1] < gcm[1]: continue
if gcm[1] > 2: continue
success_prob = gcf[1] / float(v.INFO["AN_Female"])
p = ss.binom_test(0, v.INFO["AN_Male"], p=success_prob)
if p > 1e-8:
continue
variant_in_genome = False
in_males_in_genome = False
filtered_in_genome = False
for gv in genome('X:%d-%d' % (v.start, v.end)):
if gv.start != v.start or gv.end != v.end or gv.REF != v.REF: continue
variant_in_genome = True
gvm = gv.INFO["GC_Male"]
if gv.FILTER is not None:
filtered_in_genome = True
if gvm[1] > 0 or gvm[2] > 0:
in_males_in_genome = True
if not variant_in_genome:
print >>sys.stderr, "variant not found in genomes"
continue
if filtered_in_genome:
print >>sys.stderr, "variant was filtered in genomes"
continue
if in_males_in_genome:
print >>sys.stderr, "variant found in males in genomes"
continue
nsig += 1
o.write_record(v)
csqs = v.INFO["CSQ"].split(",")
igene = []
for x in csqs:
x = x.split("|")
if x[3] == "":
continue
igene.append(x[3])
genes.extend(set(igene))
print "#################", v.CHROM + ":" + str(v.start), " id:", v.ID, " nsig:", nsig, " out of:", n, (" ratio: %.6f" % (nsig / float(n)))
print "p-value: %.4g" % p
print "gc-male:", v.INFO["GC_Male"], " gc-female:", v.INFO["GC_Female"]
print "an-male:", v.INFO["AN_Male"], " an-female:", v.INFO["AN_Female"]
print "ac-male:", v.INFO["AC_Male"], " ac-female:", v.INFO["AC_Female"]
print "test: %d of %d with expected of %d/%d == %.4f" % (gcm[1], v.INFO["AN_Male"], gcf[1], v.INFO["AN_Female"], success_prob)
print "tested %d sites. p-value cutoff is: %.4g" % (n, 0.05 / n)
o.close()
print "\n".join("%s\t%d" % (gene, count) for gene, count in Counter(genes).most_common())
print len(set(genes))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment