brentp/README.md

## README.md

      
    Raw
  

              README.md
            
          
    fgbio -> annotated-filtered-vcf

run fgbio via netxflow:

echo "sample,fastq_1,fastq_2,read_structure" > sheet.csv
for f in /scratch/ucgd/lustre-work/quinlan/data-shared/datasets/spermseq/final_seq/Fastq/merged/*_R1*.fastq.gz; do
   # s=$(basename $f _R1.merged.fastq.gz)
   # R2=${f/_R1/_R2}
   # echo "$s,$f,$R2,\"$structure $structure\""
done >> sheet.csv
exit;
STRUCTURE

# The first value applies to the final consensus read, the second value to one single-strand consensus, and the last value to the other single-strand consensus. It is required that if values two and three differ, the more stringent value comes earlier.
export NXF_CONDA_ENABLED=true
nextflow run fastquorum/main.nf -with-trace -resume -profile quinlan_rw \
	-with-conda true \
	--enable_conda \
    --input sheet.csv \
    --outdir sperm-seq-fgbio-updated-params \
    --ref_fasta $ref38 \
    --duplex_seq \
	--groupreadsbyumi_edits 1 \
    --call_min_reads '3 3 3' \
    --call_min_baseq 20 \
    --filter_min_reads '3 3 3' \
    --filter_min_baseq 45 \
    --filter_max_base_error_rate 0.2 

set ends of reads to bq = 0

(see bq0.sh below)
bash bq0.sh

run per-base amd make vcf

(see perbase.sh below)
bash perbase.sh

run fraguracy on bq0 bams:

fraguracy extract -f $ref38 -o fraguracy-bq0/frag- -t bq0-bams/*.bam

vcfanno + slivar:

(see script below)
bash slivar-cmd.sh
python slivar-vcf-to-tsv.py sperm-seq.slivar.bq0.vcf.gz sperm-seq.bq0.tsv

output is: sperm-seq.slivar.bq0.vcf.gz and sperm-seq.slivar.counts.tsv and finally sperm-seq.bq0.tsv
Miscellaneous (not needed for above).

python check-in-fgbio-vcf.py mutation_dataset_04032023.txt /scratch/ucgd/lustre-work/quinlan/u6000771/src/sperm-seq/sperm-seq.perbase.vcf.gz > mutation_dataset_04032023.with-fgbio-counts.txt


python fahomopolymer.py -o - $ref38 | bgzip -c > hg38.homopolymers.bed.gz

create fgbio bams with ends of reads set to 0:


bash bq0.sh

create a groups file to be used by slivar:

python sperm-to-groups.py > sperm-seq-groups.txt


## bq0.sh
fn() {
	set -eu
	f=$1
	sample=$(basename $f .cons.filtered.bam)
	oodles base-qual0 --right-read1 15 --left-read2 20 $f bq0-bams/${sample}.bq0.bam
	samtools index bq0-bams/${sample}.bq0.bam
}
export -f fn

ls sperm-seq-fgbio-updated-params/filterconsensusreads/*.bam | gargs -p 36 "fn {}"

## check-in-fgbio-vcf.py
import defopt
import sys
import cyvcf2
from pathlib import Path
import pandas as pd
import typing

def find_variant(d: dict, sample: str, vcf, samples: typing.List[str]) -> dict:

    for v in vcf(f'{d["chrom"]}:{d["start"]+1}-{d["end"]}'):
        if v.REF != d['ref']:
            raise Exception("Unexpected reference!")
        if v.ALT[0] != d['alt']: continue

        alts = v.format(d['alt'])
        si = samples.index(sample)
        alt_i = alts[si, 0]
        n_i = v.format("N")[si, 0]
        return(alt_i, n_i)
    else:
        print(f"NOT FOUND {d}", file=sys.stderr)
        return (0, 0)


def main(tsv: Path, vcf: Path):
    """
    add a column to the sperm-seq vcf containing counts from the fgbio vcf

    param tsv: original tsv
    param vcf: fgbio vcf

    """

    df = pd.read_csv(tsv, sep='\t')

    vcf = cyvcf2.VCF(vcf)
    samples = vcf.samples

    for i, (_, row) in enumerate(df.iterrows()):
        row = row.to_dict()
        if i == 0:
            print("\t".join(row.keys()) + "\tfgbio-N-count\tfgbio-alt-count")
        c = row['var_coord']
        chrom, rest = c.split(':')
        start, end, _ = rest.split("-")
        start, end = int(start), int(end)

        v = dict(chrom=chrom, start=start, end=end, ref=row['subtype'][0],
                 alt=row['subtype'][-1])

        alt_i, n_i = find_variant(v, row['sample'].split('.')[0], vcf, samples)
        row['fgbio-N-count'] = n_i
        row['fgbio-alt-count'] = alt_i

        print("\t".join(str(v) for v in row.values()))


if __name__ == "__main__":
    defopt.run(main)

## fahomopolymer.py
import sys
import defopt
import re
from pyfaidx import Fasta
from pathlib import Path


def main(fasta: Path, *, output_path: Path="homopolyers.bed", min_size: int=3):
    """
    create a bed file of homopolymer (length 3 or more) regions given a fasta file

    param fasta: path to fasta file
    param output_path: path to output bed file
    param min_size: minimum homopolymer length to include
    """
    assert min_size >= 3, "must specify homopolymer length greater than 2"
    p = re.compile(r'(A{%d,}|T{%d,}|G{%d,}|C{%d,})' % tuple([min_size]*4))

    f = Fasta(fasta, as_raw=True)

    out = sys.stdout if str(output_path) in ("-", "/dev/stdout") else open(output_path, "wt")

    for chrom in f.keys():
        seq = f[chrom][:]

        for m in p.finditer(seq):
            start, end = m.span()
            s = m.group()
            assert end - start == len(s)
            assert len(set(s)) == 1
            print(f'{chrom}\t{start}\t{end}\t{s[0]}\t{len(s)}', file=out)

    print(f'wrote to {output_path}', file=sys.stderr)

if __name__ == "__main__":
    defopt.run(main)

## perbase-to-vcf.py
import defopt
import typing
import pathlib
import itertools as it
import operator as op
import heapq
import gzip
import os
import sys

from dataclasses import dataclass, field


from typing import Any

@dataclass(order=False)
class PerBase:
    REF: str=field(compare=False)
    POS: int
    REF_BASE: str=field(compare=False)
    DEPTH: int=field(compare=False)
    A: int=field(compare=False)
    C: int=field(compare=False)
    G: int=field(compare=False)
    T: int=field(compare=False)
    N: int=field(compare=False)
    INS: int=field(compare=False)
    DEL: int=field(compare=False)
    REF_SKIP: int=field(compare=False)
    FAIL: int=field(compare=False)
    i: int=field(compare=False, default=0)
    tid: int=field(default=0) # compared!
    sample: str=field(compare=False, default="UNKNOWN")


    def __lt__(self, o):
        if self.tid != o.tid:
            return op.lt(self.tid, o.tid)
        return op.lt(self.POS, o.POS)

    def __gt__(self, o):
        if self.tid != o.tid:
            return op.gt(self.tid, o.tid)
        return op.gt(self.POS, o.POS)

def sample_name(path):

    n = os.path.basename(path)
    for ext in [".gz", ".txt", ".perbase"]:
        if n.endswith(ext):
            n = n[:-len(ext)]
    return n


def next_line(fh, header, int_fields):
    try:
        l = next(fh)
        d = dict(zip(header, l.strip().split("\t")))
        for f in int_fields:
            d[f] = int(d[f])
        return PerBase(**d)
    except StopIteration:
        return None


def lines(handles, chroms: typing.List[str]):
    int_fields = "DEPTH	A	C	G	T	N	INS	DEL	REF_SKIP	FAIL POS".split()

    chroms = {c: i for i, c in enumerate(chroms)}

    heap = []
    header = None
    for i, h in enumerate(handles):
        l = next(h)
        assert l[0] == 'R' # starts with REF
        header = l.strip().split("\t")
        header.remove("NEAR_MAX_DEPTH")
        d = next_line(h, header, int_fields)
        d.i = i
        d.tid = chroms[d.REF]
        d.sample = sample_name(h.name)
        heapq.heappush(heap, d)

    while len(heap) > 0:
        d = heapq.heappop(heap)
        yield d
        i, sample = d.i, d.sample

        d = next_line(handles[i], header, int_fields)
        if d is None: continue
        d.i, d.sample = i, sample
        d.tid = chroms[d.REF]
        heapq.heappush(heap, d)

vcf_header = """\
##fileformat=VCFv4.2
%s
##FILTER=<ID=PASS,Description="All filters passed">
##INFO=<ID=A,Number=1,Type=Integer,Description="count of A's at this site">
##INFO=<ID=C,Number=1,Type=Integer,Description="count of C's at this site">
##INFO=<ID=T,Number=1,Type=Integer,Description="count of T's at this site">
##INFO=<ID=G,Number=1,Type=Integer,Description="count of G's at this site">
##INFO=<ID=N,Number=1,Type=Integer,Description="count of G's at this site">
##INFO=<ID=REF_SKIP,Number=1,Type=Integer,Description="count of REF_SKIP's at this site">
##INFO=<ID=INS,Number=1,Type=Integer,Description="count of INS's at this site">
##INFO=<ID=DEL,Number=1,Type=Integer,Description="count of DEL's at this site">
##INFO=<ID=FAIL,Number=1,Type=Integer,Description="count of FAIL's at this site">
##INFO=<ID=DEPTH,Number=1,Type=Integer,Description="DEPTH at this site">
##FORMAT=<ID=A,Number=1,Type=Integer,Description="count of A's at this site">
##FORMAT=<ID=C,Number=1,Type=Integer,Description="count of C's at this site">
##FORMAT=<ID=T,Number=1,Type=Integer,Description="count of T's at this site">
##FORMAT=<ID=G,Number=1,Type=Integer,Description="count of G's at this site">
##FORMAT=<ID=N,Number=1,Type=Integer,Description="count of G's at this site">
##FORMAT=<ID=REF_SKIP,Number=1,Type=Integer,Description="count of REF_SKIP's at this site">
##FORMAT=<ID=INS,Number=1,Type=Integer,Description="count of INS's at this site">
##FORMAT=<ID=DEL,Number=1,Type=Integer,Description="count of DEL's at this site">
##FORMAT=<ID=FAIL,Number=1,Type=Integer,Description="count of FAIL's at this site">
##FORMAT=<ID=DEPTH,Number=1,Type=Integer,Description="DEPTH at this site">
#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	"""


def main(*, fai: pathlib.Path, files:typing.List[pathlib.Path], min_depth:int=0, min_alt_depth:int=1):
    """

    :param list[pathlib.Path] files: perbase files output with -z (zero-based)
    :param int min_depth: only display sites where at least 1 sample has this depth
    :param int min_alt_depth: only display sites where at least 1 sample has this depth for a non-reference allele
    """
    handles = [gzip.open(f, 'rt') for f in files]
    samples = [sample_name(h.name) for h in handles]
    chroms = [l.strip().split("\t")[0] for l in open(fai)]
    header_chroms = ['##contig=<ID=%s>' % c for c in chroms]

    print(vcf_header % "\n".join(header_chroms), end="")
    print("\t".join(samples))

    all_alleles = set("ACTG")
    fields = ['A', 'C', 'T', 'G', 'N', 'REF_SKIP', 'INS', 'DEL', 'FAIL']

    for g, ls in it.groupby(lines(handles, chroms), op.attrgetter('REF', 'POS')):
        ls = list(ls)
        if min_depth > 0 and max(l.DEPTH for l in ls) < min_depth: continue
        alts = all_alleles - set(ls[0].REF_BASE)
        by_sample = {l.sample: l for l in ls}
        for alt in alts:
            if max(getattr(l, alt) for l in ls) < min_alt_depth: continue
            line = [ls[0].REF, ls[0].POS + 1, '.', ls[0].REF_BASE, alt, 10, "."]
            info = {}
            fmt = {}
            for sample in samples:
                if not sample in by_sample:
                    fmt[sample] = ":".join(['.'] * len(fields))
                else:
                    fmt[sample] = ":".join([str(getattr(by_sample[sample], f)) for f in fields])
                    for field in fields:
                        if not field in info: info[field] = 0
                        info[field] += getattr(by_sample[sample], field)

            line.append(";".join(f"{n}={v}" for n, v in info.items()))
            line.append(":".join(fields))
            line.extend(fmt[s] for s in samples)
            print("\t".join(map(str, line)))

if __name__ == '__main__':
    defopt.run(main)

## perbase.sh
set -euo pipefail
mkdir -p perbase-bq0

#for bam in sperm-seq-fgbio-updated-params/filterconsensusreads/*.bam; do
#   sample=$(basename $bam .cons.filtered.bam)
for bam in bq0-bams/*.bam; do
   sample=$(basename $bam .bq0.bam)
   o=perbase-bq0/$sample.perbase.txt.gz
   time perbase base-depth -Z -M -z -f 2 -Q 35 -q 60 -r $ref38 $bam -o $o
   tabix -f --zero-based -S 1 -s 1 -b 2 -e 2 $o
done
python perbase-to-vcf.py --fai ${ref38}.fai --files perbase-bq0/*.txt.gz --min-depth 200 \
    | bgzip -c > sperm-seq.perbase.bq0.vcf.gz && bcftools index sperm-seq.perbase.bq0.vcf.gz

## slivar-cmd.sh
echo '
[[annotation]]
file="fraguracy-bq0/frag-total-errors.bed.gz"
columns=[5]
names=["frag_errors"]
ops=["max"]
' > frag-conf.toml

#slivar expr \
export SLIVAR_SUMMARY_FILE=sperm-seq.slivar.counts.tsv
vcfanno frag-conf.toml sperm-seq.perbase.bq0.vcf.gz 2>vcfanno.err \
 | ../slivar/src/slivar expr \
    --js sperm-slivar.js \
    -g ../gnomad.hg38.genomes.v3.fix.zip \
    --pass-only \
    -v /dev/stdin \
    --info 'INFO.N / (INFO.A + INFO.T + INFO.C + INFO.G + INFO.N) < 0.05 && (!("frag_errors" in INFO) || INFO.frag_errors < 5)' \
    --sample-expr 'inherited:inherited(sample, variant.ALT[0])' \
    --sample-expr 'gonosomal:gonosomal(sample, variant.ALT[0])' \
    --sample-expr 'clonal:clonal(sample, variant.ALT[0])' \
    --sample-expr 'nonclonal:nonclonal(sample, variant.ALT[0])' \
    -o /dev/stdout \
 | SLIVAR_SUMMARY_FILE=sperm-seq.slivar.group.counts.tsv ../slivar/src/slivar expr \
    -v /dev/stdin \
    --alias sperm-seq-full-groups.txt \
    --js sperm-slivar.js \
    --group-expr 'sperm_only_inherited:inherited(sperm, variant.ALT[0]) && absent_in(blood, variant.ALT[0])' \
    --group-expr 'sperm_only_gonosomal:gonosomal(sperm, variant.ALT[0]) && absent_in(blood, variant.ALT[0])' \
    --group-expr 'sperm_only_clonal:clonal(sperm, variant.ALT[0]) && absent_in(blood, variant.ALT[0])' \
    --group-expr 'sperm_only_nonclonal:nonclonal(sperm, variant.ALT[0]) && absent_in(blood, variant.ALT[0])' \
    -o sperm-seq.slivar.bq0.vcf.gz


## slivar-vcf-to-tsv.py
import cyvcf2
import defopt
import numpy as np

def main(vcf: str, outpath: str):
    """
    :param vcf: path to vcf annotated with slivar
    :param output: path to output TSV
    """

    vcf = cyvcf2.VCF(vcf)
    ofh = open(outpath, 'w')

    vcf_samples = {s: i for i, s in enumerate(vcf.samples)}

    fields = ['inherited', 'gonosomal', 'clonal', 'nonclonal',
    'sperm_only_inherited', 'sperm_only_gonosomal', 'sperm_only_clonal',
    'sperm_only_nonclonal']

    print('variant\tsample\tclass\tgnomad_af\tgnomad_filtered\tAF\talt_count\tref_count\tN_count\tfraguracy_errors', file=ofh)
    for v in vcf:
        for f in fields:
            samples = v.INFO.get(f)
            if samples is None: continue
            samples = samples.split(',')
            gnomad_af = v.INFO.get('gnomad_popmax_af', -1)
            frag_errs = v.INFO.get('frag_errors', 0)
            gnomad_filter = str('gnomad_popmax_af_filter' in v.INFO).lower()
            alts = np.maximum(0, v.format(v.ALT[0])[:, 0])
            refs = np.maximum(0, v.format(v.REF)[:, 0])
            Ns = v.format('N')[:, 0]
            for sample in samples:
                sample_idx = vcf_samples[sample]
                ref_count, alt_count, n_count = refs[sample_idx], alts[sample_idx], Ns[sample_idx]
                af = alt_count / np.maximum(1, alt_count + ref_count)
                print(f'{v.CHROM}:{v.POS}:{v.REF}:{v.ALT[0]}\t{sample}\t{f}\t{gnomad_af:.4g}\t{gnomad_filter}\t{af:.5g}\t{alt_count}\t{ref_count}\t{n_count}\t{frag_errs}', file=ofh)


if __name__ == "__main__":
    defopt.run(main)

## sperm-slivar.js
function AF(sample, alt) {
	var alt_count = Math.max(0, sample[alt])
	var tot_count = Math.max(0, sample.A) + Math.max(0, sample.C) + Math.max(0, sample.G) + Math.max(0, sample.T) // + sample.N
	return alt_count / tot_count;
}

function inherited(sample, alt) {
    return AF(sample, alt) >= 0.2
}

function gonosomal(sample, alt) {
	return AF(sample, alt) >= 0.05 && AF(sample, alt) < 0.2
}

function clonal(sample, alt) {
	return AF(sample, alt) < 0.05 && sample[alt] > 1
}

function nonclonal(sample, alt) {
	return AF(sample, alt) < 0.05 && sample[alt] == 1
}

function absent_in(sample, alt) {
	return sample[alt] == 0
}

## sperm-to-groups.py
import pandas as pd
import sys

df = pd.read_csv('Spermseq_Sample_Tracker_Twinstrand.csv')
print('#sperm\tblood')
for g, rows in df.groupby('Donor_ID'):

    # drop .1 suffix
    samples = [x[:-2] for x in rows['Sample_ID']]
    tissues = list(rows['Tissue'])
    d = dict(zip(tissues, samples))
    print(f'{d.get("Sperm_DNA", "")}\t{d.get("Blood_DNA", "")}')

## sperm_not_blood_in_other_samples.py
import sys
import pandas as pd


df = pd.read_csv(sys.argv[1], sep="\t")

sub = df[ (df.alt_count_v2 > 1) & (df.tissue == "Sperm_DNA") &
         (df.num_donors_blood > 0)
         & ((df.inherited_blood_donor_ct > 0) | (df.inherited_sperm_and_blood_donor_ct))
         & (df.error_prone_status != "error_prone_site") #&
         & (df.var_filter == "PASS")
         & (df['fgbio-alt-count'] > 0)
         ]
print(sub.shape, file=sys.stderr)

print("sample\tfilter\tcount")
for (sample, filter), f in sub.groupby(['sample', 'var_filter']):
    n_vars = len(f['var_coord'].unique())
    print(f'{sample}\t{filter}\t{n_vars}')

sub.to_csv("filt.tsv", sep="\t", index=False)
	fn() {
	set -eu
	f=$1
	sample=$(basename $f .cons.filtered.bam)
	oodles base-qual0 --right-read1 15 --left-read2 20 $f bq0-bams/${sample}.bq0.bam
	samtools index bq0-bams/${sample}.bq0.bam
	}
	export -f fn

	ls sperm-seq-fgbio-updated-params/filterconsensusreads/*.bam \| gargs -p 36 "fn {}"
	import defopt
	import sys
	import cyvcf2
	from pathlib import Path
	import pandas as pd
	import typing

	def find_variant(d: dict, sample: str, vcf, samples: typing.List[str]) -> dict:

	for v in vcf(f'{d["chrom"]}:{d["start"]+1}-{d["end"]}'):
	if v.REF != d['ref']:
	raise Exception("Unexpected reference!")
	if v.ALT[0] != d['alt']: continue

	alts = v.format(d['alt'])
	si = samples.index(sample)
	alt_i = alts[si, 0]
	n_i = v.format("N")[si, 0]
	return(alt_i, n_i)
	else:
	print(f"NOT FOUND {d}", file=sys.stderr)
	return (0, 0)



	def main(tsv: Path, vcf: Path):
	"""
	add a column to the sperm-seq vcf containing counts from the fgbio vcf

	param tsv: original tsv
	param vcf: fgbio vcf

	"""

	df = pd.read_csv(tsv, sep='\t')

	vcf = cyvcf2.VCF(vcf)
	samples = vcf.samples

	for i, (_, row) in enumerate(df.iterrows()):
	row = row.to_dict()
	if i == 0:
	print("\t".join(row.keys()) + "\tfgbio-N-count\tfgbio-alt-count")
	c = row['var_coord']
	chrom, rest = c.split(':')
	start, end, _ = rest.split("-")
	start, end = int(start), int(end)

	v = dict(chrom=chrom, start=start, end=end, ref=row['subtype'][0],
	alt=row['subtype'][-1])

	alt_i, n_i = find_variant(v, row['sample'].split('.')[0], vcf, samples)
	row['fgbio-N-count'] = n_i
	row['fgbio-alt-count'] = alt_i

	print("\t".join(str(v) for v in row.values()))




	if __name__ == "__main__":
	defopt.run(main)
	import sys
	import defopt
	import re
	from pyfaidx import Fasta
	from pathlib import Path


	def main(fasta: Path, *, output_path: Path="homopolyers.bed", min_size: int=3):
	"""
	create a bed file of homopolymer (length 3 or more) regions given a fasta file

	param fasta: path to fasta file
	param output_path: path to output bed file
	param min_size: minimum homopolymer length to include
	"""
	assert min_size >= 3, "must specify homopolymer length greater than 2"
	p = re.compile(r'(A{%d,}\|T{%d,}\|G{%d,}\|C{%d,})' % tuple([min_size]*4))

	f = Fasta(fasta, as_raw=True)

	out = sys.stdout if str(output_path) in ("-", "/dev/stdout") else open(output_path, "wt")

	for chrom in f.keys():
	seq = f[chrom][:]

	for m in p.finditer(seq):
	start, end = m.span()
	s = m.group()
	assert end - start == len(s)
	assert len(set(s)) == 1
	print(f'{chrom}\t{start}\t{end}\t{s[0]}\t{len(s)}', file=out)

	print(f'wrote to {output_path}', file=sys.stderr)

	if __name__ == "__main__":
	defopt.run(main)
	import defopt
	import typing
	import pathlib
	import itertools as it
	import operator as op
	import heapq
	import gzip
	import os
	import sys

	from dataclasses import dataclass, field


	from typing import Any

	@dataclass(order=False)
	class PerBase:
	REF: str=field(compare=False)
	POS: int
	REF_BASE: str=field(compare=False)
	DEPTH: int=field(compare=False)
	A: int=field(compare=False)
	C: int=field(compare=False)
	G: int=field(compare=False)
	T: int=field(compare=False)
	N: int=field(compare=False)
	INS: int=field(compare=False)
	DEL: int=field(compare=False)
	REF_SKIP: int=field(compare=False)
	FAIL: int=field(compare=False)
	i: int=field(compare=False, default=0)
	tid: int=field(default=0) # compared!
	sample: str=field(compare=False, default="UNKNOWN")


	def __lt__(self, o):
	if self.tid != o.tid:
	return op.lt(self.tid, o.tid)
	return op.lt(self.POS, o.POS)

	def __gt__(self, o):
	if self.tid != o.tid:
	return op.gt(self.tid, o.tid)
	return op.gt(self.POS, o.POS)

	def sample_name(path):

	n = os.path.basename(path)
	for ext in [".gz", ".txt", ".perbase"]:
	if n.endswith(ext):
	n = n[:-len(ext)]
	return n


	def next_line(fh, header, int_fields):
	try:
	l = next(fh)
	d = dict(zip(header, l.strip().split("\t")))
	for f in int_fields:
	d[f] = int(d[f])
	return PerBase(**d)
	except StopIteration:
	return None


	def lines(handles, chroms: typing.List[str]):
	int_fields = "DEPTH A C G T N INS DEL REF_SKIP FAIL POS".split()

	chroms = {c: i for i, c in enumerate(chroms)}

	heap = []
	header = None
	for i, h in enumerate(handles):
	l = next(h)
	assert l[0] == 'R' # starts with REF
	header = l.strip().split("\t")
	header.remove("NEAR_MAX_DEPTH")
	d = next_line(h, header, int_fields)
	d.i = i
	d.tid = chroms[d.REF]
	d.sample = sample_name(h.name)
	heapq.heappush(heap, d)

	while len(heap) > 0:
	d = heapq.heappop(heap)
	yield d
	i, sample = d.i, d.sample

	d = next_line(handles[i], header, int_fields)
	if d is None: continue
	d.i, d.sample = i, sample
	d.tid = chroms[d.REF]
	heapq.heappush(heap, d)

	vcf_header = """\
	##fileformat=VCFv4.2
	%s
	##FILTER=<ID=PASS,Description="All filters passed">
	##INFO=<ID=A,Number=1,Type=Integer,Description="count of A's at this site">
	##INFO=<ID=C,Number=1,Type=Integer,Description="count of C's at this site">
	##INFO=<ID=T,Number=1,Type=Integer,Description="count of T's at this site">
	##INFO=<ID=G,Number=1,Type=Integer,Description="count of G's at this site">
	##INFO=<ID=N,Number=1,Type=Integer,Description="count of G's at this site">
	##INFO=<ID=REF_SKIP,Number=1,Type=Integer,Description="count of REF_SKIP's at this site">
	##INFO=<ID=INS,Number=1,Type=Integer,Description="count of INS's at this site">
	##INFO=<ID=DEL,Number=1,Type=Integer,Description="count of DEL's at this site">
	##INFO=<ID=FAIL,Number=1,Type=Integer,Description="count of FAIL's at this site">
	##INFO=<ID=DEPTH,Number=1,Type=Integer,Description="DEPTH at this site">
	##FORMAT=<ID=A,Number=1,Type=Integer,Description="count of A's at this site">
	##FORMAT=<ID=C,Number=1,Type=Integer,Description="count of C's at this site">
	##FORMAT=<ID=T,Number=1,Type=Integer,Description="count of T's at this site">
	##FORMAT=<ID=G,Number=1,Type=Integer,Description="count of G's at this site">
	##FORMAT=<ID=N,Number=1,Type=Integer,Description="count of G's at this site">
	##FORMAT=<ID=REF_SKIP,Number=1,Type=Integer,Description="count of REF_SKIP's at this site">
	##FORMAT=<ID=INS,Number=1,Type=Integer,Description="count of INS's at this site">
	##FORMAT=<ID=DEL,Number=1,Type=Integer,Description="count of DEL's at this site">
	##FORMAT=<ID=FAIL,Number=1,Type=Integer,Description="count of FAIL's at this site">
	##FORMAT=<ID=DEPTH,Number=1,Type=Integer,Description="DEPTH at this site">
	#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT """


	def main(*, fai: pathlib.Path, files:typing.List[pathlib.Path], min_depth:int=0, min_alt_depth:int=1):
	"""

	:param list[pathlib.Path] files: perbase files output with -z (zero-based)
	:param int min_depth: only display sites where at least 1 sample has this depth
	:param int min_alt_depth: only display sites where at least 1 sample has this depth for a non-reference allele
	"""
	handles = [gzip.open(f, 'rt') for f in files]
	samples = [sample_name(h.name) for h in handles]
	chroms = [l.strip().split("\t")[0] for l in open(fai)]
	header_chroms = ['##contig=<ID=%s>' % c for c in chroms]

	print(vcf_header % "\n".join(header_chroms), end="")
	print("\t".join(samples))

	all_alleles = set("ACTG")
	fields = ['A', 'C', 'T', 'G', 'N', 'REF_SKIP', 'INS', 'DEL', 'FAIL']

	for g, ls in it.groupby(lines(handles, chroms), op.attrgetter('REF', 'POS')):
	ls = list(ls)
	if min_depth > 0 and max(l.DEPTH for l in ls) < min_depth: continue
	alts = all_alleles - set(ls[0].REF_BASE)
	by_sample = {l.sample: l for l in ls}
	for alt in alts:
	if max(getattr(l, alt) for l in ls) < min_alt_depth: continue
	line = [ls[0].REF, ls[0].POS + 1, '.', ls[0].REF_BASE, alt, 10, "."]
	info = {}
	fmt = {}
	for sample in samples:
	if not sample in by_sample:
	fmt[sample] = ":".join(['.'] * len(fields))
	else:
	fmt[sample] = ":".join([str(getattr(by_sample[sample], f)) for f in fields])
	for field in fields:
	if not field in info: info[field] = 0
	info[field] += getattr(by_sample[sample], field)

	line.append(";".join(f"{n}={v}" for n, v in info.items()))
	line.append(":".join(fields))
	line.extend(fmt[s] for s in samples)
	print("\t".join(map(str, line)))

	if __name__ == '__main__':
	defopt.run(main)
	set -euo pipefail
	mkdir -p perbase-bq0

	#for bam in sperm-seq-fgbio-updated-params/filterconsensusreads/*.bam; do
	# sample=$(basename $bam .cons.filtered.bam)
	for bam in bq0-bams/*.bam; do
	sample=$(basename $bam .bq0.bam)
	o=perbase-bq0/$sample.perbase.txt.gz
	time perbase base-depth -Z -M -z -f 2 -Q 35 -q 60 -r $ref38 $bam -o $o
	tabix -f --zero-based -S 1 -s 1 -b 2 -e 2 $o
	done
	python perbase-to-vcf.py --fai ${ref38}.fai --files perbase-bq0/*.txt.gz --min-depth 200 \
	\| bgzip -c > sperm-seq.perbase.bq0.vcf.gz && bcftools index sperm-seq.perbase.bq0.vcf.gz
	echo '
	[[annotation]]
	file="fraguracy-bq0/frag-total-errors.bed.gz"
	columns=[5]
	names=["frag_errors"]
	ops=["max"]
	' > frag-conf.toml

	#slivar expr \
	export SLIVAR_SUMMARY_FILE=sperm-seq.slivar.counts.tsv
	vcfanno frag-conf.toml sperm-seq.perbase.bq0.vcf.gz 2>vcfanno.err \
	\| ../slivar/src/slivar expr \
	--js sperm-slivar.js \
	-g ../gnomad.hg38.genomes.v3.fix.zip \
	--pass-only \
	-v /dev/stdin \
	--info 'INFO.N / (INFO.A + INFO.T + INFO.C + INFO.G + INFO.N) < 0.05 && (!("frag_errors" in INFO) \|\| INFO.frag_errors < 5)' \
	--sample-expr 'inherited:inherited(sample, variant.ALT[0])' \
	--sample-expr 'gonosomal:gonosomal(sample, variant.ALT[0])' \
	--sample-expr 'clonal:clonal(sample, variant.ALT[0])' \
	--sample-expr 'nonclonal:nonclonal(sample, variant.ALT[0])' \
	-o /dev/stdout \
	\| SLIVAR_SUMMARY_FILE=sperm-seq.slivar.group.counts.tsv ../slivar/src/slivar expr \
	-v /dev/stdin \
	--alias sperm-seq-full-groups.txt \
	--js sperm-slivar.js \
	--group-expr 'sperm_only_inherited:inherited(sperm, variant.ALT[0]) && absent_in(blood, variant.ALT[0])' \
	--group-expr 'sperm_only_gonosomal:gonosomal(sperm, variant.ALT[0]) && absent_in(blood, variant.ALT[0])' \
	--group-expr 'sperm_only_clonal:clonal(sperm, variant.ALT[0]) && absent_in(blood, variant.ALT[0])' \
	--group-expr 'sperm_only_nonclonal:nonclonal(sperm, variant.ALT[0]) && absent_in(blood, variant.ALT[0])' \
	-o sperm-seq.slivar.bq0.vcf.gz
	import cyvcf2
	import defopt
	import numpy as np

	def main(vcf: str, outpath: str):
	"""
	:param vcf: path to vcf annotated with slivar
	:param output: path to output TSV
	"""

	vcf = cyvcf2.VCF(vcf)
	ofh = open(outpath, 'w')

	vcf_samples = {s: i for i, s in enumerate(vcf.samples)}

	fields = ['inherited', 'gonosomal', 'clonal', 'nonclonal',
	'sperm_only_inherited', 'sperm_only_gonosomal', 'sperm_only_clonal',
	'sperm_only_nonclonal']

	print('variant\tsample\tclass\tgnomad_af\tgnomad_filtered\tAF\talt_count\tref_count\tN_count\tfraguracy_errors', file=ofh)
	for v in vcf:
	for f in fields:
	samples = v.INFO.get(f)
	if samples is None: continue
	samples = samples.split(',')
	gnomad_af = v.INFO.get('gnomad_popmax_af', -1)
	frag_errs = v.INFO.get('frag_errors', 0)
	gnomad_filter = str('gnomad_popmax_af_filter' in v.INFO).lower()
	alts = np.maximum(0, v.format(v.ALT[0])[:, 0])
	refs = np.maximum(0, v.format(v.REF)[:, 0])
	Ns = v.format('N')[:, 0]
	for sample in samples:
	sample_idx = vcf_samples[sample]
	ref_count, alt_count, n_count = refs[sample_idx], alts[sample_idx], Ns[sample_idx]
	af = alt_count / np.maximum(1, alt_count + ref_count)
	print(f'{v.CHROM}:{v.POS}:{v.REF}:{v.ALT[0]}\t{sample}\t{f}\t{gnomad_af:.4g}\t{gnomad_filter}\t{af:.5g}\t{alt_count}\t{ref_count}\t{n_count}\t{frag_errs}', file=ofh)


	if __name__ == "__main__":
	defopt.run(main)
	function AF(sample, alt) {
	var alt_count = Math.max(0, sample[alt])
	var tot_count = Math.max(0, sample.A) + Math.max(0, sample.C) + Math.max(0, sample.G) + Math.max(0, sample.T) // + sample.N
	return alt_count / tot_count;
	}

	function inherited(sample, alt) {
	return AF(sample, alt) >= 0.2
	}

	function gonosomal(sample, alt) {
	return AF(sample, alt) >= 0.05 && AF(sample, alt) < 0.2
	}

	function clonal(sample, alt) {
	return AF(sample, alt) < 0.05 && sample[alt] > 1
	}

	function nonclonal(sample, alt) {
	return AF(sample, alt) < 0.05 && sample[alt] == 1
	}

	function absent_in(sample, alt) {
	return sample[alt] == 0
	}
	import pandas as pd
	import sys

	df = pd.read_csv('Spermseq_Sample_Tracker_Twinstrand.csv')
	print('#sperm\tblood')
	for g, rows in df.groupby('Donor_ID'):

	# drop .1 suffix
	samples = [x[:-2] for x in rows['Sample_ID']]
	tissues = list(rows['Tissue'])
	d = dict(zip(tissues, samples))
	print(f'{d.get("Sperm_DNA", "")}\t{d.get("Blood_DNA", "")}')
	import sys
	import pandas as pd


	df = pd.read_csv(sys.argv[1], sep="\t")

	sub = df[ (df.alt_count_v2 > 1) & (df.tissue == "Sperm_DNA") &
	(df.num_donors_blood > 0)
	& ((df.inherited_blood_donor_ct > 0) \| (df.inherited_sperm_and_blood_donor_ct))
	& (df.error_prone_status != "error_prone_site") #&
	& (df.var_filter == "PASS")
	& (df['fgbio-alt-count'] > 0)
	]
	print(sub.shape, file=sys.stderr)

	print("sample\tfilter\tcount")
	for (sample, filter), f in sub.groupby(['sample', 'var_filter']):
	n_vars = len(f['var_coord'].unique())
	print(f'{sample}\t{filter}\t{n_vars}')

	sub.to_csv("filt.tsv", sep="\t", index=False)