Skip to content

Instantly share code, notes, and snippets.

@brentp
Last active April 17, 2023 20:59
Show Gist options
  • Save brentp/90805896da53bb22b9b4c7b021f72c32 to your computer and use it in GitHub Desktop.
Save brentp/90805896da53bb22b9b4c7b021f72c32 to your computer and use it in GitHub Desktop.

fgbio -> annotated-filtered-vcf

run fgbio via netxflow:

echo "sample,fastq_1,fastq_2,read_structure" > sheet.csv
for f in /scratch/ucgd/lustre-work/quinlan/data-shared/datasets/spermseq/final_seq/Fastq/merged/*_R1*.fastq.gz; do
   # s=$(basename $f _R1.merged.fastq.gz)
   # R2=${f/_R1/_R2}
   # echo "$s,$f,$R2,\"$structure $structure\""
done >> sheet.csv
exit;
STRUCTURE

# The first value applies to the final consensus read, the second value to one single-strand consensus, and the last value to the other single-strand consensus. It is required that if values two and three differ, the more stringent value comes earlier.
export NXF_CONDA_ENABLED=true
nextflow run fastquorum/main.nf -with-trace -resume -profile quinlan_rw \
	-with-conda true \
	--enable_conda \
    --input sheet.csv \
    --outdir sperm-seq-fgbio-updated-params \
    --ref_fasta $ref38 \
    --duplex_seq \
	--groupreadsbyumi_edits 1 \
    --call_min_reads '3 3 3' \
    --call_min_baseq 20 \
    --filter_min_reads '3 3 3' \
    --filter_min_baseq 45 \
    --filter_max_base_error_rate 0.2 

set ends of reads to bq = 0

(see bq0.sh below)

bash bq0.sh

run per-base amd make vcf

(see perbase.sh below)

bash perbase.sh

run fraguracy on bq0 bams:

fraguracy extract -f $ref38 -o fraguracy-bq0/frag- -t bq0-bams/*.bam

vcfanno + slivar:

(see script below)

bash slivar-cmd.sh
python slivar-vcf-to-tsv.py sperm-seq.slivar.bq0.vcf.gz sperm-seq.bq0.tsv

output is: sperm-seq.slivar.bq0.vcf.gz and sperm-seq.slivar.counts.tsv and finally sperm-seq.bq0.tsv

Miscellaneous (not needed for above).

python check-in-fgbio-vcf.py mutation_dataset_04032023.txt /scratch/ucgd/lustre-work/quinlan/u6000771/src/sperm-seq/sperm-seq.perbase.vcf.gz > mutation_dataset_04032023.with-fgbio-counts.txt


python fahomopolymer.py -o - $ref38 | bgzip -c > hg38.homopolymers.bed.gz


create fgbio bams with ends of reads set to 0:

bash bq0.sh


create a groups file to be used by slivar:

python sperm-to-groups.py > sperm-seq-groups.txt

fn() {
set -eu
f=$1
sample=$(basename $f .cons.filtered.bam)
oodles base-qual0 --right-read1 15 --left-read2 20 $f bq0-bams/${sample}.bq0.bam
samtools index bq0-bams/${sample}.bq0.bam
}
export -f fn
ls sperm-seq-fgbio-updated-params/filterconsensusreads/*.bam | gargs -p 36 "fn {}"
import defopt
import sys
import cyvcf2
from pathlib import Path
import pandas as pd
import typing
def find_variant(d: dict, sample: str, vcf, samples: typing.List[str]) -> dict:
for v in vcf(f'{d["chrom"]}:{d["start"]+1}-{d["end"]}'):
if v.REF != d['ref']:
raise Exception("Unexpected reference!")
if v.ALT[0] != d['alt']: continue
alts = v.format(d['alt'])
si = samples.index(sample)
alt_i = alts[si, 0]
n_i = v.format("N")[si, 0]
return(alt_i, n_i)
else:
print(f"NOT FOUND {d}", file=sys.stderr)
return (0, 0)
def main(tsv: Path, vcf: Path):
"""
add a column to the sperm-seq vcf containing counts from the fgbio vcf
param tsv: original tsv
param vcf: fgbio vcf
"""
df = pd.read_csv(tsv, sep='\t')
vcf = cyvcf2.VCF(vcf)
samples = vcf.samples
for i, (_, row) in enumerate(df.iterrows()):
row = row.to_dict()
if i == 0:
print("\t".join(row.keys()) + "\tfgbio-N-count\tfgbio-alt-count")
c = row['var_coord']
chrom, rest = c.split(':')
start, end, _ = rest.split("-")
start, end = int(start), int(end)
v = dict(chrom=chrom, start=start, end=end, ref=row['subtype'][0],
alt=row['subtype'][-1])
alt_i, n_i = find_variant(v, row['sample'].split('.')[0], vcf, samples)
row['fgbio-N-count'] = n_i
row['fgbio-alt-count'] = alt_i
print("\t".join(str(v) for v in row.values()))
if __name__ == "__main__":
defopt.run(main)
import sys
import defopt
import re
from pyfaidx import Fasta
from pathlib import Path
def main(fasta: Path, *, output_path: Path="homopolyers.bed", min_size: int=3):
"""
create a bed file of homopolymer (length 3 or more) regions given a fasta file
param fasta: path to fasta file
param output_path: path to output bed file
param min_size: minimum homopolymer length to include
"""
assert min_size >= 3, "must specify homopolymer length greater than 2"
p = re.compile(r'(A{%d,}|T{%d,}|G{%d,}|C{%d,})' % tuple([min_size]*4))
f = Fasta(fasta, as_raw=True)
out = sys.stdout if str(output_path) in ("-", "/dev/stdout") else open(output_path, "wt")
for chrom in f.keys():
seq = f[chrom][:]
for m in p.finditer(seq):
start, end = m.span()
s = m.group()
assert end - start == len(s)
assert len(set(s)) == 1
print(f'{chrom}\t{start}\t{end}\t{s[0]}\t{len(s)}', file=out)
print(f'wrote to {output_path}', file=sys.stderr)
if __name__ == "__main__":
defopt.run(main)
import defopt
import typing
import pathlib
import itertools as it
import operator as op
import heapq
import gzip
import os
import sys
from dataclasses import dataclass, field
from typing import Any
@dataclass(order=False)
class PerBase:
REF: str=field(compare=False)
POS: int
REF_BASE: str=field(compare=False)
DEPTH: int=field(compare=False)
A: int=field(compare=False)
C: int=field(compare=False)
G: int=field(compare=False)
T: int=field(compare=False)
N: int=field(compare=False)
INS: int=field(compare=False)
DEL: int=field(compare=False)
REF_SKIP: int=field(compare=False)
FAIL: int=field(compare=False)
i: int=field(compare=False, default=0)
tid: int=field(default=0) # compared!
sample: str=field(compare=False, default="UNKNOWN")
def __lt__(self, o):
if self.tid != o.tid:
return op.lt(self.tid, o.tid)
return op.lt(self.POS, o.POS)
def __gt__(self, o):
if self.tid != o.tid:
return op.gt(self.tid, o.tid)
return op.gt(self.POS, o.POS)
def sample_name(path):
n = os.path.basename(path)
for ext in [".gz", ".txt", ".perbase"]:
if n.endswith(ext):
n = n[:-len(ext)]
return n
def next_line(fh, header, int_fields):
try:
l = next(fh)
d = dict(zip(header, l.strip().split("\t")))
for f in int_fields:
d[f] = int(d[f])
return PerBase(**d)
except StopIteration:
return None
def lines(handles, chroms: typing.List[str]):
int_fields = "DEPTH A C G T N INS DEL REF_SKIP FAIL POS".split()
chroms = {c: i for i, c in enumerate(chroms)}
heap = []
header = None
for i, h in enumerate(handles):
l = next(h)
assert l[0] == 'R' # starts with REF
header = l.strip().split("\t")
header.remove("NEAR_MAX_DEPTH")
d = next_line(h, header, int_fields)
d.i = i
d.tid = chroms[d.REF]
d.sample = sample_name(h.name)
heapq.heappush(heap, d)
while len(heap) > 0:
d = heapq.heappop(heap)
yield d
i, sample = d.i, d.sample
d = next_line(handles[i], header, int_fields)
if d is None: continue
d.i, d.sample = i, sample
d.tid = chroms[d.REF]
heapq.heappush(heap, d)
vcf_header = """\
##fileformat=VCFv4.2
%s
##FILTER=<ID=PASS,Description="All filters passed">
##INFO=<ID=A,Number=1,Type=Integer,Description="count of A's at this site">
##INFO=<ID=C,Number=1,Type=Integer,Description="count of C's at this site">
##INFO=<ID=T,Number=1,Type=Integer,Description="count of T's at this site">
##INFO=<ID=G,Number=1,Type=Integer,Description="count of G's at this site">
##INFO=<ID=N,Number=1,Type=Integer,Description="count of G's at this site">
##INFO=<ID=REF_SKIP,Number=1,Type=Integer,Description="count of REF_SKIP's at this site">
##INFO=<ID=INS,Number=1,Type=Integer,Description="count of INS's at this site">
##INFO=<ID=DEL,Number=1,Type=Integer,Description="count of DEL's at this site">
##INFO=<ID=FAIL,Number=1,Type=Integer,Description="count of FAIL's at this site">
##INFO=<ID=DEPTH,Number=1,Type=Integer,Description="DEPTH at this site">
##FORMAT=<ID=A,Number=1,Type=Integer,Description="count of A's at this site">
##FORMAT=<ID=C,Number=1,Type=Integer,Description="count of C's at this site">
##FORMAT=<ID=T,Number=1,Type=Integer,Description="count of T's at this site">
##FORMAT=<ID=G,Number=1,Type=Integer,Description="count of G's at this site">
##FORMAT=<ID=N,Number=1,Type=Integer,Description="count of G's at this site">
##FORMAT=<ID=REF_SKIP,Number=1,Type=Integer,Description="count of REF_SKIP's at this site">
##FORMAT=<ID=INS,Number=1,Type=Integer,Description="count of INS's at this site">
##FORMAT=<ID=DEL,Number=1,Type=Integer,Description="count of DEL's at this site">
##FORMAT=<ID=FAIL,Number=1,Type=Integer,Description="count of FAIL's at this site">
##FORMAT=<ID=DEPTH,Number=1,Type=Integer,Description="DEPTH at this site">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT """
def main(*, fai: pathlib.Path, files:typing.List[pathlib.Path], min_depth:int=0, min_alt_depth:int=1):
"""
:param list[pathlib.Path] files: perbase files output with -z (zero-based)
:param int min_depth: only display sites where at least 1 sample has this depth
:param int min_alt_depth: only display sites where at least 1 sample has this depth for a non-reference allele
"""
handles = [gzip.open(f, 'rt') for f in files]
samples = [sample_name(h.name) for h in handles]
chroms = [l.strip().split("\t")[0] for l in open(fai)]
header_chroms = ['##contig=<ID=%s>' % c for c in chroms]
print(vcf_header % "\n".join(header_chroms), end="")
print("\t".join(samples))
all_alleles = set("ACTG")
fields = ['A', 'C', 'T', 'G', 'N', 'REF_SKIP', 'INS', 'DEL', 'FAIL']
for g, ls in it.groupby(lines(handles, chroms), op.attrgetter('REF', 'POS')):
ls = list(ls)
if min_depth > 0 and max(l.DEPTH for l in ls) < min_depth: continue
alts = all_alleles - set(ls[0].REF_BASE)
by_sample = {l.sample: l for l in ls}
for alt in alts:
if max(getattr(l, alt) for l in ls) < min_alt_depth: continue
line = [ls[0].REF, ls[0].POS + 1, '.', ls[0].REF_BASE, alt, 10, "."]
info = {}
fmt = {}
for sample in samples:
if not sample in by_sample:
fmt[sample] = ":".join(['.'] * len(fields))
else:
fmt[sample] = ":".join([str(getattr(by_sample[sample], f)) for f in fields])
for field in fields:
if not field in info: info[field] = 0
info[field] += getattr(by_sample[sample], field)
line.append(";".join(f"{n}={v}" for n, v in info.items()))
line.append(":".join(fields))
line.extend(fmt[s] for s in samples)
print("\t".join(map(str, line)))
if __name__ == '__main__':
defopt.run(main)
set -euo pipefail
mkdir -p perbase-bq0
#for bam in sperm-seq-fgbio-updated-params/filterconsensusreads/*.bam; do
# sample=$(basename $bam .cons.filtered.bam)
for bam in bq0-bams/*.bam; do
sample=$(basename $bam .bq0.bam)
o=perbase-bq0/$sample.perbase.txt.gz
time perbase base-depth -Z -M -z -f 2 -Q 35 -q 60 -r $ref38 $bam -o $o
tabix -f --zero-based -S 1 -s 1 -b 2 -e 2 $o
done
python perbase-to-vcf.py --fai ${ref38}.fai --files perbase-bq0/*.txt.gz --min-depth 200 \
| bgzip -c > sperm-seq.perbase.bq0.vcf.gz && bcftools index sperm-seq.perbase.bq0.vcf.gz
echo '
[[annotation]]
file="fraguracy-bq0/frag-total-errors.bed.gz"
columns=[5]
names=["frag_errors"]
ops=["max"]
' > frag-conf.toml
#slivar expr \
export SLIVAR_SUMMARY_FILE=sperm-seq.slivar.counts.tsv
vcfanno frag-conf.toml sperm-seq.perbase.bq0.vcf.gz 2>vcfanno.err \
| ../slivar/src/slivar expr \
--js sperm-slivar.js \
-g ../gnomad.hg38.genomes.v3.fix.zip \
--pass-only \
-v /dev/stdin \
--info 'INFO.N / (INFO.A + INFO.T + INFO.C + INFO.G + INFO.N) < 0.05 && (!("frag_errors" in INFO) || INFO.frag_errors < 5)' \
--sample-expr 'inherited:inherited(sample, variant.ALT[0])' \
--sample-expr 'gonosomal:gonosomal(sample, variant.ALT[0])' \
--sample-expr 'clonal:clonal(sample, variant.ALT[0])' \
--sample-expr 'nonclonal:nonclonal(sample, variant.ALT[0])' \
-o /dev/stdout \
| SLIVAR_SUMMARY_FILE=sperm-seq.slivar.group.counts.tsv ../slivar/src/slivar expr \
-v /dev/stdin \
--alias sperm-seq-full-groups.txt \
--js sperm-slivar.js \
--group-expr 'sperm_only_inherited:inherited(sperm, variant.ALT[0]) && absent_in(blood, variant.ALT[0])' \
--group-expr 'sperm_only_gonosomal:gonosomal(sperm, variant.ALT[0]) && absent_in(blood, variant.ALT[0])' \
--group-expr 'sperm_only_clonal:clonal(sperm, variant.ALT[0]) && absent_in(blood, variant.ALT[0])' \
--group-expr 'sperm_only_nonclonal:nonclonal(sperm, variant.ALT[0]) && absent_in(blood, variant.ALT[0])' \
-o sperm-seq.slivar.bq0.vcf.gz
import cyvcf2
import defopt
import numpy as np
def main(vcf: str, outpath: str):
"""
:param vcf: path to vcf annotated with slivar
:param output: path to output TSV
"""
vcf = cyvcf2.VCF(vcf)
ofh = open(outpath, 'w')
vcf_samples = {s: i for i, s in enumerate(vcf.samples)}
fields = ['inherited', 'gonosomal', 'clonal', 'nonclonal',
'sperm_only_inherited', 'sperm_only_gonosomal', 'sperm_only_clonal',
'sperm_only_nonclonal']
print('variant\tsample\tclass\tgnomad_af\tgnomad_filtered\tAF\talt_count\tref_count\tN_count\tfraguracy_errors', file=ofh)
for v in vcf:
for f in fields:
samples = v.INFO.get(f)
if samples is None: continue
samples = samples.split(',')
gnomad_af = v.INFO.get('gnomad_popmax_af', -1)
frag_errs = v.INFO.get('frag_errors', 0)
gnomad_filter = str('gnomad_popmax_af_filter' in v.INFO).lower()
alts = np.maximum(0, v.format(v.ALT[0])[:, 0])
refs = np.maximum(0, v.format(v.REF)[:, 0])
Ns = v.format('N')[:, 0]
for sample in samples:
sample_idx = vcf_samples[sample]
ref_count, alt_count, n_count = refs[sample_idx], alts[sample_idx], Ns[sample_idx]
af = alt_count / np.maximum(1, alt_count + ref_count)
print(f'{v.CHROM}:{v.POS}:{v.REF}:{v.ALT[0]}\t{sample}\t{f}\t{gnomad_af:.4g}\t{gnomad_filter}\t{af:.5g}\t{alt_count}\t{ref_count}\t{n_count}\t{frag_errs}', file=ofh)
if __name__ == "__main__":
defopt.run(main)
function AF(sample, alt) {
var alt_count = Math.max(0, sample[alt])
var tot_count = Math.max(0, sample.A) + Math.max(0, sample.C) + Math.max(0, sample.G) + Math.max(0, sample.T) // + sample.N
return alt_count / tot_count;
}
function inherited(sample, alt) {
return AF(sample, alt) >= 0.2
}
function gonosomal(sample, alt) {
return AF(sample, alt) >= 0.05 && AF(sample, alt) < 0.2
}
function clonal(sample, alt) {
return AF(sample, alt) < 0.05 && sample[alt] > 1
}
function nonclonal(sample, alt) {
return AF(sample, alt) < 0.05 && sample[alt] == 1
}
function absent_in(sample, alt) {
return sample[alt] == 0
}
import pandas as pd
import sys
df = pd.read_csv('Spermseq_Sample_Tracker_Twinstrand.csv')
print('#sperm\tblood')
for g, rows in df.groupby('Donor_ID'):
# drop .1 suffix
samples = [x[:-2] for x in rows['Sample_ID']]
tissues = list(rows['Tissue'])
d = dict(zip(tissues, samples))
print(f'{d.get("Sperm_DNA", "")}\t{d.get("Blood_DNA", "")}')
import sys
import pandas as pd
df = pd.read_csv(sys.argv[1], sep="\t")
sub = df[ (df.alt_count_v2 > 1) & (df.tissue == "Sperm_DNA") &
(df.num_donors_blood > 0)
& ((df.inherited_blood_donor_ct > 0) | (df.inherited_sperm_and_blood_donor_ct))
& (df.error_prone_status != "error_prone_site") #&
& (df.var_filter == "PASS")
& (df['fgbio-alt-count'] > 0)
]
print(sub.shape, file=sys.stderr)
print("sample\tfilter\tcount")
for (sample, filter), f in sub.groupby(['sample', 'var_filter']):
n_vars = len(f['var_coord'].unique())
print(f'{sample}\t{filter}\t{n_vars}')
sub.to_csv("filt.tsv", sep="\t", index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment