Skip to content

Instantly share code, notes, and snippets.

@nievergeltlab
Last active January 15, 2021 18:25
Show Gist options
  • Save nievergeltlab/9a8ce64461ed31ad97d3bb2901977dec to your computer and use it in GitHub Desktop.
Save nievergeltlab/9a8ce64461ed31ad97d3bb2901977dec to your computer and use it in GitHub Desktop.
MVP summary statistics do not have rs-ids. This converts the sumstats to a vcf format then annotates the VCFs using SNP information from UCSC (or maybe it was dbSNP? file is called All_20180423.vcf.gz, just google it)
#LC_ALL=C join <(awk '{print $1}' w_hm3.snplist.sorted) <(awk '{print $22,$1}' eur_broad_mar172020_allchr.broad_tinnitus.maf01.resultsa.txt.annotated | LC_ALL=C sort -k1b,1) | awk '{print $2,$1}' | LC_ALL=C sort -k1b,1 > w_hm3.snplist.linked
for anc in his # eur aam asn his # "$anc"
do
#important to use ref and alt1 for this!
cat vcf_header1b.txt vcf_header2.txt <(awk '{OFS="\t"}{if (NR>1) print $1,$2, ".", $4,$5 , "100", "PASS", "MVP="$3}' "$anc"_broad_mar172020_allchr.broad_tinnitus.maf01.resultsa | sort -n -k1r,1 -k2,2 ) > "$anc"_broad_mar172020_allchr.broad_tinnitus.maf01.resultsa.vcf
bcftools view "$anc"_broad_mar172020_allchr.broad_tinnitus.maf01.resultsa.vcf -Oz -o "$anc"_broad_mar172020_allchr.broad_tinnitus.maf01.resultsa.vcf.gz
tabix -f "$anc"_broad_mar172020_allchr.broad_tinnitus.maf01.resultsa.vcf.gz
#Some will not be annotated, need to identify these for my merge steps..
bcftools annotate -a All_20180423.vcf.gz -c INFO "$anc"_broad_mar172020_allchr.broad_tinnitus.maf01.resultsa.vcf.gz -o "$anc"_broad_mar172020_allchr.broad_tinnitus.maf01.resultsa.vcf.gz.annotated
echo "ID SNP" | awk 'BEGIN {OFS="\t"} {print $1,$2}' > snpheader.txt
tail -n+82 "$anc"_broad_mar172020_allchr.broad_tinnitus.maf01.resultsa.vcf.gz.annotated | grep RS | awk '{print $8}' | awk 'BEGIN {FS=";"}{OFS="\t"} {print $1,$2}' | sed 's/RS=/rs/g' | sed 's/MVP=//g' | cat snpheader.txt - | LC_ALL=C sort -k1b,1 > "$anc"_broad_mar172020_allchr.broad_tinnitus.maf01.resultsa.vcf.gz.annotated.success
tail -n+82 "$anc"_broad_mar172020_allchr.broad_tinnitus.maf01.resultsa.vcf.gz.annotated | grep -v RS | awk '{print $8}' | awk 'BEGIN {FS=";"}{OFS="\t"} {print $1,$1}' | sed 's/MVP=//g' | cat snpheader.txt - | LC_ALL=C sort -k1b,1 > "$anc"_broad_mar172020_allchr.broad_tinnitus.maf01.resultsa.vcf.gz.annotated.failed
wc -l "$anc"_broad_mar172020_allchr.broad_tinnitus.maf01.resultsa
wc -l "$anc"_broad_mar172020_allchr.broad_tinnitus.maf01.resultsa.vcf.gz.annotated
wc -l "$anc"_broad_mar172020_allchr.broad_tinnitus.maf01.resultsa.vcf.gz.annotated.success
wc -l "$anc"_broad_mar172020_allchr.broad_tinnitus.maf01.resultsa.vcf.gz.annotated.failed
LC_ALL=C join -1 1 -2 3 "$anc"_broad_mar172020_allchr.broad_tinnitus.maf01.resultsa.vcf.gz.annotated.success <(LC_ALL=C sort -k3b,3 "$anc"_broad_mar172020_allchr.broad_tinnitus.maf01.resultsa) > "$anc"_broad_mar172020_allchr.broad_tinnitus.maf01.resultsa.success
LC_ALL=C join -1 1 -2 3 "$anc"_broad_mar172020_allchr.broad_tinnitus.maf01.resultsa.vcf.gz.annotated.failed <(LC_ALL=C sort -k3b,3 "$anc"_broad_mar172020_allchr.broad_tinnitus.maf01.resultsa) > "$anc"_broad_mar172020_allchr.broad_tinnitus.maf01.resultsa.failed
wc -l "$anc"_broad_mar172020_allchr.broad_tinnitus.maf01.resultsa.success
wc -l "$anc"_broad_mar172020_allchr.broad_tinnitus.maf01.resultsa.failed
cat "$anc"_broad_mar172020_allchr.broad_tinnitus.maf01.resultsa.success "$anc"_broad_mar172020_allchr.broad_tinnitus.maf01.resultsa.failed | grep -v 2:27849893:CTGT | sort -g -k 22 | awk '{if(NR>1) print $2,$3,$4,$8,$9,$10,$13,$16,$17,$18,$21,$22}' | grep -v ID | gzip > "$anc"_broad_mar172020_allchr.broad_tinnitus.maf01.resultsa.fuma.gz
done
##fileformat=VCFv4.0
##fileDate=09052019
##source=mvpGWAS
##phasing=partial
##reference=GRCh37.p13
##INFO=<ID=MVP,Number=.,Type=String,Description=MVP allele>
#CHROM POS ID REF ALT QUAL FILTER INFO
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment