Skip to content

Instantly share code, notes, and snippets.

@tkrahn
Created May 13, 2019 16:14
Show Gist options
  • Save tkrahn/3c24c4b8896e999f0161f7d0137444de to your computer and use it in GitHub Desktop.
Save tkrahn/3c24c4b8896e999f0161f7d0137444de to your computer and use it in GitHub Desktop.
#!/bin/bash
START=$(date +%s.%N)
clear
# setup parameters
YSEQID=${PWD##*/}
# YSEQID="1234" # (the above command simply gets the name of the last segment of the current working directory)
NUM_THREADS=$(getconf _NPROCESSORS_ONLN)
echo "We can use ${NUM_THREADS} threads."
REF="/genomes/0/refseq/hg19/hg19.fa"
READS_1="fastq/${YSEQID}_R1.fastq.gz"
READS_2="fastq/${YSEQID}_R2.fastq.gz"
BAMFILE="${YSEQID}_bwa-mem_hg19.bam"
BAMFILE_SORTED="${YSEQID}_bwa-mem_hg19_sorted.bam"
VCF_FILE="${YSEQID}_hg19.vcf"
REF_23ANDME="23andMe_all_hg19_ref.tab"
# Prepare newest 23andMe Reference from their API:
wget -O 23andMe_all_hg19_raw.tab https://api.23andme.com/1/genome_snp_map/
echo "#CHROM POS ID" >23andMe_all_hg19_ref.tab
while IFS=$'\t' read -r index snp chromosome chromosome_position
do
if [[ $index == \#* || $index =~ ^index ]]; then
echo "skipping $index"
else
CHROM=${chromosome//MT/M}
echo "chr${CHROM} ${chromosome_position} ${snp}" >>23andMe_all_hg19_unsorted.tab
fi
done < 23andMe_all_hg19_raw.tab
sort -t $'\t' -k1,2 -V 23andMe_all_hg19_unsorted.tab > 23andMe_all_hg19_sorted.tab
cat 23andMe_all_hg19_sorted.tab >> 23andMe_all_hg19_ref.tab
bgzip -c 23andMe_all_hg19_ref.tab > 23andMe_all_hg19_ref.tab.gz
tabix -s1 -b2 -e2 23andMe_all_hg19_ref.tab.gz
#rm -f 23andMe_all_hg19_raw.tab
#rm -f 23andMe_all_hg19_unsorted.tab
#rm -f 23andMe_all_hg19_sorted.tab
#rm -f 23andMe_all_hg19_ref.tab
# Pipeline commands:
bwa mem -t $NUM_THREADS -M $REF $READS_1 $READS_2 | \
samtools view -@ $NUM_THREADS -b -t $REF -o $BAMFILE -
samtools sort -@ $NUM_THREADS -T /usr/local/geospiza/var/tmp/sorted -o $BAMFILE_SORTED $BAMFILE
samtools index $BAMFILE_SORTED
samtools idxstats $BAMFILE_SORTED > ${BAMFILE_SORTED}.idxstats.tsv
#samtools stats $BAMFILE_SORTED > ${BAMFILE_SORTED}.stats
# Delete no longer needed large files
#rm -f $SAMFILE
rm -f $BAMFILE # keep $BAMFILE_SORTED
# Easy tview file
echo "#!/bin/bash" > tview_${YSEQID}.sh
echo "samtools tview ${BAMFILE_SORTED} ${REF}" >> tview_${YSEQID}.sh
chmod a+x tview_${YSEQID}.sh
# Generate 23andMe mockup file
samtools mpileup -C 50 -v -l ${REF_23ANDME}.gz -f $REF $BAMFILE_SORTED > 23andMe_raw_${VCF_FILE}.gz
tabix -p vcf 23andMe_raw_${VCF_FILE}.gz
bcftools call -O z -V indels -m -P 0 23andMe_raw_${VCF_FILE}.gz > 23andMe_called_${VCF_FILE}.gz
tabix -p vcf 23andMe_called_${VCF_FILE}.gz
bcftools annotate -O z -a ${REF_23ANDME}.gz -c CHROM,POS,ID 23andMe_called_${VCF_FILE}.gz > 23andMe_annotated_${VCF_FILE}.gz
tabix -p vcf 23andMe_annotated_${VCF_FILE}.gz
bcftools query -f '%ID\t%CHROM\t%POS[\t%TGT]\n' 23andMe_annotated_${VCF_FILE}.gz | \
sed 's/chr//' | \
sed 's/\tM\t/\tMT\t/' | \
sed 's/\///' | \
sed 's/\.\.$/--/' | \
sed 's/TA$/AT/' | \
sed 's/TC$/CT/' | \
sed 's/TG$/GT/' | \
sed 's/GA$/AG/' | \
sed 's/GC$/CG/' | \
sed 's/CA$/AC/' > ${YSEQID}_23andMe_all_hg19.tab
sort -t $'\t' -k2,3 -V ${YSEQID}_23andMe_all_hg19.tab > ${YSEQID}_23andMe_all_hg19_sorted.tab
DATE=$(date +%Y-%m-%d_%H:%M:%S)
# 23andMe header
echo "# This data file generated by 23andMe at: ${DATE}" > ${YSEQID}_23andMe_all_hg19.txt
echo '#' >> ${YSEQID}_23andMe_all_hg19.txt
echo '# This file contains raw genotype data, including data that is not used in 23andMe reports.' >> ${YSEQID}_23andMe_all_hg19.txt
echo '# This data has undergone a general quality review however only a subset of markers have been ' >> ${YSEQID}_23andMe_all_hg19.txt
echo '# individually validated for accuracy. As such, this data is suitable only for research, ' >> ${YSEQID}_23andMe_all_hg19.txt
echo '# educational, and informational use and not for medical or other use.' >> ${YSEQID}_23andMe_all_hg19.txt
echo '# ' >> ${YSEQID}_23andMe_all_hg19.txt
echo '# Below is a text version of your data. Fields are TAB-separated' >> ${YSEQID}_23andMe_all_hg19.txt
echo '# Each line corresponds to a single SNP. For each SNP, we provide its identifier ' >> ${YSEQID}_23andMe_all_hg19.txt
echo '# (an rsid or an internal id), its location on the reference human genome, and the ' >> ${YSEQID}_23andMe_all_hg19.txt
echo '# genotype call oriented with respect to the plus strand on the human reference sequence.' >> ${YSEQID}_23andMe_all_hg19.txt
echo '# We are using reference human assembly build 37 (also known as Annotation Release 104).' >> ${YSEQID}_23andMe_all_hg19.txt
echo '# Note that it is possible that data downloaded at different times may be different due to ongoing ' >> ${YSEQID}_23andMe_all_hg19.txt
echo '# improvements in our ability to call genotypes. More information about these changes can be found at:' >> ${YSEQID}_23andMe_all_hg19.txt
echo '# https://www.23andme.com/you/download/revisions/' >> ${YSEQID}_23andMe_all_hg19.txt
echo '# ' >> ${YSEQID}_23andMe_all_hg19.txt
echo '# More information on reference human assembly build 37 (aka Annotation Release 104):' >> ${YSEQID}_23andMe_all_hg19.txt
echo '# http://www.ncbi.nlm.nih.gov/mapview/map_search.cgi?taxid=9606' >> ${YSEQID}_23andMe_all_hg19.txt
echo '#' >> ${YSEQID}_23andMe_all_hg19.txt
echo '# rsid chromosome position genotype' >> ${YSEQID}_23andMe_all_hg19.txt
cat ${YSEQID}_23andMe_all_hg19_sorted.tab >> ${YSEQID}_23andMe_all_hg19.txt
zip ${YSEQID}_23andMe_all_hg19.zip ${YSEQID}_23andMe_all_hg19.txt
END=$(date +%s.%N)
DIFF=$(echo "$END - $START" | bc)
echo ${DIFF}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment