Skip to content

Instantly share code, notes, and snippets.

@tkrahn
Created April 27, 2021 13:24
Show Gist options
  • Save tkrahn/6023b4ebc6ce6dfecdd3a5060843f307 to your computer and use it in GitHub Desktop.
Save tkrahn/6023b4ebc6ce6dfecdd3a5060843f307 to your computer and use it in GitHub Desktop.
How to extract YSTR markers from WGS400 using Last and tandem-genotypes
#!/bin/bash
START=$(date +%s.%N)
clear
# setup parameters
YSEQID=${PWD##*/}
# YSEQID="1234" # (the above command simply gets the name of the last segment of the current working directory)
NUM_THREADS=$(getconf _NPROCESSORS_ONLN)
echo "We can use ${NUM_THREADS} threads."
# Whole genome
# REF="/genomes/0/refseq/hg38/last/hg38.fa"
# REFDB="/genomes/0/refseq/hg38/last/mydb"
# ChrY only
REF="/genomes/0/refseq/hg38/last/hg38_chrY.fa"
REFDB="/genomes/0/refseq/hg38/last/chrYdb"
# If the reference is not yet indexed, we need to index it with lastdb:
# lastdb -P ${NUM_THREADS} -uNEAR ${REFDB} ${REF}
REFPAR="${YSEQID}_last.par"
# YSTR_hg38.bed is available here below vv
MICROSAT="/genomes/0/refseq/hg38/last/YSTR_hg38.bed"
CHRY_BAMFILE_INPUT="/genomes/arch/${YSEQID}/${YSEQID}_bwa-mem_hg38_chrY.bam"
READS="${YSEQID}_chrY_reads.fasta"
# Extract just the chrY reads in fasta format
samtools fasta ${CHRY_BAMFILE_INPUT} >${READS}
# separate a few training reads
head -n 100000 ${READS} >training_reads.fa
MAF="${YSEQID}_last_hg38.maf"
SAMFILE="${YSEQID}_last_hg38.sam"
BAMFILE="${YSEQID}_last_hg38.bam"
BAMFILE_SORTED="${YSEQID}_last_hg38_sorted.bam"
TANDEM_GENOTYPES="${YSEQID}_tandem_genotypes.tsv"
# Last DNA-DNA mapping software is available here:
# https://gitlab.com/mcfrith/last
# Train the substitution and gap rates (if necessary)
last-train -P${NUM_THREADS} -Q0 ${REFDB} training_reads.fa > ${REFPAR}
rm -f training_reads.fa
# Pipeline commands:
lastal -P${NUM_THREADS} -p ${REFPAR} ${REFDB} ${READS} | \
last-split -fMAF > ${MAF}
# Tandem-Genotypes is available here:
# https://github.com/mcfrith/tandem-genotypes
# Run Tandem-Genotypes
tandem-genotypes -f 7 ${MICROSAT} ${MAF} > ${TANDEM_GENOTYPES}
tandem-genotypes-plot -c6 -r21 ${TANDEM_GENOTYPES}
END=$(date +%s.%N)
DIFF=$(echo "$END - $START" | bc)
echo ${DIFF}
chrY 24005952 24006099 CCTT CDY 36
chrY 3284190 3284260 TATTT DXYS156 12
chrY 18278507 18278547 AAC DYF395 15
chrY 22022402 22022434 AAT DYF397 11
chrY 22950279 22950378 AAAG DYF399 24
chrY 23739648 23739707 GAAA DYF401 15
chrY 21681709 21681748 TATC DYF406S1 10
chrY 17912087 17912121 ATAG DYF408 8
chrY 18486683 18486740 AAAGG DYF411 11
chrY 20731419 20731492 CTTT DYR1 17
chrY 17388715 17388774 GAAA DYR33 15
chrY 12080037 12080096 TAGA DYR76 15
chrY 19302303 19302351 GAAA DYR112 12
chrY 11963243 11963291 ATAG DYR150 12
chrY 15211391 15211447 GAAG DYR157 14
chrY 9684380 9684443 TATC DYS19 15
chrY 18680632 18680687 GAAA DYS385 11
chrY 12635604 12635639 AAT DYS388 12
chrY 12500564 12500611 GATA DYS389B 16
chrY 12500448 12500516 GATA DYS389B.2 12
chrY 12500449 12500495 GATA DYS389I 12
chrY 15163067 15163162 TAGA DYS390 24
chrY 11982089 11982132 TCTA DYS391 11
chrY 20471987 20472025 AAT DYS392 13
chrY 3263111 3263158 AGAT DYS393 12
chrY 13987208 13987253 TG DYS413 23
chrY 18041622 18041651 TGT DYS425 10
chrY 17022970 17023005 TGT DYS426 12
chrY 12345806 12345841 ATCT DYS434 9
chrY 12384503 12384538 TGGA DYS435 11
chrY 13091948 13091983 AAC DYS436 12
chrY 12346267 12346326 TCTA DYS437 15
chrY 12825899 12825948 CTTTT DYS438 12
chrY 12403517 12403566 AGAT DYS439 12
chrY 12869907 12869966 TTCC DYS441 13
chrY 12649172 12649237 GATA DYS442 12
chrY 17114312 17114367 ATAG DYS444 13
chrY 19930716 19930763 TTTA DYS445 12
chrY 3263417 3263486 TCTCT DYS446 14
chrY 13166829 13166953 TATAT DYS447 23
chrY 22218924 22219078 AGAGAT DYS448 19
chrY 8349973 8350138 TTTC DYS449 30
chrY 8258259 8258303 TTTTA DYS450 8
chrY 19458592 19458746 TACTA DYS452 31
chrY 8356115 8356158 AAAT DYS454 11
chrY 7043528 7043571 AAAT DYS455 11
chrY 4402919 4402978 AGAT DYS456 15
chrY 7999839 7999902 GAAA DYS458 16
chrY 23932704 23932743 TTAT DYS459 10
chrY 18888956 18888995 TATC DYS460 10
chrY 18888804 18888851 TATC DYS461 12
chrY 19155161 19155204 TACA DYS462 11
chrY 7775468 7775587 GAAAG DYS463 24
chrY 24941464 24941523 CCTT DYS464 15
chrY 12500662 12500716 GATA DYS467 14
chrY 14396604 14396627 TAA DYS472 8
chrY 8558337 8558402 CTT DYS481 22
chrY 19937748 19937795 TTA DYS485 15
chrY 9046133 9046171 TTA DYS487 13
chrY 3575724 3575759 TTA DYS490 12
chrY 15302457 15302489 ATT DYS492 12
chrY 19224282 19224311 TTA DYS494 10
chrY 12899385 12899434 ATT DYS495 16
chrY 16053588 16053629 TTA DYS497 14
chrY 3035192 3035263 TCCT DYS504 18
chrY 3772790 3772837 TCCT DYS505 12
chrY 15188019 15188083 TAGA DYS510 17
chrY 15193041 15193080 ATAG DYS511 9
chrY 15329904 15329951 TATC DYS513 12
chrY 15208001 15208217 AAGA DYS518 38
chrY 7862381 7862470 GATA DYS520 19
chrY 7547584 7547623 GATA DYS522 10
chrY 7208814 7208853 TAGA DYS525 10
chrY 8598154 8598197 AAAT DYS531 11
chrY 8511105 8511164 TTCT DYS532 15
chrY 16281346 16281393 ATCT DYS533 12
chrY 16281096 16281155 CTTT DYS534 15
chrY 17246970 17247009 TCTA DYS537 10
chrY 16453355 16453402 TTAT DYS540 12
chrY 19358338 19358389 GATA DYS549 13
chrY 11981555 11981692 TCTA DYS552 24
chrY 20439567 20439610 AATA DYS556 11
chrY 21072786 21072889 TTTC DYS557 16
chrY 14452471 14452514 GATA DYS561 15
chrY 14414852 14414895 ATAA DYS565 12
chrY 8954511 8954554 AAAT DYS568 11
chrY 6993190 6993257 TTTC DYS570 17
chrY 3811619 3811658 AAAT DYS572 10
chrY 7568216 7568255 AAAT DYS575 10
chrY 7185318 7185385 AAAG DYS576 17
chrY 20400678 20400713 AAAT DYS578 9
chrY 15916825 15916915 ATACA DYS587 18
chrY 22339546 22339610 TTATT DYS589 12
chrY 8687939 8687978 TTTTG DYS590 8
chrY 16473882 16473956 AAAAC DYS593 15
chrY 19494951 19495000 AAATA DYS594 10
chrY 16302502 16302577 GAAG DYS607 15
chrY 8633472 8633762 CTT DYS614 18
chrY 16969638 16969673 TTA DYS617 12
chrY 22270860 22270932 AAAG DYS626 27
chrY 13797171 13797206 CATT DYS632 9
chrY 12258860 12258951 ATAG DYS635 23
chrY 20472971 20473014 TTTA DYS636 12
chrY 15533611 15533654 ATTT DYS638 11
chrY 3411661 3411696 AAAT DYS640 11
chrY 14022416 14022455 TAAA DYS641 10
chrY 15314132 15314186 CTTTT DYS643 11
chrY 15669604 15669694 TTTTA DYS644 16
chrY 10015306 10015373 AAGG DYS650 18
chrY 56887113 56887142 GTT DYS655 10
chrY 7849529 7849775 CTTTC DYS684 57
chrY 17324793 17324930 AAAG DYS710 34.2
chrY 8465426 8465675 CTT DYS711 64
chrY 13446553 13446620 AGAT DYS712 22
chrY 7964032 7964245 TCTT DYS713 42
chrY 19985845 19985979 TTTCT DYS714 27
chrY 15550275 15550423 ATGG DYS715 26
chrY 10629615 10629699 CTCCA DYS716 27
chrY 15201365 15201444 TGTAT DYS717 20
chrY 15170062 15170137 GATA DYS723 20
chrY 23104466 23104537 GT DYS725 32
chrY 11446663 11446765 AAGG DYS726 12
chrY 11910967 11911130 TTTC DYS727 13
chrY 18642119 18642156 TG DYS728 19
chrY 16606999 16607058 TATC Y-GATA-A10 13
chrY 16631673 16631720 TCTA Y-GATA-H4 12
chrY 10687563 10687627 TTCCA Y-GGAAT-1B07 10
chrY 17510231 17510276 CA YCAII 23
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment