Skip to content

Instantly share code, notes, and snippets.

@jsun
Last active February 3, 2019 22:49
Show Gist options
  • Save jsun/70df6e0bcd5f8e63bf1e02d0785d5e14 to your computer and use it in GitHub Desktop.
Save jsun/70df6e0bcd5f8e63bf1e02d0785d5e14 to your computer and use it in GitHub Desktop.
#!/bin/bash
#$ -l rt_F=10
#$ -l h_rt=24:00:00
#$ -j y
#$ -cwd
#$ -N qn_makeGenomeIndex
pDwGenome=0 # download sequences and annotations from IWGSC server
pSplitGenomeSeq=0 # split whole genome sequences into A, B, and D subgenomes
pLAST=0 # use LAST to find homologoues sequences
pFindHomeologs=0 # summarise LAST rseults and find homeolog relations
pSTAR=1 # creat STAR index for data analyzing
# program env path
BIN=/home/username/local/bin # gffread, LAST(lastdb, lastal), STAR
SCRIPTS=/home/username/local/scripts # make_subgenome_fasta.py
EAGLE_SCRIPTS=/home/username/local/src/eagle/scripts # homeolog_genotypes.py, tablize.py, triple_homeolog
# project path
GENOME_DIR=/home/username/data/genome/IWGSC_RefSeq
INDEX_DIR=${GENOME_DIR}/index
CPU=10
# start process
mkdir -p ${GENOME_DIR}
mkdir -p ${INDEX_DIR}
cd ${GENOME_DIR}
# download genome files from IWGSC
# in this work, we use version 1.1 annotations
if [ ${pDwGenome} -eq 1 ]; then
wget https://urgi.versailles.inra.fr/download/iwgsc/IWGSC_RefSeq_Assemblies/v1.0/iwgsc_refseqv1.0_all_chromosomes.zip
wget https://urgi.versailles.inra.fr/download/iwgsc/IWGSC_RefSeq_Assemblies/v1.0/iwgsc_refseqv1.0_all_chromosomes.zip.md5.txt
unzip iwgsc_refseqv1.0_all_chromosomes.zip
wget https://urgi.versailles.inra.fr/download/iwgsc/IWGSC_RefSeq_Annotations/v1.1/iwgsc_refseqv1.1_genes_2017July06.zip
wget https://urgi.versailles.inra.fr/download/iwgsc/IWGSC_RefSeq_Annotations/v1.1/iwgsc_refseqv1.1_genes_2017July06.zip.md5.txt
unzip iwgsc_refseqv1.1_genes_2017July06.zip
fi
# split whole genome sequences into three parts of A, B, and D subgenomes
if [ ${pSplitGenomeSeq} -eq 1 ]; then
# make copies (links) of genome sequencse and annotations with simple names
ln -s iwgsc_refseqv1.0_all_chromosomes/161010_Chinese_Spring_v1.0_pseudomolecules.fasta cs.genome.fa
ln -s iwgsc_refseqv1.1_genes_2017July06/IWGSC_v1.1_HC_20170706.gtf cs.genome.gtf
ln -s iwgsc_refseqv1.1_genes_2017July06/IWGSC_v1.1_HC_20170706.gff3 cs.genome.gff3
# creat subgenome annotaitons
grep '^chr.A' cs.genome.gtf > cs.genome.chrA.gtf
grep '^chr.B' cs.genome.gtf > cs.genome.chrB.gtf
grep '^chr.D' cs.genome.gtf > cs.genome.chrD.gtf
# create subgenome transcript sequences with the annotations
# gffread -w: write a fasta file with spliced exons for each GFF transcript
# gffread -x: write a fasta file with spliced CDS for each GFF transcript
# ${BIN}/gffread -w cs.transcripts.fa -g cs.genome.fa cs.genome.gff3
${BIN}/gffread -x cs.cds.chrA.fa -g cs.genome.fa cs.genome.chrA.gtf
${BIN}/gffread -x cs.cds.chrB.fa -g cs.genome.fa cs.genome.chrB.gtf
${BIN}/gffread -x cs.cds.chrD.fa -g cs.genome.fa cs.genome.chrD.gtf
fi
# run LAST
if [ ${pLAST} -eq 1 ]; then
mkdir -p ${INDEX_DIR}
${BIN}/lastdb -uNEAR -R01 ${INDEX_DIR}/last_cs.cds.chrA cs.cds.chrA.fa
${BIN}/lastdb -uNEAR -R01 ${INDEX_DIR}/last_cs.cds.chrB cs.cds.chrB.fa
${BIN}/lastdb -uNEAR -R01 ${INDEX_DIR}/last_cs.cds.chrD cs.cds.chrD.fa
${BIN}/lastal ${INDEX_DIR}/last_cs.cds.chrA -P$CPU -D10000000000 cs.cds.chrB.fa | ${BIN}/last-map-probs -m 0.49 > A.B.maf
${BIN}/lastal ${INDEX_DIR}/last_cs.cds.chrB -P$CPU -D10000000000 cs.cds.chrA.fa | ${BIN}/last-map-probs -m 0.49 > B.A.maf
${BIN}/lastal ${INDEX_DIR}/last_cs.cds.chrB -P$CPU -D10000000000 cs.cds.chrD.fa | ${BIN}/last-map-probs -m 0.49 > B.D.maf
${BIN}/lastal ${INDEX_DIR}/last_cs.cds.chrD -P$CPU -D10000000000 cs.cds.chrB.fa | ${BIN}/last-map-probs -m 0.49 > D.B.maf
${BIN}/lastal ${INDEX_DIR}/last_cs.cds.chrA -P$CPU -D10000000000 cs.cds.chrD.fa | ${BIN}/last-map-probs -m 0.49 > A.D.maf
${BIN}/lastal ${INDEX_DIR}/last_cs.cds.chrD -P$CPU -D10000000000 cs.cds.chrA.fa | ${BIN}/last-map-probs -m 0.49 > D.A.maf
fi
# find homeolog relations
if [ ${pFindHomeologs} -eq 1 ]; then
python ${EAGLE_SCRIPTS}/homeolog_genotypes.py -o A.vs.B -f exon -g cs.genome.gtf A.B.maf B.A.maf
python ${EAGLE_SCRIPTS}/homeolog_genotypes.py -o B.vs.A -f exon -g cs.genome.gtf B.A.maf A.B.maf
python ${EAGLE_SCRIPTS}/homeolog_genotypes.py -o B.vs.D -f exon -g cs.genome.gtf B.D.maf D.B.maf
python ${EAGLE_SCRIPTS}/homeolog_genotypes.py -o D.vs.B -f exon -g cs.genome.gtf D.B.maf B.D.maf
python ${EAGLE_SCRIPTS}/homeolog_genotypes.py -o A.vs.D -f exon -g cs.genome.gtf A.D.maf D.A.maf
python ${EAGLE_SCRIPTS}/homeolog_genotypes.py -o D.vs.A -f exon -g cs.genome.gtf D.A.maf A.D.maf
# find homeologs
perl ${EAGLE_SCRIPTS}/triple_homeolog.pl A.vs.B.reciprocal_best B.vs.D.reciprocal_best A.vs.D.reciprocal_best > homeolog.ABD.list
# creat ID-exchange information to change chrB and chrD IDs inot chrA ID.
cut -f 1 homeolog.ABD.list > homeolog.A.list
awk '{print $2"\t"$1;}' homeolog.ABD.list > homeolog.B.list
awk '{print $3"\t"$1;}' homeolog.ABD.list > homeolog.D.list
# find subgenome-specific genes
cat A.vs.B.reciprocal_best A.vs.D.reciprocal_best | cut -f1 | sort | uniq > A.vs.all.list
cat B.vs.A.reciprocal_best B.vs.D.reciprocal_best | cut -f1 | sort | uniq > B.vs.all.list
cat D.vs.A.reciprocal_best D.vs.B.reciprocal_best | cut -f1 | sort | uniq > D.vs.all.list
grep $'mRNA\t' cs.genome.gff3 | grep $'chr.A\t' | perl -ne 'chomp; m/ID=(.*?);/; print "$1\n";' > chrA.cds.list
grep $'mRNA\t' cs.genome.gff3 | grep $'chr.B\t' | perl -ne 'chomp; m/ID=(.*?);/; print "$1\n";' > chrB.cds.list
grep $'mRNA\t' cs.genome.gff3 | grep $'chr.D\t' | perl -ne 'chomp; m/ID=(.*?);/; print "$1\n";' > chrD.cds.list
python ${EAGLE_SCRIPTS}/tablize.py -v0 A.vs.all.list chrA.cds.list > chrA.only.list
python ${EAGLE_SCRIPTS}/tablize.py -v0 B.vs.all.list chrB.cds.list > chrB.only.list
python ${EAGLE_SCRIPTS}/tablize.py -v0 D.vs.all.list chrD.cds.list > chrD.only.list
fi
# run STAR to creat index files
if [ ${pSTAR} -eq 1 ]; then
# make sub-genome sequences (NOT CDS sequences!)
python ${SCRIPTS}/make_subgenome_fasta.py cs.genome.fa cs.genome.chrA.fa cs.genome.chrB.fa cs.genome.chrD.fa
mkdir -p ${INDEX_DIR}/star_cs.genome.chrA
mkdir -p ${INDEX_DIR}/star_cs.genome.chrB
mkdir -p ${INDEX_DIR}/star_cs.genome.chrD
${BIN}/STAR --runMode genomeGenerate --genomeDir ${INDEX_DIR}/star_cs.genome.chrA --genomeFastaFiles cs.genome.chrA.fa --sjdbGTFfile cs.genome.gtf --runThreadN $CPU
${BIN}/STAR --runMode genomeGenerate --genomeDir ${INDEX_DIR}/star_cs.genome.chrB --genomeFastaFiles cs.genome.chrB.fa --sjdbGTFfile cs.genome.gtf --runThreadN $CPU
${BIN}/STAR --runMode genomeGenerate --genomeDir ${INDEX_DIR}/star_cs.genome.chrD --genomeFastaFiles cs.genome.chrD.fa --sjdbGTFfile cs.genome.gtf --runThreadN $CPU
fi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment