Last active
February 3, 2019 22:49
-
-
Save jsun/70df6e0bcd5f8e63bf1e02d0785d5e14 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
#$ -l rt_F=10 | |
#$ -l h_rt=24:00:00 | |
#$ -j y | |
#$ -cwd | |
#$ -N qn_makeGenomeIndex | |
pDwGenome=0 # download sequences and annotations from IWGSC server | |
pSplitGenomeSeq=0 # split whole genome sequences into A, B, and D subgenomes | |
pLAST=0 # use LAST to find homologoues sequences | |
pFindHomeologs=0 # summarise LAST rseults and find homeolog relations | |
pSTAR=1 # creat STAR index for data analyzing | |
# program env path | |
BIN=/home/username/local/bin # gffread, LAST(lastdb, lastal), STAR | |
SCRIPTS=/home/username/local/scripts # make_subgenome_fasta.py | |
EAGLE_SCRIPTS=/home/username/local/src/eagle/scripts # homeolog_genotypes.py, tablize.py, triple_homeolog | |
# project path | |
GENOME_DIR=/home/username/data/genome/IWGSC_RefSeq | |
INDEX_DIR=${GENOME_DIR}/index | |
CPU=10 | |
# start process | |
mkdir -p ${GENOME_DIR} | |
mkdir -p ${INDEX_DIR} | |
cd ${GENOME_DIR} | |
# download genome files from IWGSC | |
# in this work, we use version 1.1 annotations | |
if [ ${pDwGenome} -eq 1 ]; then | |
wget https://urgi.versailles.inra.fr/download/iwgsc/IWGSC_RefSeq_Assemblies/v1.0/iwgsc_refseqv1.0_all_chromosomes.zip | |
wget https://urgi.versailles.inra.fr/download/iwgsc/IWGSC_RefSeq_Assemblies/v1.0/iwgsc_refseqv1.0_all_chromosomes.zip.md5.txt | |
unzip iwgsc_refseqv1.0_all_chromosomes.zip | |
wget https://urgi.versailles.inra.fr/download/iwgsc/IWGSC_RefSeq_Annotations/v1.1/iwgsc_refseqv1.1_genes_2017July06.zip | |
wget https://urgi.versailles.inra.fr/download/iwgsc/IWGSC_RefSeq_Annotations/v1.1/iwgsc_refseqv1.1_genes_2017July06.zip.md5.txt | |
unzip iwgsc_refseqv1.1_genes_2017July06.zip | |
fi | |
# split whole genome sequences into three parts of A, B, and D subgenomes | |
if [ ${pSplitGenomeSeq} -eq 1 ]; then | |
# make copies (links) of genome sequencse and annotations with simple names | |
ln -s iwgsc_refseqv1.0_all_chromosomes/161010_Chinese_Spring_v1.0_pseudomolecules.fasta cs.genome.fa | |
ln -s iwgsc_refseqv1.1_genes_2017July06/IWGSC_v1.1_HC_20170706.gtf cs.genome.gtf | |
ln -s iwgsc_refseqv1.1_genes_2017July06/IWGSC_v1.1_HC_20170706.gff3 cs.genome.gff3 | |
# creat subgenome annotaitons | |
grep '^chr.A' cs.genome.gtf > cs.genome.chrA.gtf | |
grep '^chr.B' cs.genome.gtf > cs.genome.chrB.gtf | |
grep '^chr.D' cs.genome.gtf > cs.genome.chrD.gtf | |
# create subgenome transcript sequences with the annotations | |
# gffread -w: write a fasta file with spliced exons for each GFF transcript | |
# gffread -x: write a fasta file with spliced CDS for each GFF transcript | |
# ${BIN}/gffread -w cs.transcripts.fa -g cs.genome.fa cs.genome.gff3 | |
${BIN}/gffread -x cs.cds.chrA.fa -g cs.genome.fa cs.genome.chrA.gtf | |
${BIN}/gffread -x cs.cds.chrB.fa -g cs.genome.fa cs.genome.chrB.gtf | |
${BIN}/gffread -x cs.cds.chrD.fa -g cs.genome.fa cs.genome.chrD.gtf | |
fi | |
# run LAST | |
if [ ${pLAST} -eq 1 ]; then | |
mkdir -p ${INDEX_DIR} | |
${BIN}/lastdb -uNEAR -R01 ${INDEX_DIR}/last_cs.cds.chrA cs.cds.chrA.fa | |
${BIN}/lastdb -uNEAR -R01 ${INDEX_DIR}/last_cs.cds.chrB cs.cds.chrB.fa | |
${BIN}/lastdb -uNEAR -R01 ${INDEX_DIR}/last_cs.cds.chrD cs.cds.chrD.fa | |
${BIN}/lastal ${INDEX_DIR}/last_cs.cds.chrA -P$CPU -D10000000000 cs.cds.chrB.fa | ${BIN}/last-map-probs -m 0.49 > A.B.maf | |
${BIN}/lastal ${INDEX_DIR}/last_cs.cds.chrB -P$CPU -D10000000000 cs.cds.chrA.fa | ${BIN}/last-map-probs -m 0.49 > B.A.maf | |
${BIN}/lastal ${INDEX_DIR}/last_cs.cds.chrB -P$CPU -D10000000000 cs.cds.chrD.fa | ${BIN}/last-map-probs -m 0.49 > B.D.maf | |
${BIN}/lastal ${INDEX_DIR}/last_cs.cds.chrD -P$CPU -D10000000000 cs.cds.chrB.fa | ${BIN}/last-map-probs -m 0.49 > D.B.maf | |
${BIN}/lastal ${INDEX_DIR}/last_cs.cds.chrA -P$CPU -D10000000000 cs.cds.chrD.fa | ${BIN}/last-map-probs -m 0.49 > A.D.maf | |
${BIN}/lastal ${INDEX_DIR}/last_cs.cds.chrD -P$CPU -D10000000000 cs.cds.chrA.fa | ${BIN}/last-map-probs -m 0.49 > D.A.maf | |
fi | |
# find homeolog relations | |
if [ ${pFindHomeologs} -eq 1 ]; then | |
python ${EAGLE_SCRIPTS}/homeolog_genotypes.py -o A.vs.B -f exon -g cs.genome.gtf A.B.maf B.A.maf | |
python ${EAGLE_SCRIPTS}/homeolog_genotypes.py -o B.vs.A -f exon -g cs.genome.gtf B.A.maf A.B.maf | |
python ${EAGLE_SCRIPTS}/homeolog_genotypes.py -o B.vs.D -f exon -g cs.genome.gtf B.D.maf D.B.maf | |
python ${EAGLE_SCRIPTS}/homeolog_genotypes.py -o D.vs.B -f exon -g cs.genome.gtf D.B.maf B.D.maf | |
python ${EAGLE_SCRIPTS}/homeolog_genotypes.py -o A.vs.D -f exon -g cs.genome.gtf A.D.maf D.A.maf | |
python ${EAGLE_SCRIPTS}/homeolog_genotypes.py -o D.vs.A -f exon -g cs.genome.gtf D.A.maf A.D.maf | |
# find homeologs | |
perl ${EAGLE_SCRIPTS}/triple_homeolog.pl A.vs.B.reciprocal_best B.vs.D.reciprocal_best A.vs.D.reciprocal_best > homeolog.ABD.list | |
# creat ID-exchange information to change chrB and chrD IDs inot chrA ID. | |
cut -f 1 homeolog.ABD.list > homeolog.A.list | |
awk '{print $2"\t"$1;}' homeolog.ABD.list > homeolog.B.list | |
awk '{print $3"\t"$1;}' homeolog.ABD.list > homeolog.D.list | |
# find subgenome-specific genes | |
cat A.vs.B.reciprocal_best A.vs.D.reciprocal_best | cut -f1 | sort | uniq > A.vs.all.list | |
cat B.vs.A.reciprocal_best B.vs.D.reciprocal_best | cut -f1 | sort | uniq > B.vs.all.list | |
cat D.vs.A.reciprocal_best D.vs.B.reciprocal_best | cut -f1 | sort | uniq > D.vs.all.list | |
grep $'mRNA\t' cs.genome.gff3 | grep $'chr.A\t' | perl -ne 'chomp; m/ID=(.*?);/; print "$1\n";' > chrA.cds.list | |
grep $'mRNA\t' cs.genome.gff3 | grep $'chr.B\t' | perl -ne 'chomp; m/ID=(.*?);/; print "$1\n";' > chrB.cds.list | |
grep $'mRNA\t' cs.genome.gff3 | grep $'chr.D\t' | perl -ne 'chomp; m/ID=(.*?);/; print "$1\n";' > chrD.cds.list | |
python ${EAGLE_SCRIPTS}/tablize.py -v0 A.vs.all.list chrA.cds.list > chrA.only.list | |
python ${EAGLE_SCRIPTS}/tablize.py -v0 B.vs.all.list chrB.cds.list > chrB.only.list | |
python ${EAGLE_SCRIPTS}/tablize.py -v0 D.vs.all.list chrD.cds.list > chrD.only.list | |
fi | |
# run STAR to creat index files | |
if [ ${pSTAR} -eq 1 ]; then | |
# make sub-genome sequences (NOT CDS sequences!) | |
python ${SCRIPTS}/make_subgenome_fasta.py cs.genome.fa cs.genome.chrA.fa cs.genome.chrB.fa cs.genome.chrD.fa | |
mkdir -p ${INDEX_DIR}/star_cs.genome.chrA | |
mkdir -p ${INDEX_DIR}/star_cs.genome.chrB | |
mkdir -p ${INDEX_DIR}/star_cs.genome.chrD | |
${BIN}/STAR --runMode genomeGenerate --genomeDir ${INDEX_DIR}/star_cs.genome.chrA --genomeFastaFiles cs.genome.chrA.fa --sjdbGTFfile cs.genome.gtf --runThreadN $CPU | |
${BIN}/STAR --runMode genomeGenerate --genomeDir ${INDEX_DIR}/star_cs.genome.chrB --genomeFastaFiles cs.genome.chrB.fa --sjdbGTFfile cs.genome.gtf --runThreadN $CPU | |
${BIN}/STAR --runMode genomeGenerate --genomeDir ${INDEX_DIR}/star_cs.genome.chrD --genomeFastaFiles cs.genome.chrD.fa --sjdbGTFfile cs.genome.gtf --runThreadN $CPU | |
fi |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment