Skip to content

Instantly share code, notes, and snippets.

@k3yavi
Created October 2, 2015 19:29
Show Gist options
  • Save k3yavi/ec251ef47f400bac11bc to your computer and use it in GitHub Desktop.
Save k3yavi/ec251ef47f400bac11bc to your computer and use it in GitHub Desktop.
import sys
##########################################################
#requires snakemake, python3, pyfasta to be installed
#save this file and provide all the binaries and their path
#in variables below.
#to run flux pipeline:
#snakemake run_flux_pipeline
#to run rsem pipeline:
#snakemake run_rsem_pipeline
##########################################################
#-------------------------------------------------
#Initial variable
#-------------------------------------------------
pwd = os.getcwd()
##########################################################
#Paths for binaries
#We assume these binaries are available and their path can
#be specifies below:
#1. flux simulator.
#2. rsem-prepare-reference
#3. sailfish (commit 8ec6df16dd5af1a2278d875c143cb7f736339771)
#4. kallisto (version 0.42.3)
#5. bowtie2-build
#6. bowtie2
#7. parseAlignment (Bitseq) (version 0.7.5)
#8. estimateVBExpression (Bitseq) (version 0.7.5)
##########################################################
flux_binary="{}/binaries/flux-simulator-1.2.1/bin/flux-simulator".format(pwd)
rsem_prepare_ref_binary="{}/binaries/rsem-1.2.22/rsem-prepare-reference".format(pwd)
sailfish_binary="{}/binaries/SailfishBeta-latest_ubuntu-12.04/bin/sailfish".format(pwd)
kallisto_binary="{}/binaries/kallisto/build/src/kallisto".format(pwd)
bowtie2_build_binary="/mnt/scratch3/avi/bowtie2-master/bowtie2-build"
bowtie2_binary="/mnt/scratch3/avi/bowtie2-master/bowtie2"
bitseq_parse_binary="{}/binaries/BitSeq/parseAlignment".format(pwd)
bitseq_estimate_binary="{}/binaries/BitSeq/estimateVBExpression".format(pwd)
##########################################################
##########################################################
#rsem data
#We assume rsem has been run with kallisto's pipeline
#Need to give path of reference and PE reads generated
##########################################################
#Transcriptome
rsem_transcriptome_fullpath="/mnt/scratch1/simulated_data/rsem_sim_data/refs/Homo_sapiens.GRCh37.75.cdna.pc.fa"
rsem_first_mate_path = "/mnt/scratch1/simulated_data/rsem_sim_data/11_1.fq"
rsem_second_mate_path = "/mnt/scratch1/simulated_data/rsem_sim_data/11_2.fq"
##########################################################
#flux data
##########################################################
#Genome
flux_genome_file="Homo_sapiens.GRCh38.dna.toplevel.fa"
flux_genome_ftp_path="ftp://ftp.ensembl.org/pub/release-80/fasta/homo_sapiens/dna/{}.gz".format(flux_genome_file)
flux_genome_path="{}/data/Human_Genome".format(pwd)
flux_genome_fullpath="{}/{}".format(flux_genome_path, flux_genome_file)
#Gene annotation
flux_annotations_file="Homo_sapiens.GRCh38.80.gtf"
flux_annotations_ftp_prefix="ftp://ftp.ensembl.org/pub/release-80/gtf/homo_sapiens"
flux_annotations_path="{}/data/HumanGenomeAnnotations".format(pwd)
flux_annotations_fullpath = "{}/{}".format(flux_annotations_path, flux_annotations_file)
#Transcriptome
flux_transcriptome_path = "{}/data/Transcriptome".format(pwd)
flux_transcriptome_file="Human_Genome_filtered.transcripts.fa"
flux_transcriptome_prefix="Human_Genome"
flux_raw_transcriptome_path = "{}/data/Transcriptome/{}.transcripts.fa".format(pwd, flux_transcriptome_prefix)
flux_transcriptome_fullpath="{}/{}".format(flux_transcriptome_path, flux_transcriptome_file)
#flux_transcriptome_fullpath="/mnt/scratch1/avi/data/bigdata/hg19/reference.transcripts.fa"
###########################################################
#flux variables
flux_data_path = "{}/data/FluxSim".format(pwd)
flux_reads_path = "{}/reads".format(flux_data_path)
flux_first_mate_path = "{}/r1.fq".format(flux_reads_path)
flux_second_mate_path = "{}/r2.fq".format(flux_reads_path)
#flux_first_mate_path = "/mnt/scratch1/avi/data/25M/1.fastq"
#flux_second_mate_path = "/mnt/scratch1/avi/data/25M/2.fastq"
flux_config = """TMP_DIR {}/tmp
### Expression ###
REF_FILE_NAME {}/{}
GEN_DIR {}
## Library preparation
# # Expression
#
NB_MOLECULES 5000000
TSS_MEAN 50
POLYA_SCALE 100
POLYA_SHAPE 1.5
#
#### Fragmentation ###
FRAG_SUBSTRATE RNA
FRAG_METHOD UR
FRAG_UR_ETA 350
#
#### Reverse Transcription ###
RTRANSCRIPTION YES
RT_MOTIF default
#
#### Amplification ###
GC_MEAN NaN
PCR_PROBABILITY 0.05
PCR_DISTRIBUTION default
#### Size Filtering ###
FILTERING YES
#### Sequencing ###
READ_NUMBER 60000000
READ_LENGTH 76
PAIRED_END YES
ERR_FILE 76
FASTA YES
""".format(flux_data_path, flux_annotations_path, flux_annotations_file, flux_genome_path)
###################################################
# Quantifier variables
###################################################
sailfish_flux_index_path = "{}/data/sailfish/flux/index".format(pwd)
sailfish_flux_quant_path = "{}/data/sailfish/flux/quant".format(pwd)
sailfish_threads = 20
kallisto_flux_index_path = "{}/data/kallisto/flux/index".format(pwd)
kallisto_flux_quant_path = "{}/data/kallisto/flux/quant".format(pwd)
kallisto_flux_index_prefix = "index"
bowtie2_flux_index_path = "{}/data/bowtie2/flux/index".format(pwd)
bowtie2_flux_align_path = "{}/data/bowtie2/flux/align".format(pwd)
bowtie2_flux_sam = "{}/data1-1.sam".format(bowtie2_flux_align_path)
bowtie2_index_prefix="index"
bowtie2_threads = 20
bitseq_flux_parse_path = "{}/data/bitseq/flux/parse".format(pwd)
bitseq_flux_prob_path = "{}/data.prob".format(bitseq_flux_parse_path)
bitseq_flux_trinfo_path = "{}/ensemblGenes.tr".format(bitseq_flux_parse_path)
bitseq_flux_quant_path = "{}/data/bitseq/flux/quant".format(pwd)
bitseq_threads = 20
sailfish_rsem_index_path = "{}/data/sailfish/rsem/index".format(pwd)
sailfish_rsem_quant_path = "{}/data/sailfish/rsem/quant".format(pwd)
kallisto_rsem_index_path = "{}/data/kallisto/rsem/index".format(pwd)
kallisto_rsem_quant_path = "{}/data/kallisto/rsem/quant".format(pwd)
kallisto_rsem_index_prefix = "index"
bowtie2_rsem_index_path = "{}/data/bowtie2/rsem/index".format(pwd)
bowtie2_rsem_align_path = "{}/data/bowtie2/rsem/align".format(pwd)
bowtie2_rsem_sam = "{}/data1-1.sam".format(bowtie2_rsem_align_path)
bitseq_rsem_parse_path = "{}/data/bitseq/rsem/parse".format(pwd)
bitseq_rsem_prob_path = "{}/data.prob".format(bitseq_rsem_parse_path)
bitseq_rsem_trinfo_path = "{}/ensemblGenes.tr".format(bitseq_rsem_parse_path)
bitseq_rsem_quant_path = "{}/data/bitseq/rsem/quant".format(pwd)
bitseq_prefix = "quant"
###################################################
simulator = ["fluxsim", "rsem"]
#--------------------------------------------------
#Rules for running test
#--------------------------------------------------
rule obtain_flux_genome:
output:
flux_genome_path
shell:
"""
mkdir -p {flux_genome_path}
cd {flux_genome_path}
wget {flux_genome_ftp_path}
gunzip {flux_genome_file}.gz
pyfasta split --header="%(seqid)s.fasta" {flux_genome_file}
rename -v 's/(\w)\s+.*\.fasta/$1.fa/' *.fasta
"""
rule obtain_flux_annotations:
output:
flux_annotations_fullpath
shell:
"""
mkdir -p {flux_annotations_path}
cd {flux_annotations_path}
wget {flux_annotations_ftp_prefix}/{flux_annotations_file}.gz
gunzip {flux_annotations_file}.gz
"""
rule create_flux_transcriptome:
input:
genome={flux_genome_path},
gtf= {flux_annotations_fullpath}
output:
flux_raw_transcriptome_path
shell:
"""
mkdir -p {flux_transcriptome_path}
{rsem_prepare_ref_binary} --gtf {input.gtf} {input.genome} {flux_transcriptome_path}/{flux_transcriptome_prefix}
"""
rule filter_flux_transcriptome:
input:
flux_raw_transcriptome_path
output:
flux_transcriptome_fullpath
run:
#taken from https://www.biostars.org/p/667/
from pyfasta import Fasta
import re
f = Fasta("{}".format(flux_raw_transcriptome_path))
ncar = 60
with open("{}".format(flux_transcriptome_fullpath), "w") as out_file:
for header in f.keys():
name = str(header)
seq = str(f[header])
if not bool(re.match('[n]*$', seq, re.IGNORECASE)):
out_file.write('>'+ name + "\n")
while len(seq) > 0:
out_file.write(seq[:ncar] + "\n")
seq = seq[ncar:]
rule run_flux:
input:
flux_genome_path,
flux_annotations_fullpath
output:
flux_data_path
run:
shell("mkdir -p {}".format(flux_data_path))
shell("rm -rf {}/tmp".format(flux_data_path))
shell("mkdir -p {}/tmp".format(flux_data_path))
flux_config_fname = "{}/config.par".format(flux_data_path)
with open(flux_config_fname, "w") as f:
f.write(flux_config)
#might have to change set -x FLUX_MEM '20G' to export FLUX_MEM='20G' if using bash
shell("set -x FLUX_MEM '20G'; {} -p {} -l -s -x".format(flux_binary, flux_config_fname))
rule generate_flux_reads:
input:
flux_data_path
output:
flux_first_mate_path,
flux_second_mate_path
run:
shell("mkdir -p {}".format(flux_reads_path))
r1 = open('{}/r1.fq'.format(flux_reads_path), 'w')
r2 = open('{}/r2.fq'.format(flux_reads_path), 'w')
[r1.write(line) if (i % 8 < 4) else r2.write(line) for i, line in enumerate(open("{}/config.fastq".format(flux_data_path), 'r'))]
r1.close()
r2.close()
rule run_sailfish_flux_index:
input:
transcripts={flux_transcriptome_fullpath}
output:
sailfish_flux_index_path
threads:
sailfish_threads
shell:
"""
mkdir -p {sailfish_flux_index_path}
{sailfish_binary} index -t {input.transcripts} -k 31 -o {sailfish_flux_index_path} -p {sailfish_threads}
"""
rule run_sailfish_flux_quant:
input:
first = {flux_first_mate_path},
second = {flux_second_mate_path},
index = {sailfish_flux_index_path}
output:
sailfish_flux_quant_path
threads:
sailfish_threads
shell:
"""
mkdir -p {sailfish_flux_quant_path}
{sailfish_binary} quant -i {input.index} -l IU -1 {input.first} -2 {input.second} -p {sailfish_threads} -o {sailfish_flux_quant_path} --useVBOpt
"""
rule run_kallisto_flux_index:
input:
transcripts = {flux_transcriptome_fullpath}
output:
kallisto_flux_index_path
shell:
"""
mkdir -p {kallisto_flux_index_path}
{kallisto_binary} index -i {kallisto_flux_index_path}/{kallisto_flux_index_prefix} -k 31 {input.transcripts}
"""
rule run_kallisto_flux_quant:
input:
first = {flux_first_mate_path},
second = {flux_second_mate_path},
index = {kallisto_flux_index_path}
output:
kallisto_flux_quant_path
shell:
"""
mkdir -p {kallisto_flux_quant_path}
{kallisto_binary} quant -i {kallisto_flux_index_path}/{kallisto_flux_index_prefix} -o {kallisto_flux_quant_path} {input.first} {input.second}
"""
rule run_bowtie2_flux_index:
input:
transcripts = {flux_transcriptome_fullpath}
output:
bowtie2_flux_index_path
threads:
bowtie2_threads
shell:
"""
mkdir -p {bowtie2_flux_index_path}
{bowtie2_build_binary} -f {input.transcripts} {bowtie2_flux_index_path}/{bowtie2_index_prefix}
"""
rule run_bowtie2_flux_align:
input:
first = {flux_first_mate_path},
second = {flux_second_mate_path},
index = {bowtie2_flux_index_path}
output:
bowtie2_flux_sam
threads:
bowtie2_threads
shell:
"""
mkdir -p {bowtie2_flux_align_path}
{bowtie2_binary} -q -k 100 --threads {bowtie2_threads} --no-mixed --no-discordant -x {input.index}/{bowtie2_index_prefix} -1 {input.first} -2 {input.second} -S {bowtie2_flux_sam}
"""
rule run_bitseq_flux_parse:
input:
sam = {bowtie2_flux_sam},
transcripts = {flux_transcriptome_fullpath}
output:
bitseq_flux_prob_path
shell:
"""
mkdir -p {bitseq_flux_parse_path}
{bitseq_parse_binary} {input.sam} -o {bitseq_flux_prob_path} --trSeqFile {input.transcripts} --trInfoFile {bitseq_flux_trinfo_path} --uniform --verbose
"""
rule run_bitseq_flux_quant:
input:
bitseq_flux_prob_path
output:
bitseq_flux_quant_path
shell:
"""
mkdir -p {bitseq_flux_quant_path}
{bitseq_estimate_binary} -o {bitseq_flux_quant_path}/{bitseq_prefix} {bitseq_flux_prob_path}
"""
rule run_flux_pipeline:
input:
sailfish = sailfish_flux_quant_path,
kallisto = kallisto_flux_quant_path,
bitseq = bitseq_flux_quant_path
##################################################################################
# rsem rules
##################################################################################
rule run_sailfish_rsem_index:
input:
transcripts={rsem_transcriptome_fullpath}
output:
sailfish_rsem_index_path
threads:
sailfish_threads
shell:
"""
mkdir -p {sailfish_rsem_index_path}
{sailfish_binary} index -t {input.transcripts} -k 31 -o {sailfish_rsem_index_path} -p {sailfish_threads}
"""
rule run_sailfish_rsem_quant:
input:
first = {rsem_first_mate_path},
second = {rsem_second_mate_path},
index = {sailfish_rsem_index_path}
output:
sailfish_rsem_quant_path
threads:
sailfish_threads
shell:
"""
mkdir -p {sailfish_rsem_quant_path}
{sailfish_binary} quant -i {input.index} -l IU -1 {input.first} -2 {input.second} -p {sailfish_threads} -o {sailfish_rsem_quant_path} --useVBOpt
"""
rule run_kallisto_rsem_index:
input:
transcripts = {rsem_transcriptome_fullpath}
output:
kallisto_rsem_index_path
shell:
"""
mkdir -p {kallisto_rsem_index_path}
{kallisto_binary} index -i {kallisto_rsem_index_path}/{kallisto_rsem_index_prefix} -k 31 {input.transcripts}
"""
rule run_kallisto_rsem_quant:
input:
first = {rsem_first_mate_path},
second = {rsem_second_mate_path},
index = {kallisto_rsem_index_path}
output:
kallisto_rsem_quant_path
shell:
"""
mkdir -p {kallisto_rsem_quant_path}
{kallisto_binary} quant -i {kallisto_rsem_index_path}/{kallisto_rsem_index_prefix} -o {kallisto_rsem_quant_path} {input.first} {input.second}
"""
rule run_bowtie2_rsem_index:
input:
transcripts = {rsem_transcriptome_fullpath}
output:
bowtie2_rsem_index_path
threads:
bowtie2_threads
shell:
"""
mkdir -p {bowtie2_rsem_index_path}
{bowtie2_build_binary} -f {input.transcripts} {bowtie2_rsem_index_path}/{bowtie2_index_prefix}
"""
rule run_bowtie2_rsem_align:
input:
first = {rsem_first_mate_path},
second = {rsem_second_mate_path},
index = {bowtie2_rsem_index_path}
output:
bowtie2_rsem_sam
threads:
bowtie2_threads
shell:
"""
mkdir -p {bowtie2_rsem_align_path}
{bowtie2_binary} -q -k 100 --threads {bowtie2_threads} --no-mixed --no-discordant -x {input.index}/{bowtie2_index_prefix} -1 {input.first} -2 {input.second} -S {bowtie2_rsem_sam}
"""
rule run_bitseq_rsem_parse:
input:
sam = {bowtie2_rsem_sam},
transcripts = {rsem_transcriptome_fullpath}
output:
bitseq_rsem_prob_path
shell:
"""
mkdir -p {bitseq_rsem_parse_path}
{bitseq_parse_binary} {input.sam} -o {bitseq_rsem_prob_path} --trSeqFile {input.transcripts} --trInfoFile {bitseq_rsem_trinfo_path} --uniform --verbose
"""
rule run_bitseq_rsem_quant:
input:
bitseq_rsem_prob_path
output:
bitseq_rsem_quant_path
shell:
"""
mkdir -p {bitseq_rsem_quant_path}
{bitseq_estimate_binary} -o {bitseq_rsem_quant_path}/{bitseq_prefix} {bitseq_rsem_prob_path}
"""
rule run_rsem_pipeline:
input:
sailfish = sailfish_rsem_quant_path,
kallisto = kallisto_rsem_quant_path,
bitseq = bitseq_rsem_quant_path
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment