elsherbini/README.md

## README.md

      
    Raw
  

              README.md
            
          
    These are the three files I used to run InterProScan 5.17 on a SLURM cluster.
The pipeline takes FASTA files of bacterial genome contigs as input, calls ORFs with Prodigal, and then annotates the predicted proteins with InterProScan.
To run it, update the config.yaml file and then submit the Snakemake job:
sbatch snakemake.sbatch

  
## config.yaml
# Directory the Snakefile globs for input contig FASTA files.
input_contig_directory: "/home/parevalo/scratch/scaffolded_genomes/contigs"

# Suffix appended to the glob pattern and to the {name} wildcard when locating
# input files. NOTE: the key is spelled "extenstion" (sic) because that is the
# spelling the Snakefile looks up; do not correct it here alone.
input_contig_file_extenstion: ".fa"

# Where the ORF FASTA files (and the "clean" subdirectory) are written.
output_ORF_directory: "/nobackup1/josephe/projects/annotate_scaffolds/ORFs"

# Passed to InterProScan via -d; the per-genome .tsv targets are expected here.
output_annotation_directory: "/nobackup1/josephe/projects/annotate_scaffolds/annotations"

# InterProScan launcher script invoked by the annotate rule.
interproscan_path: "/nobackup1/josephe/interproscan/interproscan-5.17-56.0/interproscan.sh"

# Passed to InterProScan via --tempdir.
interproscan_tempdir: "/nobackup1/josephe/annotate_everything/temp"

# Prodigal executable invoked by the orfs rule.
prodigal_path: "/home/parevalo/apps/bin/prodigal-2.6.3"

## Snakefile
configfile: "config.yaml"

import glob
import os


def _out_from_in(fn):
    base = os.path.splitext(os.path.basename(fn))[0]
    return os.path.join(config["output_annotation_directory"], base + '_cds_prod.faa.tsv')

# Find every file in the input directory whose name ends with the configured
# extension, then derive the annotation TSV expected for each one.
_contig_pattern = os.path.join(
    config["input_contig_directory"],
    "*" + config["input_contig_file_extenstion"],
)
INPUT_CONTIG_FILES = glob.glob(_contig_pattern)
OUTPUT_FILES = list(map(_out_from_in, INPUT_CONTIG_FILES))

# First rule in the file, so it is the default target: it requests the
# annotation TSV for every discovered contig file.
rule target:
    input:
        OUTPUT_FILES

# Run InterProScan on the cleaned protein FASTA for one genome. The command
# requests TSV, HTML and XML output into the annotation directory; only the
# TSV is declared as the rule's output.
rule annotate:
    input:
        os.path.join(config["output_ORF_directory"], "clean", "{name}_cds_prod.faa")
    output:
        os.path.join(config["output_annotation_directory"], "{name}_cds_prod.faa.tsv")
    shell:
        'module load engaging/python/2.7.10; '
        'module load engaging/jdk/1.8.0_25; '
        '{config[interproscan_path]} -f tsv -f html -f xml -iprlookup -goterms -pathways '
        '-d {config[output_annotation_directory]} --tempdir {config[interproscan_tempdir]} '
        '-i {input};'

# Copy the Prodigal protein FASTA into the "clean" subdirectory with every '*'
# character removed.
# NOTE(review): presumably this drops the stop-codon marker that InterProScan
# would reject -- confirm.
rule clean:
    input:
        os.path.join(config["output_ORF_directory"], "{name}_cds_prod.faa")
    output:
        os.path.join(config["output_ORF_directory"], "clean", "{name}_cds_prod.faa")
    shell:
        "sed 's/*//g' {input} > {output}"

# Call ORFs on one contig file with Prodigal, writing the gene nucleotide
# sequences (-d) and their protein translations (-a) to the ORF directory.
rule orfs:
    input:
        os.path.join(
            config["input_contig_directory"],
            "{name}" + config["input_contig_file_extenstion"],
        )
    output:
        fna = os.path.join(config["output_ORF_directory"], "{name}_cds_prod.fna"),
        faa = os.path.join(config["output_ORF_directory"], "{name}_cds_prod.faa"),
    shell:
        "{config[prodigal_path]} -i {input} -d {output.fna} -a {output.faa}"

## snakemake.sbatch
#!/bin/bash
# Driver job: a single task on one node that runs Snakemake, which in turn
# submits the individual rule jobs to SLURM.

#SBATCH --partition=newnodes
#SBATCH --nodes=1
#SBATCH --ntasks=1

# NOTE(review): both a Python 2 and a Python 3 module are loaded; presumably
# Snakemake runs under 3.5.1 -- confirm whether 2.7.11 is still needed here.
module load engaging/python/2.7.11
module load engaging/python/3.5.1

# Each rule job is submitted with the sbatch command below; at most 150 jobs
# are queued or running at once, and each is named "<rule>.<job id>".
snakemake \
    --cluster "sbatch -p newnodes --mem=60000 -N 1 -n 16 " \
    --jobs 150 \
    --jobname "{rulename}.{jobid}"
	# Directory the Snakefile globs for input contig FASTA files.
	input_contig_directory: "/home/parevalo/scratch/scaffolded_genomes/contigs"
	# Suffix of the input files; the key spelling "extenstion" (sic) is the one the Snakefile reads.
	input_contig_file_extenstion: ".fa"
	# Where the ORF FASTA files (and the "clean" subdirectory) are written.
	output_ORF_directory: "/nobackup1/josephe/projects/annotate_scaffolds/ORFs"
	# Passed to InterProScan via -d; the per-genome .tsv targets are expected here.
	output_annotation_directory: "/nobackup1/josephe/projects/annotate_scaffolds/annotations"
	# InterProScan launcher script invoked by the annotate rule.
	interproscan_path: "/nobackup1/josephe/interproscan/interproscan-5.17-56.0/interproscan.sh"
	# Passed to InterProScan via --tempdir.
	interproscan_tempdir: "/nobackup1/josephe/annotate_everything/temp"
	# Prodigal executable invoked by the orfs rule.
	prodigal_path: "/home/parevalo/apps/bin/prodigal-2.6.3"
	configfile: "config.yaml"

	import glob
	import os


	def _out_from_in(fn):
	base = os.path.splitext(os.path.basename(fn))[0]
	return os.path.join(config["output_annotation_directory"], base + '_cds_prod.faa.tsv')

	# Find every file in the input directory whose name ends with the configured
	# extension, then derive the annotation TSV expected for each one.
	_contig_pattern = os.path.join(
	    config["input_contig_directory"],
	    "*" + config["input_contig_file_extenstion"],
	)
	INPUT_CONTIG_FILES = glob.glob(_contig_pattern)
	OUTPUT_FILES = list(map(_out_from_in, INPUT_CONTIG_FILES))

	# Aggregate rule: requests the annotation TSV for every discovered contig file.
	rule target:
	    input:
	        OUTPUT_FILES

	# Run InterProScan on the cleaned protein FASTA for one genome. TSV, HTML and
	# XML output are requested; only the TSV is declared as the rule's output.
	rule annotate:
	    input:
	        os.path.join(config["output_ORF_directory"], "clean", "{name}_cds_prod.faa")
	    output:
	        os.path.join(config["output_annotation_directory"], "{name}_cds_prod.faa.tsv")
	    shell:
	        'module load engaging/python/2.7.10; '
	        'module load engaging/jdk/1.8.0_25; '
	        '{config[interproscan_path]} -f tsv -f html -f xml -iprlookup -goterms -pathways '
	        '-d {config[output_annotation_directory]} --tempdir {config[interproscan_tempdir]} '
	        '-i {input};'

	# Copy the Prodigal protein FASTA into the "clean" subdirectory with every
	# '*' character removed.
	rule clean:
	    input:
	        os.path.join(config["output_ORF_directory"], "{name}_cds_prod.faa")
	    output:
	        os.path.join(config["output_ORF_directory"], "clean", "{name}_cds_prod.faa")
	    shell:
	        "sed 's/*//g' {input} > {output}"

	# Call ORFs on one contig file with Prodigal, writing the gene nucleotide
	# sequences (-d) and their protein translations (-a) to the ORF directory.
	rule orfs:
	    input:
	        os.path.join(
	            config["input_contig_directory"],
	            "{name}" + config["input_contig_file_extenstion"],
	        )
	    output:
	        fna = os.path.join(config["output_ORF_directory"], "{name}_cds_prod.fna"),
	        faa = os.path.join(config["output_ORF_directory"], "{name}_cds_prod.faa"),
	    shell:
	        "{config[prodigal_path]} -i {input} -d {output.fna} -a {output.faa}"
	#!/bin/bash
	# Driver job: a single task on one node that runs Snakemake, which in turn
	# submits the individual rule jobs to SLURM.

	#SBATCH --partition=newnodes
	#SBATCH --nodes=1
	#SBATCH --ntasks=1

	# NOTE(review): both a Python 2 and a Python 3 module are loaded -- confirm
	# whether 2.7.11 is still needed here.
	module load engaging/python/2.7.11
	module load engaging/python/3.5.1

	# At most 150 cluster jobs at once, each named "<rule>.<job id>".
	snakemake \
	    --cluster "sbatch -p newnodes --mem=60000 -N 1 -n 16 " \
	    --jobs 150 \
	    --jobname "{rulename}.{jobid}"