tgirke/hmmscan.R

## hmmscan.R
##############################################
## Map Pfam Domains to Proteins with HMMER3 ##
##############################################
## Author: Thomas Girke
## Date: May 11, 2018

## Utility: mapping of Pfam domains to protein sequences.
## The module load and Pfam database paths given below are specific to the HPCC/biocluster system.
## For details consult the man page for hmmscan from the command-line with 'hmmscan -h'

library(systemPipeR)
moduleload("hmmer/3.1b2")
## Note: the file myseq.txt contains yours protein sequences and Pfam-A.hmm is the Pfam database available on HPCC cluster
system("hmmscan -E 0.1 --acc --domtblout output.pfam /srv/projects/db/pfam/2017-06-11-Pfam31.0/Pfam-A.hmm myseq.txt > /dev/null")

## Read Pfam mapping results into R
pfdf <- read.table("output.pfam", comment="#", header=FALSE)
## For fast import use fread from data.table package
library(data.table)
pfdf <- fread("grep -v '^#' output.pfam") # syntax to ignore lines starting with `#`
colnames(pfdf) <- c("target name", "accession1", "tlen", "query name", "accession2", "qlen", "E-value", "score1", "bias1", "No",  "of", "c-Evalue", "i-Evalue", "score2", "bias2", "from1", "to`", "from2", "to2", "from3", "to3",  "acc", "description of target")

## tibbles from dplyr are a more modern version of data.frames
library(dplyr, quietly=TRUE)
pfdf <- as_data_frame(pfdf)
pfdf
	##############################################
	## Map Pfam Domains to Proteins with HMMER3 ##
	##############################################
	## Author: Thomas Girke
	## Date: May 11, 2018

	## Utility: mapping of Pfam domains to protein sequences.
	## The module load and Pfam database paths given below are specific to the HPCC/biocluster system.
	## For details consult the man page for hmmscan from the command-line with 'hmmscan -h'

	library(systemPipeR)
	moduleload("hmmer/3.1b2")
	## Note: the file myseq.txt contains yours protein sequences and Pfam-A.hmm is the Pfam database available on HPCC cluster
	system("hmmscan -E 0.1 --acc --domtblout output.pfam /srv/projects/db/pfam/2017-06-11-Pfam31.0/Pfam-A.hmm myseq.txt > /dev/null")

	## Read Pfam mapping results into R
	pfdf <- read.table("output.pfam", comment="#", header=FALSE)
	## For fast import use fread from data.table package
	library(data.table)
	pfdf <- fread("grep -v '^#' output.pfam") # syntax to ignore lines starting with `#`
	colnames(pfdf) <- c("target name", "accession1", "tlen", "query name", "accession2", "qlen", "E-value", "score1", "bias1", "No", "of", "c-Evalue", "i-Evalue", "score2", "bias2", "from1", "to`", "from2", "to2", "from3", "to3", "acc", "description of target")

	## tibbles from dplyr are a more modern version of data.frames
	library(dplyr, quietly=TRUE)
	pfdf <- as_data_frame(pfdf)
	pfdf