Skip to content

Instantly share code, notes, and snippets.

@LeeBergstrand
Last active March 1, 2022 09:32
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save LeeBergstrand/60d3cc72e57008cb42a3 to your computer and use it in GitHub Desktop.
Save LeeBergstrand/60d3cc72e57008cb42a3 to your computer and use it in GitHub Desktop.
dbCAN_HMMSCAN_Parser_Problem
Cyp125(GramPos) 408

dbCAN Hmmscan Parser Issue

I was recently reverse engineering dbCAN's shell/perl script (hmmscan-parser.sh) for parsing HMMER's hmmscan human readable text results. Unfortunately, while figuring out how this script works I found an error.

#####Orignal Script:

	#!/usr/bin/env sh
	# Yanbin Yin
	# 08/18/2011
	# hmmscan output parser
	# Usage: sh hmmscan-parser.sh hmmscan-output-file

	# 1. take hmmer3 output and generate the tabular output
	# 2. sort on the 6th and 7th cols
	# 3. remove overlapped/redundant hmm matches and keep the one 	with the lower e-values
	# 4. calculate the covered fraction of hmm (make sure you have 	downloaded the "all.hmm.ps.len" file to the same directory of 	this perl script)
	\# 5. apply the E-value cutoff and the covered faction cutoff
	
	cat $1 | perl -e 'while(<>){if(/^\/\//){$x=join("",@a);($q)=($x=~/^Query:\s+(\S+)/m);while($x=~/^>> (\S+.*?\n\n)/msg){$a=$&;@c=split(/\n/,$a);$c[0]=~s/>> //;for($i=3;$i<=$#c;$i++){@d=split(/\s+/,$c[$i]);print $q."\t".$c[0]."\t$d[6]\t$d[7]\t$d[8]\t$d[10]\t$d[11]\n" if $d[6]<1;}}@a=();}else{push(@a,$_);}}' \
	| sort -k 1,1 -k 6,6n -k 7,7n | uniq \
	| perl -e 'while(<>){chomp;@a=split;next if $a[-1]==$a[-2];push(@{$b{$a[0]}},$_);}foreach(sort keys %b){@a=@{$b{$_}};for($i=0;$i<$#a;$i++){@b=split(/\t/,$a[$i]);@c=split(/\t/,$a[$i+1]);$len1=$b[-1]-$b[-2];$len2=$c[-1]-$c[-2];$len3=$b[-1]-$c[-2];if($len3>0 and ($len3/$len1>0.5 or $len3/$len2>0.5)){if($b[2]<$c[2]){splice(@a,$i+1,1);}else{splice(@a,$i,1);}$i=$i-1;}}foreach(@a){print $_."\n";}}' \
	| uniq | perl -e 'open(IN,"all.hmm.ps.len");while(<IN>){chomp;@a=split;$b{$a[0]}=$a[1];}while(<>){chomp;@a=split;$r=($a[4]-$a[3])/$b{$a[1]};print $_."\t".$r."\n";}' \
	| perl -e 'while(<>){@a=split(/\t/,$_);if(($a[-1]-$a[-2])>80){print $_ if $a[2]<1e-5;}else{print $_ if $a[2]<1e-3;}}' | awk '$NF>0.3'

During the reverse engineering process the first thing to do was to expand out the perl so it was human readable. I then proceeded to annotate it.

#####Expanded Perl:

	# cat $1 | perl -e 
	while(<>)
	{
		if(/^\/\//)
		{
			$x=join("",@a);
			($q)=($x=~/^Query:\s+(\S+)/m); # Grabs query name
			
			while($x=~/^>> (\S+.*?\n\n)/msg) # multiline, "." matches newline, global
			{
				$a=$&;
				@c=split(/\n/,$a);
				$c[0]=~s/>> //;
				for($i=3;$i<=$#c;$i++)
				{
					@d=split(/\s+/,$c[$i]);
					print $q."\t".$c[0]."\t$d[6]\t$d[7]\t$d[8]\t$d[10]\t$d[11]\n" if $d[6]<1;
				}
			}
			@a=();
		}
		else
		{
			push(@a,$_);
		}
	}
	 
	#| sort -k 1,1 -k 6,6n -k 7,7n | uniq \| perl -e
	while(<>)
	{
		chomp;
		@a=split;
		next if $a[-1]==$a[-2];
		push(@{$b{$a[0]}},$_); # Removes lines with same start and end
	}
	 
	foreach(sort keys %b) 
	{
	        @a=@{$b{$_}};
	 
	        for($i = 0; $i < $#a; $i++) 
			{
	                @b = split(/\t/, $a[$i]);   # Alignment 1 = top alignment
	                @c = split(/\t/, $a[$i+1]); # Alignment 2 = bottom alignment
	                $len1 = $b[-1] - $b[-2]; # Length one = alignment 1 end - aligment 1 start
	                $len2 = $c[-1] - $c[-2]; # Length two = alignment 2 end - aligment 2 start
	                $len3 = $b[-1] - $c[-2]; # Length one = aligment 1 end - alignment 2 start
	 
	                if($len3 > 0 and ($len3 / $len1 > 0.5 or $len3 / $len2 > 0.5)) # if alignments are overlaped and the overlap is greater than 50% the length of an alignment. 
	                {
	                        if($b[2] < $c[2]) # Checks E value. Removes the alignment with the lowest Evalue.
	                        {
	                                splice(@a, $i + 1, 1);
	                        }
	                        else
	                        {
	                                splice(@a, $i, 1);
	                        }
	                        $i = $i - 1;
	                }
	        }
	        foreach(@a) 
			{
	                print $_ . "\n";
	        }
	}
	 
	# | uniq | perl -e 
	open(IN,"all.hmm.ps.len");
	while(<IN>)
	{
		chomp;
		@a=split;
		$b{$a[0]}=$a[1]; # creates hash of hmmName : hmmLength
	}
	 
	while(<>)
	{
		chomp;
		@a=split;
		$r=($a[4]-$a[3])/$b{$a[1]}; # $a[4] = hmm end $a[3] = hmm start ; $b{$a[1]} = result of the hash of the name of the hmm (hmm length).
		print $_."\t".$r."\n";
	}
		
	# | perl -e 
	while(<>)
	{
		@a=split(/\t/,$_);
		if(($a[-1]-$a[-2])>80)
		{
			print $_ if $a[2]<1e-5;
		}
		else
		{
			print $_ if $a[2]<1e-3;
		}
	}
	# awk '$NF>0.3' # Deletes alignment coverages less than 1/3

The first While(<>) parses the domain alignments from the human readable hmmscan output. The next section of Perl removes overlapped/redundant HMM matches and keeps the matches with the lower E-values. After this filtering step, a section of Perl calculates the covered fraction of the HMM and the final Perl step applies an E-value cutoff.

####The Problem

I believe the author made a mistake in the last section of Perl in the script. The section below:

	# | perl -e 
	while(<>)
	{
	    @a=split(/\t/,$_);
	    if(($a[-1]-$a[-2])>80) # Bad line!
	    {
	        print $_ if $a[2]<1e-5;
	    }
	    else
	    {
	        print $_ if $a[2]<1e-3;
	    }
	}

The section of Perl that comes before the code above calculates the HMM coverage and produces an output like the following:

Prot.ID		HMM.Model			EValue		HMM.To	HMM.Cover Ali.From	Ali.To	HMM.From	
---------------------------------------------------------------------------------------------
YP_700371.1	Cyp125(GramPos)  	2.3e-55		5			389		17			393		0.941176470588235
YP_700387.1	Cyp125(GramPos)  	2.7e-61		5			407		11			406		0.985294117647059
YP_700417.1	Cyp125(GramPos)  	1.6e-57		8			404		12			401		0.970588235294118

As you can see, the last column is the HMM coverage and the second to last column is where the alignment ends on the HMM. Therefore, in reverse array syntax, $a[-2] = Ali.To and $a[-1] = HMM.Cover.

I wrote a small Perl script that follows the same algorithm as the last section of Perl in the shell script. This script can be run instead of the last section of Perl in the shell script and creates an annotated output like this:

a[-2] = 393
a[-1] = 0.941176470588235
(a[-1]-a[-2]) = -392.058823529412
a[-1]-a[-2]>80 = FALSE

Prot.ID		HMM.Model			EValue		HMM.From	HMM.To	Ali.From	Ali.To	HMM.Cover
-----------------------------------------------------------------------------------------------------
YP_700371.1	Cyp125(GramPos)  	2.3e-55		5			389		17			393		0.941176470588235

According to the author's current algorithm, the HMM coverage (a[-1]) will always be less than or equal to one, since it was created by dividing the HMM alignment length by the HMM length. Ali.To (a[-2]) will always be greater than one. As a result, a[-1]-a[-2] will always be a negative number and a[-1]-a[-2]>80 will always be FALSE. The program will default to the E-Value cutoff found in the ELSE statement (1e-3).

I believe the author was originally intending to use the length of the alignment to determine the strictness of the E-value filter and the author forgot that he/she added an additional column in the previous step. Therefore, the check condition inside the if statement should be $a[-2]-$a[-3])>80 ((Ali.To - Ali.From) > 80) rather than $a[-1]-$a[-2])>80 ((HMM.cover - Ali.To) > 80).

Therefore the last section of Perl should look like the following:

	# | perl -e 
	while(<>)
	{
	    @a=split(/\t/,$_);
	    if(($a[-2]-$a[-3])>80) # Good line!
	    {
	        print $_ if $a[2]<1e-5;
	    }
	    else
	    {
	        print $_ if $a[2]<1e-3;
	    }
	}

In the version above, if the length of the alignment is greater than 80 then a stricter E-Value cut off is used (1e-5). However, if the length of the HMM alignment is less than 80 then a looser E-Value cut off is used (1e-3). This is how I believe the author intended the script to work.

Output from my small Perl script with updated algorithm:

Corner Cases:

$a[-2]-$a[-3])>80 is TRUE but the E-Value is less than 1e-5 (Row is retained):

a[-3] = 17
a[-2] = 393
(a[-2]-a[-3]) = 376
a[-2]-a[-3]>80 = TRUE

Prot.ID		HMM.Model			EValue		HMM.From	HMM.To	Ali.From	Ali.To	HMM.Cover
-----------------------------------------------------------------------------------------------------
YP_700371.1	Cyp125(GramPos)  	2.3e-55		5			389		17			393		0.941176470588235

$a[-2]-$a[-3])>80 is FALSE but the E-Value is less than 1e-3 (Row is retained):

a[-3] = 231
a[-2] = 293
(a[-2]-a[-3]) = 62
a[-2]-a[-3]>80 = FALSE

Prot.ID		HMM.Model			EValue		HMM.From	HMM.To	Ali.From	Ali.To	HMM.Cover
-----------------------------------------------------------------------------------------------------
YP_703037.1	Cyp125(GramPos)  	4.8e-06		213			275		231			293		0.151960784313725

$a[-2]-$a[-3])>80 is FALSE but the E-Value is greater than or equal to 1e-3 (Algorithm does not print row):

a[-3] = 107
a[-2] = 137
(a[-2]-a[-3]) = 30
a[-2]-a[-3]>80 = FALSE

Domain alignment be deleted.
E-Value >= 1e-3

Prot.ID		HMM.Model			EValue		HMM.From	HMM.To	Ali.From	Ali.To	HMM.Cover
-----------------------------------------------------------------------------------------------------

$a[-2]-$a[-3])>80 is TRUE but the E-Value is greater than or equal to 1e-5 (Algorithm does not print row):

a[-3] = 196
a[-2] = 278
(a[-2]-a[-3]) = 82
a[-2]-a[-3]>80 = TRUE

Domain alignment should be deleted.
E-Value >= 1e-5

Prot.ID		HMM.Model			EValue		HMM.From	HMM.To	Ali.From	Ali.To	HMM.Cover
-----------------------------------------------------------------------------------------------------

####Conclusion:

There is an error in the last section of Perl in dbCAN's shell/perl script (hmmscan-parser.sh) for parsing HMMER's hmmscan human readable text results. This error causes the parser to default to the E-Value cutoff of 1e-3 for all domain alignments. The result is that some HMM alignments with a length greater than 80 are not being filtred out by the stricter E-Value cutoff of 1e-5. Overall, some alignments that were supposed to be eliminated are being retained.

#!/usr/bin/env perl
while(<>)
{
@a=split(/\t/,$_);
print "\n======================================================================================================\n";
print "a[-2] = $a[-2] \n";
print "a[-1] = $a[-1]";
print "(a[-1]-a[-2]) = ".($a[-1]-$a[-2])." \n";
if(($a[-1]-$a[-2])>80)
{
print "a[-1]-a[-2]>80 = TRUE \n\n";
if(not ($a[2]<1e-5))
{
print "Domain alignment should be deleted.\n";
print "E-Value >= 1e-5\n\n";
}
}
else
{
print "a[-1]-a[-2]>80 = FALSE \n\n";
if(not ($a[2]<1e-3))
{
print "Domain alignment should be deleted.\n";
print "E-Value >= 1e-3\n\n";
}
}
print "Prot.ID\t\tHMM.Model\t\tEValue\tHMM.From HMM.To\tAli.From Ali.To\tHMM.Cover\n";
print "-----------------------------------------------------------------------------------------------------\n";
if(($a[-1]-$a[-2])>80)
{
print $_ if $a[2]<1e-5;
}
else
{
print $_ if $a[2]<1e-3;
}
}
#!/usr/bin/env perl
while(<>)
{
@a=split(/\t/,$_);
print "\n======================================================================================================\n";
print "a[-3] = $a[-3] \n";
print "a[-2] = $a[-2] \n";
print "(a[-2]-a[-3]) = ".($a[-2]-$a[-3])." \n";
if(($a[-2]-$a[-3])>80)
{
print "a[-2]-a[-3]>80 = TRUE \n\n";
if(not ($a[2]<1e-5))
{
print "Domain alignment should be deleted.\n";
print "E-Value >= 1e-5\n\n";
}
}
else
{
print "a[-2]-a[-3]>80 = FALSE \n\n";
if(not ($a[2]<1e-3))
{
print "Domain alignment should be deleted.\n";
print "E-Value >= 1e-3\n\n";
}
}
print "Prot.ID\t\tHMM.Model\t\tEValue\tHMM.From HMM.To\tAli.From Ali.To\tHMM.Cover\n";
print "-----------------------------------------------------------------------------------------------------\n";
if(($a[-2]-$a[-3])>80)
{
print $_ if $a[2]<1e-5;
}
else
{
print $_ if $a[2]<1e-3;
}
}
# Expanded Perl (This perl does not work!)
# cat $1 | perl -e
while(<>)
{
if(/^\/\//)
{
$x=join("",@a);
($q)=($x=~/^Query:\s+(\S+)/m); # Grabs query name
while($x=~/^>> (\S+.*?\n\n)/msg) # mutiline, "." matches newline, global
{
$a=$&;
@c=split(/\n/,$a);
$c[0]=~s/>> //;
for($i=3;$i<=$#c;$i++)
{
@d=split(/\s+/,$c[$i]);
print $q."\t".$c[0]."\t$d[6]\t$d[7]\t$d[8]\t$d[10]\t$d[11]\n" if $d[6]<1;
}
}
@a=();
}
else
{
push(@a,$_);
}
}
#| sort -k 1,1 -k 6,6n -k 7,7n | uniq \| perl -e
while(<>)
{
chomp;
@a=split;
next if $a[-1]==$a[-2];
push(@{$b{$a[0]}},$_); # Removes lines with same start and end
}
foreach(sort keys %b)
{
@a=@{$b{$_}};
for($i = 0; $i < $#a; $i++)
{
@b = split(/\t/, $a[$i]); # Alignment 1 = top alignment
@c = split(/\t/, $a[$i+1]); # Alignment 2 = bottom alignment
$len1 = $b[-1] - $b[-2]; # Length one = aligment 1 end - aligment 1 start
$len2 = $c[-1] - $c[-2]; # Length two = aligment 2 end - aligment 2 start
$len3 = $b[-1] - $c[-2]; # Length one = aligment 1 end - aligment 2 start
if($len3 > 0 and ($len3 / $len1 > 0.5 or $len3 / $len2 > 0.5)) # if alignments are overlaped and the overlap is greater than 50% the length of an alignment.
{
if($b[2] < $c[2]) # Checks E value. Removes the alignment with the lowest Evalue.
{
splice(@a, $i + 1, 1);
}
else
{
splice(@a, $i, 1);
}
$i = $i - 1;
}
}
foreach(@a)
{
print $_ . "\n";
}
}
# | uniq | perl -e
open(IN,"all.hmm.ps.len");
while(<IN>)
{
chomp;
@a=split;
$b{$a[0]}=$a[1]; # creates hash of hmmName : hmmLength
}
while(<>)
{
chomp;
@a=split;
$r=($a[4]-$a[3])/$b{$a[1]}; # $a[4] = hmm end $a[3] = hmm start ; $b{$a[1]} = result of the hash of the name of the hmm (hmm length).
print $_."\t".$r."\n";
}
# | perl -e
while(<>)
{
@a=split(/\t/,$_);
if(($a[-1]-$a[-2])>80)
{
print $_ if $a[2]<1e-5;
}
else
{
print $_ if $a[2]<1e-3;
}
}
# awk '$NF>0.3' # Deletes alignment coverages less than 1/3
#!/usr/bin/env sh
# This is the original script and is still faulty! Do not use!
# Yanbin Yin
# 08/18/2011
# hmmscan output parser
# Usage: sh hmmscan-parser.sh hmmscan-output-file
# 1. take hmmer3 output and generate the tabular output
# 2. sort on the 6th and 7th cols
# 3. remove overlapped/redundant hmm matches and keep the one with the lower e-values
# 4. calculate the covered fraction of hmm (make sure you have downloaded the "all.hmm.ps.len" file to the same directory of this perl script)
# 5. apply the E-value cutoff and the covered faction cutoff
cat $1 | perl -e 'while(<>){if(/^\/\//){$x=join("",@a);($q)=($x=~/^Query:\s+(\S+)/m);while($x=~/^>> (\S+.*?\n\n)/msg){$a=$&;@c=split(/\n/,$a);$c[0]=~s/>> //;for($i=3;$i<=$#c;$i++){@d=split(/\s+/,$c[$i]);print $q."\t".$c[0]."\t$d[6]\t$d[7]\t$d[8]\t$d[10]\t$d[11]\n" if $d[6]<1;}}@a=();}else{push(@a,$_);}}' \
| sort -k 1,1 -k 6,6n -k 7,7n | uniq \
| perl -e 'while(<>){chomp;@a=split;next if $a[-1]==$a[-2];push(@{$b{$a[0]}},$_);}foreach(sort keys %b){@a=@{$b{$_}};for($i=0;$i<$#a;$i++){@b=split(/\t/,$a[$i]);@c=split(/\t/,$a[$i+1]);$len1=$b[-1]-$b[-2];$len2=$c[-1]-$c[-2];$len3=$b[-1]-$c[-2];if($len3>0 and ($len3/$len1>0.5 or $len3/$len2>0.5)){if($b[2]<$c[2]){splice(@a,$i+1,1);}else{splice(@a,$i,1);}$i=$i-1;}}foreach(@a){print $_."\n";}}' \
| uniq | perl -e 'open(IN,"all.hmm.ps.len");while(<IN>){chomp;@a=split;$b{$a[0]}=$a[1];}while(<>){chomp;@a=split;$r=($a[4]-$a[3])/$b{$a[1]};print $_."\t".$r."\n";}' \
| perl -e 'while(<>){@a=split(/\t/,$_);if(($a[-1]-$a[-2])>80){print $_ if $a[2]<1e-5;}else{print $_ if $a[2]<1e-3;}}' | awk '$NF>0.3'
@dburkhardt
Copy link

Thank you for making note of this mistake! I'm not used to using Perl, would you please explain how a newbie would implement this change? I was under the impression that using perl -e required the following code to be on a single line, but your fixes are expanded. Maybe consider posting an updated version of the whole script with your fix that works as the original does (Usage: hmmscan-parser.fixed.sh hmmscan-output-file).

Thanks again and good job for finding this!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment