Skip to content

Instantly share code, notes, and snippets.

@jrjhealey
Created July 5, 2017 21:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jrjhealey/d2c9232bd7fc70d16f8939d6a6f58f88 to your computer and use it in GitHub Desktop.
Save jrjhealey/d2c9232bd7fc70d16f8939d6a6f58f88 to your computer and use it in GitHub Desktop.
Turn the output of HHsearch in to tab delimited text
# -*- coding: utf-8 -*-
"""
This script takes the .hhr files output by HHSuite and
turns the quite verbose file in to a fully tabulated
version with all the fields separated one, one line per
file. Thus, the file can be viewed simply in Excel etc.
It requires the non-standard pandas module.
"""
# This program is free software: you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation, either version 3 of
# the License, or (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# The license for HHSuite remains with the authors. Please consult
# their licensce agreements before using this software, and ensure
# they are cited for any use. Run the script with -b to print rel-
# evant references.
import os
import sys
import traceback
import warnings
__author__ = "Joe R. J. Healey"
__version__ = "1.0.0"
__title__ = "tabulateHHpred"
__license__ = "GPLv3"
__author_email__ = "J.R.J.Healey@warwick.ac.uk"
template = \
u"""
---|------|------------------------|----|-------|-------|------|-----|----|---------|--------------|
No Hit Short Desc Prob E-value P-value Score SS Cols Query HMM Template HMM
"""
def hhparse(hhresult_file, verbose):
'''Convert HHpred's text-based output table in to a pandas dataframe'''
from io import StringIO
import pandas as pd
pattern = StringIO(template).readlines()[1]
colBreaks = [i for i, ch in enumerate(pattern) if ch == '|']
widths = [j-i for i, j in zip( ([0]+colBreaks)[:-1], colBreaks ) ]
hhtable = pd.read_fwf(hhresult_file, skiprows=8, nrows=10, header=0, widths = widths)
if verbose is True:
print(hhtable)
top_hit = str(hhtable.loc[0,'Hit'])[0:4]
top_hit_full = hhtable.loc[0,'Hit']
top_prob = hhtable.loc[0,'Prob']
top_eval = hhtable.loc[0,'E-value']
top_pval = hhtable.loc[0,'P-value']
top_score = hhtable.loc[0,'Score']
if verbose is True:
print("Your best hit: (PDB ID | Probability | E-Value | P-Value | Score)")
print("\t" + str(top_hit) + "\t" + str(top_prob) + "\t" + str(top_eval) + "\t" + str(top_pval) + "\t" + str(top_score) )
return top_hit, top_hit_full, top_prob, top_eval, top_pval, top_score
def getFullDesc(hhresult_file,top_hit_full, verbose):
with open(hhresult_file, 'r') as hrh:
for line in hrh:
if line.startswith('>' + top_hit_full):
full_desc = line.rstrip('\n')
if verbose is True:
print full_desc
return full_desc
def getDOI(top_hit):
"""Query the PDB REST API to get an associated DOI/Publication"""
import requests
try:
query = requests.get("https://www.ebi.ac.uk/pdbe/api/pdb/entry/publications/" + str(top_hit))
qjson = query.json()
doi = qjson[top_hit][0]['doi']
if not doi:
doi = "No DOI found."
except KeyError:
doi = "Key error. ID likely deprecated."
return doi
def displayRefs():
"""Display relevant references"""
print('''
- The following publications pertain to HHSuite:
- Alva V., Nam SZ., Söding J., Lupas AN. (2016)
The MPI bioinformatics Toolkit as an integrative platform for advanced protein sequence and structure analysis.
Nucleic Acids Res. pii: gkw348. PMID: 27131380
- Remmert M., Biegert A., Hauser A., Söding J. (2011) HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment.
Nat Methods. 9(2):173-5. doi: 10.1038/nmeth.1818. PMID: 22198341
- Söding J., Biegert A., Lupas AN. (2005) The HHpred interactive server for protein homology detection and structure prediction.
Nucleic Acids Res 33 (Web Server issue), W244-W248. PMID: 15980461
- Söding J. (2005) Protein homology detection by HMM-HMM comparison.
Bioinformatics 21: 951-960. PMID: 15531603
- Hildebrand A., Remmert A., Biegert A., Söding J. (2009) Fast and accurate automatic structure prediction with HHpred.
Proteins 77(Suppl 9):128-32. doi: 10.1002/prot.22499. PMID: 19626712
- Meier A., Söding J. (2015) Automatic Prediction of Protein 3D Structures by Probabilistic Multi-template Homology Modeling.
PLoS Comput Biol. 11(10):e1004343. doi: 10.1371/journal.pcbi.1004343. PMID: 26496371
''')
sys.exit(1)
def parseArgs():
"""Parse commandline arguments"""
import argparse
try:
parser = argparse.ArgumentParser(description='This script converts HHpreds verbose output in to a full table for subsequent analysis.')
parser.add_argument(
'-i',
'--infile',
action='store',
help='The HHpred output file to parse.')
parser.add_argument(
'-v',
'--verbose',
action='store_true',
help='Print additional messages to screen. Default behaviour is false, only the result would be printed to screen for piping etc.')
parser.add_argument(
'-o',
'--outfile',
action='store',
default='None',
help='Output file name to store results in. If none provided, the default will be infile.tsv.')
parser.add_argument(
'-b',
'--bibliography',
action='store_true',
help='Display references.')
parser.add_argument(
'-d',
'--doi',
action='store_true',
help='Try to retrieve relevant publications from PDB RESTful API. Warning, this will significantly slow down the processing of results.')
args = parser.parse_args()
except:
print "An exception occured with argument parsing. Check your provided options."
traceback.print_exc()
return parser.parse_args()
def main():
"""Make function calls and handle arguments for tabulateHHpred.py"""
args = parseArgs()
if args.bibliography is True:
displayRefs()
verbose = args.verbose
hhresult_file = args.infile
indir = os.path.dirname(os.path.abspath(hhresult_file))
split = os.path.splitext(args.infile)
basename = os.path.basename(split[0])
if args.outfile is 'None':
outfile = indir + '/' + basename + '.tsv'
else:
outfile = args.outfile
# Main code begins:
top_hit, top_hit_full, top_prob, top_eval, top_pval, top_score = hhparse(hhresult_file, verbose)
full_desc = getFullDesc(hhresult_file, top_hit_full, verbose)
row = (basename + "\t" + str(top_hit) + "\t" + str(top_hit_full) + "\t" + str(top_prob) + "\t" + str(top_eval) + "\t" + str(top_pval) + "\t" + str(top_score) + "\t" + full_desc + "\n")
if args.doi is True:
doi = getDOI(top_hit)
row = (row.rstrip("\n") + "\t" + doi + "\n")
with open(outfile, 'w') as ofh:
ofh.write(row)
print(row)
elif args.doi is False:
with open(outfile, 'w') as ofh:
ofh.write(row)
print(row)
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment