Created
January 21, 2020 10:05
-
-
Save mwalzer/1e49a6d18509d030a24a7f18574dbebb to your computer and use it in GitHub Desktop.
Old qcml script for looking into search engine performance.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
__author__ = 'walzer' | |
import sys | |
import argparse | |
import pyopenms as oms | |
import pandas as pd | |
from os import path | |
VERSION = "0.2" | |
def __main__(): | |
parser = argparse.ArgumentParser(version=VERSION) | |
parser.add_argument('-in', dest="inf", help='<Required> full path to the input idXML', required=True) | |
parser.add_argument('-out', dest="out", help="<Required> full path to the csv to be written", required=True) | |
parser.add_argument('-doi', '--dept_of_immuno-filenames', dest='doi', action='store_true', help="If filename conforms with the filename scheme of the dept. of immunology... (in doubt, it probably doesn'T)") | |
parser.set_defaults(doi=False) | |
parser.add_argument('-qcML', dest="qcml", action='store_true', help="If instead of writing a csv embedding in a qcml file - prerequisite: -out must be a existing qcml file") | |
parser.set_defaults(qcml=False) | |
options = parser.parse_args() | |
if len(sys.argv) <= 1: | |
parser.print_help() | |
sys.exit(1) | |
if not (options.inf or options.out): | |
parser.print_help() | |
sys.exit(1) | |
pros = list() | |
peps = list() | |
f = oms.IdXMLFile() | |
f.load(options.inf, pros, peps) | |
if options.qcml: | |
try: | |
qf = oms.QcMLFile() | |
qf.load(oms.String(options.out)) | |
except: | |
print "no usable qcml file given" | |
sys.exit(1) | |
SE = "NA" | |
SE = pros[0].getSearchEngine() | |
sesmv = "NA" | |
if SE == 'MS-GF+': | |
sesmv = 'MS:1002053' | |
elif SE == 'XTandem': | |
sesmv = 'E-Value' | |
elif SE == 'Comet': | |
sesmv = 'MS:1002257' | |
elif SE == 'Mascot': | |
sesmv = 'EValue' | |
if not sesmv: | |
print "no adequate search engine score found" | |
#sys.exit(1) | |
organ, repli, patient = None, None, None | |
if options.doi: | |
organ = path.basename(options.inf).split('_')[3] | |
repli = path.basename(options.inf).split('_')[6].split('#')[-1] | |
patient = path.basename(options.inf).split('_')[2] | |
if not organ or not repli or not patient: | |
options.doi = False | |
sepe = list() | |
nopep = False | |
for pep in peps: | |
pep.sort() | |
hits = pep.getHits() | |
st = pep.getScoreType() | |
if st != 'Posterior Error Probability' and not nopep: | |
print "Warning, no PEP previously calculated? (Found no score type matching.)" | |
nopep = True | |
if pep.metaValueExists('spectrum_reference'): | |
sn_ref = pep.getMetaValue('spectrum_reference') | |
sn = sn_ref.split('=')[-1] | |
else: | |
sn = '?' | |
sn_ref = '?' | |
for i, h in enumerate(hits): | |
row = dict() | |
row['file'] = path.basename(options.inf) | |
row['sequence'] = h.getSequence().toUnmodifiedString() | |
row['modified'] = h.getSequence().isModified() | |
row['rank'] = i + 1 | |
row['spectrum'] = sn | |
row['spectrum_reference'] = sn_ref # match with spectrum_native_id von featureXML and spectrum_reference from mapped PeptideIdentifications | |
if pep.getScoreType() == 'Posterior Error Probability': | |
row['PEP'] = h.getScore() | |
else: | |
row['PEP'] = "NA" | |
row['SE'] = SE | |
if h.metaValueExists(sesmv): | |
row['SEscore'] = h.getMetaValue(sesmv) | |
if st == 'q-value': | |
row['qvalue'] = h.getScore() | |
else: | |
if h.metaValueExists('q-value_score'): | |
row['qvalue'] = h.getMetaValue('q-value_score') | |
if h.metaValueExists('binder'): | |
if h.metaValueExists('weak'): | |
row['binder'] = 'weak' | |
elif h.metaValueExists('strong'): | |
row['binder'] = 'strong' | |
else: | |
row['binder'] = 'yes' | |
row['allele'] = h.getMetaValue("binder") | |
else: | |
row['binder'] = 'no' | |
if "decoy" not in h.getMetaValue("target_decoy"): | |
row['target'] = 'target' | |
else: | |
row['target'] = 'decoy' | |
if options.doi: | |
row['organ'] = organ | |
row['replicate'] = repli | |
row['patient'] = patient | |
sepe.append(row) | |
outframe = pd.DataFrame(sepe) | |
if options.qcml: | |
idxa = oms.Attachment() | |
idxa.name = "generic table" #"extended id tab" | |
idxa.cvRef = "qcML" | |
idxa.cvAcc = "QC:0000049" | |
idxa.qualityRef = "QC:0000025" | |
idxa.colTypes = list(outframe.columns) | |
rows = [[str(row[f]) for f in idxa.colTypes] for ind, row in outframe.iterrows()] #writes nan if cell not filled | |
idxa.tableRows = rows | |
ls = list() | |
qf.getRunNames(ls) | |
qf.addRunAttachment(ls[0], idxa) | |
#qf.store(oms.String(options.out.replace('%', 'perc'))) | |
qf.store(oms.String(options.out)) | |
else: | |
outframe.to_csv(options.out, sep='\t', index=False) | |
if __name__ == '__main__': | |
__main__() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@ypriverol for mzid just switch
IdXMLFile
withMzIdentMLFile