Skip to content

Instantly share code, notes, and snippets.

@fanjin-z
Created May 10, 2019 22:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save fanjin-z/0830b61dc7550e9e86acc8056d4cc173 to your computer and use it in GitHub Desktop.
Save fanjin-z/0830b61dc7550e9e86acc8056d4cc173 to your computer and use it in GitHub Desktop.
De Novo peptide sequencing algorithm (python)
'''
MIT License
Copyright (c) 2019 Fanjin Zeng
This work is licensed under the terms of the MIT license, see <https://opensource.org/licenses/MIT>.
'''
class Peak():
'''
Nodes that store peak info
'''
def __init__(self, mass, intensity):
self.mass = mass
self.intensity = intensity
self.valid = False
self.prev = []
self.sequence = ''
self.score = - float('inf')
def build_graph(spectrum, mass2amino):
'''
Edges: all possible amino acid mass differences between peaks
Nodes: Mass of peaks
'''
mass2peak = {}
idx2peak = {}
for idx, (mass, intensity) in enumerate(sorted(spectrum.items())):
peak = Peak(mass, intensity)
mass2peak[mass] = peak
idx2peak[idx] = peak
for a in mass2amino.keys():
try:
mass2peak[mass-a]
peak.prev.append((mass2amino[a], mass2peak[mass-a]))
except:
pass
return mass2peak, idx2peak
def mark_path2end(peak):
'''Recursive marks peaks that connect to the measure mass peak'''
if peak.valid:
return
peak.valid = True
for ch, p in peak.prev:
mark_path2end(p)
def mark_all(mass2peak):
'''Mark all peak as valid'''
for mass, peak in mass2peak.items():
peak.valid = True
def find_sequence(measure_mass, mass2peak, offset, use_yion):
'''Find valid sequence by de novo algorithm'''
y_mass = measure_mass - offset + 1
if use_yion and y_mass in mass2peak:
mass2peak[offset].score += mass2peak[y_mass].intensity
for mass,peak in sorted(mass2peak.items()):
if peak.valid:
score = - float('inf')
best_prev = None
best_ch = None
for ch, p in peak.prev:
if p.score > score:
score = p.score
best_prev = p
best_ch = ch
if best_prev is not None:
peak.score = score + peak.intensity
peak.sequence = best_prev.sequence + best_ch
y_mass = measure_mass - mass + 1
if use_yion and y_mass in mass2peak:
peak.score += mass2peak[y_mass].intensity
def check_spectrum(spectrum, offset, end_peak):
'''Check if H and measure mass supposed peaks in spectrum '''
if offset not in spectrum:
spectrum[offset] = 0
if end_peak not in spectrum:
spectrum[end_peak] = 0
def fill_spectrum(spectrum, offset, end_peak):
'''Fill gaps in spectrum peaks'''
for mass in range(offset, end_peak+1):
if mass not in spectrum:
spectrum[mass] = 0
def init_score(mass2peak, offset, subseq):
'''Set initial score'''
if subseq in ['both', 'right']:
for mass, peak in mass2peak.items():
peak.score = peak.intensity
else:
mass2peak[offset].score = mass2peak[offset].intensity
def de_novo(measure_mass, spectrum, mass2amino, use_yion=False, enable_missing_peak=False, subseq=None):
'''
Run De Novo algorithm
measure_mass: INT, unfragmented y-ion mass.
spectrum: DICT(INT: INT), mass: intensity
mass2amino: DICT(String: INT), amino acid character: mass
use_yion: Boolean, If consider y-ion intensity.
enable_missing_peak: Boolean, If consider missing peaks
subseq: One of [None, 'both', 'left', 'right']
'''
offset = 1
end_peak = measure_mass - 19 + offset
check_spectrum(spectrum, offset, end_peak)
if enable_missing_peak: fill_spectrum(spectrum, offset, end_peak)
mass2peak, idx2peak = build_graph(spectrum, mass2amino)
if subseq in ['both', 'left']:
mark_all(mass2peak)
else:
peak = mass2peak[end_peak]
mark_path2end(peak)
init_score(mass2peak, offset, subseq)
find_sequence(measure_mass, mass2peak, offset, use_yion)
return mass2peak
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment