Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save adiamb/86bbace828e5234a223280729cb14d08 to your computer and use it in GitHub Desktop.
Save adiamb/86bbace828e5234a223280729cb14d08 to your computer and use it in GitHub Desktop.
MS_Single_Mutations_code_trial
###PYTHON#############
#Purpose : Replication of MS spectra analysis
#Author : Aditya Ambati
#date : May 14 2017
#update : version 1
#####import libraries
from itertools import chain
import pandas as pd
import numpy as np
%matplotlib
import re
spectra = pd.read_csv('AFLSA208A_PAN_TRYPSIN.csv', header = 0, sep = ";") ## read in the file
### take only selected columns
##
#array(['Query #:z', 'Protein\nRank','Peptide\n< ProteinMetrics Confidential >', 'Glycans\nNHFAGNa','Modification Type(s)', 'Observed\nm/z', 'z', 'Observed\n(M+H)',
#'Calc.\nmass (M+H)', 'Off-by-x\nerror', 'Mass error\n(ppm)',
#'Starting\nposition', 'Cleavage', 'Score', 'Delta', 'Delta\nMod',
#'|Log Prob|', '# of unique\npeptides', 'Protein Name',
# 'Protein\nDB number', 'Comment', 'Scan #', 'Scan Time'], dtype=object)
## subset the df
spectra2=spectra[['Protein\nRank','Peptide\n< ProteinMetrics Confidential >','Modification Type(s)', 'Starting\nposition', 'Protein Name']]
## rename the columns
spectra2.columns = ['protein_rank', 'peptide', 'modtype', 'start_pos', 'protein_name']
## define a function to return only alphabets
def peponly(pepseq):
return(re.sub(pattern='[^A-Z]*',repl='', string=pepseq))
spectra2.loc[:, ('pep')] = [peponly(i) for i in spectra2.peptide] ### assign to a new column the raw peptide
spectra2.loc[:, ('len1')] = [len(i)-1 for i in spectra2.pep]
untagged=[]
for i in range(len(spectra2.pep)):
line = spectra2.pep[i]
untagged.append(line[1:spectra2.len1[i]]) ## remove first and last character
spectra2.loc[:, ('tag_rem_pep')] = untagged ## add this into the df
spectra2.loc[:, ('tag_rm_len')] = [len(i) for i in spectra2.tag_rem_pep]
spectra2.loc[:, ('tag_rm_len')]=spectra2['tag_rm_len'].astype(int)
spectra2.loc[:, ('templen')] = spectra2.tag_rm_len + spectra2.start_pos ### add the startingpos + untagged peptide length
spectra2.loc[:, 'org_length'] = [len(i) for i in spectra2.peptide] ## get the orginal length of the ms raw peptide
import numpy
maxDT=spectra2.groupby(['protein_rank', 'protein_name']).agg({'templen':'max'}) ## group by the protein name and get max length of each uniq protein
maxDT = maxDT.reset_index() ## reset the index
emp =[]
for q, w, in zip(maxDT.protein_rank, maxDT.templen):
z=pd.DataFrame(np.repeat(q, w), np.arange(1, w+1))
z = z.reset_index()
emp.append(z)
res = pd.concat(emp)
res.columns = ['postion', 'protein_rank']
res=res.reset_index()
res.drop('index', axis =1, inplace=True)
### make a db of every peptide in the ms file
res['total'] = 0
res['pep'] = 0
for i in range(0, len(spectra2.protein_rank)):
index_pos = np.arange(spectra2.start_pos[i], spectra2.templen[i])
index_pos2=res[res.postion.isin(index_pos)]
res.loc[index_pos2.index[index_pos2.protein_rank == spectra2.protein_rank[i]], ('total')] = res.loc[index_pos2.index[index_pos2.protein_rank == spectra2.protein_rank[i]], ('total')]+1
res.loc[index_pos2.index[index_pos2.protein_rank == spectra2.protein_rank[i]], ('pep')] = list(("".join(spectra2.tag_rem_pep[i])))
#re.search("\\[[^A-Z]*\\]", spectra2.peptide[1])
#spec_uniq=spectra2[['protein_rank', 'tag_rem_pep', 'protein_name', 'start_pos', 'tag_rm_len']].drop_duplicates()
#res.loc[:, total[index_pos2.index[index_pos2.protein_rank == spectra2.protein_rank[i]]]] = res.total[index_pos2.index[index_pos2.protein_rank == spectra2.protein_rank[i]]]
#[m.start() for m in re.finditer("\\[[^A-Z]*\\]", q)]
##############get the peptide with mods
for i in range(0, len(spectra2.peptide)):
spectra2.loc[i, ('pep_with_tag')] = spectra2.peptide[i][2:spectra2.org_length[i]-2]
spectra2.loc[:, 'after_tag_len'] = [len(i) for i in spectra2.pep_with_tag]
#[m.start() for m in re.finditer("\\[[^A-Z]*\\]", spectra2.pep_with_tag[1145])]
modslen =[]
for i in range(0, len(spectra2.pep_with_tag)):
modslen.append([m.start() for m in re.finditer("\\[[^A-Z]*\\]", spectra2.pep_with_tag[i])])
mods_no =[]
for i in range(0, len(spectra2.pep_with_tag)):
mods_no.append(re.findall("\\[[^A-Z]*\\]", spectra2.pep_with_tag[i]))
offset_mod =[]
for mods in mods_no:
mods_temp = [len(i) for i in mods]
offset_mod.append(mods_temp)
res['mod'] = 0
n =0
d =0
test =[]
for i, j, k, l, rank in zip(modslen, offset_mod, spectra2.pep_with_tag, spectra2.start_pos, spectra2.protein_rank):
if len(i) !=0 and i[0] >= 1:
n += 1
#print i, j, k, l, rank
ptms =[]
for s in range(len(i)):
if len(i) == 1:
ptms.append([i[s]])
break
elif len(i) == 2:
ptms.append([i[s], i[s+1]-j[s]])
break
elif len(i) == 3:
ptms.append([i[s], i[s+1]-j[s], i[s+2]-np.sum(j[s:s+2])])
break
elif len(i) == 4:
ptms.append([i[s], i[s+1]-j[s], i[s+2]-np.sum(j[s:s+2]), i[s+3]-np.sum(j[s:s+3])])
break
elif len(i) == 5:
ptms.append([i[s], i[s+1]-j[s], i[s+2]-np.sum(j[s:s+2]), i[s+3]-np.sum(j[s:s+3]), i[s+4]-np.sum(j[s:s+4])])
break
elif len(i) == 6:
ptms.append([i[s], i[s+1]-j[s], i[s+2]-np.sum(j[s:s+2]), i[s+3]-np.sum(j[s:s+3]), i[s+4]-np.sum(j[s:s+4]), i[s+5]-np.sum(j[s:s+5])])
break
elif len(i) == 7:
ptms.append([i[s], i[s+1]-j[s], i[s+2]-np.sum(j[s:s+2]), i[s+3]-np.sum(j[s:s+3]), i[s+4]-np.sum(j[s:s+4]), i[s+5]-np.sum(j[s:s+5]), i[s+6]-np.sum(j[s:s+6])])
break
ptms1 = list(chain.from_iterable(ptms))
ptms2 = np.array(ptms1)+l-1
print ptms2
test.append(ptms2.tolist())
index_mod=res[res.postion.isin(ptms2)]
res.loc[index_mod.index[index_mod.protein_rank == rank], ('mod')] = res.loc[index_mod.index[index_mod.protein_rank == rank], ('mod')]+1
#print np.array(ptms)[1:]+l-1
else:
d+=1
test.append(0)
spectra2['mod_pos'] = test ## add in the PTM positions into the main df
## merge the res with protein names
res2=res.merge(maxDT, on='protein_rank')
########## get the mutated pos if present
res2['mut_pos'] = 0
mut_pos =[]
for i in range(0, len(res2.protein_name)):
q=re.findall(".+\\(([0-9]+)\\).+", res2.protein_name[i])
mut_pos.append(''.join(q))
new_list = [element or '0' for element in mut_pos]
res2.loc[:, ('mut_pos')]= new_list
res2.loc[:, ('mut_pos')] = res2['mut_pos'].astype(int)
#
########## remove all zero occurence positions to reduce data
db1=res2.loc[res2['total'] !=0]
#### select only identified flu proteins
flu_mut=db1[db1.protein_name.str.contains('>HA-|>NP-|>NA-|>MA1-|>PB2-')]
flu_mut.reset_index(inplace =True, drop =True)
flu_mut.loc[:, ('flu_mut_prot')]=[flu_mut.protein_name[i][0:4] for i in range(0, len(flu_mut.protein_name))]
flu_mut.loc[:, ('protein_name_pos')]= [re.sub(".+\\(([0-9]+)\\).+","\\1", flu_mut.protein_name[i]) for i in range(0, len(flu_mut.protein_name))]
#### select the others proteins starting with >gi
non_mut = db1[db1.protein_name.str.contains('>gi')]
### TAKE ONLY PEPTIDE THAT HAVE SAME POSITION AS THE MUTATED POSITION
flu_mut1=flu_mut.loc[flu_mut['postion'] == flu_mut.mut_pos]
flu_mut1.sort_values(['total'], ascending=False, inplace=True) ## sort the values
flu_mut1.reset_index(inplace=True, drop =True)
flu_mut1.loc[:, ('pep_rank')] = range(1, len(flu_mut1)+1) ## make a new col called rank
flu_mut2 = flu_mut1.drop(['templen'], axis=1)
flu_mut2.reset_index(inplace=True, drop=True)
flu_mut2.loc[:, ('flu_mut_prot')]=[flu_mut2.protein_name[i][0:4] for i in range(0, len(flu_mut2.protein_name))]
flu_mut2.loc[:, ('protein_name_pos')]= [re.sub(".+\\(([0-9]+)\\).+","\\1", flu_mut2.protein_name[i]) for i in range(0, len(flu_mut2.protein_name))]
## get unmutated occurence for each mutation to calculate the ratios
for i in range(0, len(flu_mut2)):
flu_mut2.loc[i, ('unmutated_occurence')]=np.sum(flu_mut.total[(flu_mut.flu_mut_prot == flu_mut2.flu_mut_prot[i]) & (flu_mut.protein_name_pos != flu_mut2.protein_name_pos[i]) & (flu_mut.postion == flu_mut2.postion[i])])
###take the non_mut wild type files and assign protein IDs
non_mut.loc[:, ('id1')] = 'NaN'
non_mut.reset_index(drop=True, inplace=True)
non_mut.loc[non_mut.protein_name.str.contains('hemagglutinin'), ('id1')] = ">HA"
non_mut.loc[non_mut.protein_name.str.contains('nucleocapsid protein'), ('id1')] = ">NP"
non_mut.loc[non_mut.protein_name.str.contains('neuraminidase'), ('id1')] = ">NA"
non_mut.loc[non_mut.protein_name.str.contains('polymerase PB2'), ('id1')] = ">PB2"
non_mut.loc[non_mut.protein_name.str.contains('matrix protein 1'), ('id1')] = ">MA1"
## remove others that are not necessary
non_mut1=non_mut.loc[non_mut['id1'] != 'NaN']
non_mut1.reset_index(drop=True, inplace=True)
############################################################################## to match other wild type flu peptides
for i in range(0, len(flu_mut2.protein_name)):
for j in range(0, len(non_mut1.id1)):
if non_mut1.id1[j] == flu_mut2.flu_mut_prot[i] and non_mut1.postion[j] == flu_mut2.postion[i]:
flu_mut2.loc[i, ('unmutated_occurence')]=non_mut1.loc[j, ('total')] + flu_mut2.loc[i, ('unmutated_occurence')]
flu_mut2.loc[:, ('Ratio_mut_vs_unmut')]=flu_mut2.total/flu_mut2.unmutated_occurence
flu_mut2.loc[:, ('Description')] = [re.sub(' .*$','', i) for i in flu_mut2.protein_name]
flu_mut2.to_csv('AFLSA208A_PAN_TRY_DB.csv', index=False)
### match to the database
flu_mut = pd.read_csv('AFLSA096AA_PAN_TRYP_May17_db.csv', header=0)
flu_mut['Description'] = [re.sub('>', '', flu_mut.Description[i]) for i in range(0, len(flu_mut.Description))]
## read in the database file
db = pd.read_csv('/home/labcomp/Desktop/databasefinale.csv', header=0)
db.drop(['Unnamed: 0'], axis=1, inplace=True)
db.columns=[re.sub('[.]', '_', db.columns.values[i]) for i in range(0, len(db.columns.values))]
########################################################################
for i in range(0, len(flu_mut.Description)):
db.loc[(db['Description'] == flu_mut.Description[i]), ('T_1_M')] = flu_mut.total[i]
db.loc[(db['Description'] == flu_mut.Description[i]), ('T_1_I')] = flu_mut.unmutated_occurence[i]
db.loc[(db['Description'] == flu_mut.Description[i]), ('T_1_Rank')] = flu_mut.pep_rank[i]
db.loc[(db['Description'] == flu_mut.Description[i]), ('Ratio_mut_vs_unmut')] = flu_mut.Ratio_mut_vs_unmut[i]
################### boolean indexing
res2.loc[(res2['protein_rank'] == 4)]
n=0
for lines in spectra2.tag_rem_pep:
a=re.findall('KTSSWP*', lines)
n += 1
if len(a) > 0:
break
res2['mut_pos'] = 0
mut_pos =[]
for i in range(0, len(res2.protein_name)):
q=re.findall(".+\\(([0-9]+)\\).+", res2.protein_name[i])
mut_pos.append(''.join(q))
new_list = [element or '0' for element in mut_pos]
res2.loc['mut_pos']= new_list
res2['mut_pos'] = res2['mut_pos'].astype(int)
res3=res2[res2['postion'] == res2.mut_pos]
res3.sort_values(['total'], ascending=False, inplace=True)
for i in maxDT.protein_name:
print re.findall('hemagglutinin', i)
res4=res3.loc[(res3['total']) !=0]
[1, 14, 24]
1 9 14 24 9
C[+71.037]DNTC[+71.037]M[+15.995]ESVK
12345678910
CDNTCMESVK
for i, j, k in zip(modslen, offset_mod, spectra2.pep_with_tag):
if len(i) > 1 and i[0] >= 1:
print i, j, k
for nos in range(0, len(i)-1):
print i[nos],
print i[nos+1] - j[nos]
elif len(i) >2:
#print i[nos]
#print i[nos+1] - j[nos]
print i[nos+2] - np.sum(j[nos:nos+1])
print i[nos+2] - j[nos+nos+1]
for i, j, k in zip(modslen, offset_mod, spectra2.pep_with_tag):
if len(i) > 1 and i[0] >= 1:
i = [6, 18, 27, 42]
j = [8, 8, 9, 9]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment