Mass_spec_AA
###PYTHON#############
#Purpose : Replication of MS spectra analysis
#Author : Aditya Ambati
#date : May 14 2017
#update : version 1
#####import libraries
from itertools import chain
import re
import pandas as pd
import numpy as np
def peponly(pepseq):
    ## keep only uppercase letters, i.e. strip modification tags and flanking residue dots
    return re.sub(pattern='[^A-Z]*', repl='', string=pepseq)
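## illustrative example (hypothetical Byonic-style peptide string, not taken from the data):
##   peponly('K.N[+0.98]ESTLHLVLR.L')  ->  'KNESTLHLVLR'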
def process_raw_spectra(spectra, outfile):
    spectra2 = spectra[['Protein\nRank', 'Peptide\n< ProteinMetrics Confidential >', 'Modification Type(s)', 'Starting\nposition', 'Protein Name']]
    ## rename the columns
    spectra2.columns = ['protein_rank', 'peptide', 'modtype', 'start_pos', 'protein_name']
    ## keep only the alphabetic residues with peponly
    spectra2.loc[:, ('pep')] = [peponly(i) for i in spectra2.peptide]  ### assign the raw peptide to a new column
    spectra2.loc[:, ('len1')] = [len(i)-1 for i in spectra2.pep]
    untagged = []
    for i in range(len(spectra2.pep)):
        line = spectra2.pep[i]
        untagged.append(line[1:spectra2.len1[i]])  ## remove first and last character (flanking residues)
    spectra2.loc[:, ('tag_rem_pep')] = untagged  ## add this into the df
    spectra2.loc[:, ('tag_rm_len')] = [len(i) for i in spectra2.tag_rem_pep]
    spectra2.loc[:, ('tag_rm_len')] = spectra2['tag_rm_len'].astype(int)
    spectra2.loc[:, ('templen')] = spectra2.tag_rm_len + spectra2.start_pos  ### starting position + untagged peptide length (exclusive end of the covered range)
    spectra2.loc[:, ('org_length')] = [len(i) for i in spectra2.peptide]  ## original length of the raw MS peptide
    maxDT = spectra2.groupby(['protein_rank', 'protein_name']).agg({'templen': 'max'})  ## group by protein and get the max covered position of each unique protein
    maxDT.reset_index(inplace=True)  ## reset the index
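    ## build a long "res" table with one row per residue position per protein (1..max covered
    ## position), so peptide coverage ('total') and modification counts ('mod') can be tallied per position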
    emp = []
    for q, w in zip(maxDT.protein_rank, maxDT.templen):
        z = pd.DataFrame(np.repeat(q, w), np.arange(1, w+1))  ## one row per position 1..w for protein rank q
        z = z.reset_index()
        emp.append(z)
    res = pd.concat(emp)
    res.columns = ['postion', 'protein_rank']
    res.reset_index(drop=True, inplace=True)
    print(res)
    #res.drop('index', axis=1, inplace=True)
    ### make a db of every peptide in the ms file
    res['total'] = 0
    res['pep'] = 0
    for i in range(0, len(spectra2.protein_rank)):
        index_pos = np.arange(spectra2.start_pos[i], spectra2.templen[i])
        index_pos2 = res[res.postion.isin(index_pos)]
        res.loc[index_pos2.index[index_pos2.protein_rank == spectra2.protein_rank[i]], ('total')] = res.loc[index_pos2.index[index_pos2.protein_rank == spectra2.protein_rank[i]], ('total')] + 1
        res.loc[index_pos2.index[index_pos2.protein_rank == spectra2.protein_rank[i]], ('pep')] = list(("".join(spectra2.tag_rem_pep[i])))
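    ## after this loop 'total' holds, for every protein position, the number of spectra whose
    ## peptide spans that position, and 'pep' holds the residue letter observed there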
    #re.search("\\[[^A-Z]*\\]", spectra2.peptide[1])
    #spec_uniq = spectra2[['protein_rank', 'tag_rem_pep', 'protein_name', 'start_pos', 'tag_rm_len']].drop_duplicates()
    #res.loc[:, total[index_pos2.index[index_pos2.protein_rank == spectra2.protein_rank[i]]]] = res.total[index_pos2.index[index_pos2.protein_rank == spectra2.protein_rank[i]]]
    #[m.start() for m in re.finditer("\\[[^A-Z]*\\]", q)]
    ############## get the peptide with mods
    for i in range(0, len(spectra2.peptide)):
        spectra2.loc[i, ('pep_with_tag')] = spectra2.peptide[i][2:spectra2.org_length[i]-2]
    spectra2.loc[:, 'after_tag_len'] = [len(i) for i in spectra2.pep_with_tag]
    #[m.start() for m in re.finditer("\\[[^A-Z]*\\]", spectra2.pep_with_tag[1145])]
    modslen = []
    for i in range(0, len(spectra2.pep_with_tag)):
        modslen.append([m.start() for m in re.finditer("\\[[^A-Z]*\\]", spectra2.pep_with_tag[i])])
    mods_no = []
    for i in range(0, len(spectra2.pep_with_tag)):
        mods_no.append(re.findall("\\[[^A-Z]*\\]", spectra2.pep_with_tag[i]))
    offset_mod = []
    for mods in mods_no:
        mods_temp = [len(i) for i in mods]
        offset_mod.append(mods_temp)
    res['mod'] = 0
    n = 0
    d = 0
    test = []
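    ## modslen holds the start index of every "[...]" modification tag inside pep_with_tag, and
    ## offset_mod the character length of each tag. The branch chain below converts each tag index
    ## into a residue index by subtracting the cumulative length of the preceding tags, then adds
    ## start_pos - 1 to get the absolute position in the protein. Only peptides with 1-7 tags are
    ## handled explicitly; n and d count peptides with and without a usable internal tag.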
    for i, j, k, l, rank in zip(modslen, offset_mod, spectra2.pep_with_tag, spectra2.start_pos, spectra2.protein_rank):
        if len(i) != 0 and i[0] >= 1:
            n += 1
            #print i, j, k, l, rank
            ptms = []
            for s in range(len(i)):
                ## every branch breaks immediately, so s is always 0 and the list is built in one pass
                if len(i) == 1:
                    ptms.append([i[s]])
                    break
                elif len(i) == 2:
                    ptms.append([i[s], i[s+1]-j[s]])
                    break
                elif len(i) == 3:
                    ptms.append([i[s], i[s+1]-j[s], i[s+2]-np.sum(j[s:s+2])])
                    break
                elif len(i) == 4:
                    ptms.append([i[s], i[s+1]-j[s], i[s+2]-np.sum(j[s:s+2]), i[s+3]-np.sum(j[s:s+3])])
                    break
                elif len(i) == 5:
                    ptms.append([i[s], i[s+1]-j[s], i[s+2]-np.sum(j[s:s+2]), i[s+3]-np.sum(j[s:s+3]), i[s+4]-np.sum(j[s:s+4])])
                    break
                elif len(i) == 6:
                    ptms.append([i[s], i[s+1]-j[s], i[s+2]-np.sum(j[s:s+2]), i[s+3]-np.sum(j[s:s+3]), i[s+4]-np.sum(j[s:s+4]), i[s+5]-np.sum(j[s:s+5])])
                    break
                elif len(i) == 7:
                    ptms.append([i[s], i[s+1]-j[s], i[s+2]-np.sum(j[s:s+2]), i[s+3]-np.sum(j[s:s+3]), i[s+4]-np.sum(j[s:s+4]), i[s+5]-np.sum(j[s:s+5]), i[s+6]-np.sum(j[s:s+6])])
                    break
            ptms1 = list(chain.from_iterable(ptms))
            ptms2 = np.array(ptms1) + l - 1
            print(ptms2)
            test.append(ptms2.tolist())
            index_mod = res[res.postion.isin(ptms2)]
            res.loc[index_mod.index[index_mod.protein_rank == rank], ('mod')] = res.loc[index_mod.index[index_mod.protein_rank == rank], ('mod')] + 1
            #print np.array(ptms)[1:]+l-1
        else:
            d += 1
            test.append(0)
    spectra2['mod_pos'] = test  ## add the PTM positions into the main df
    ## merge the res with protein names
    res2 = res.merge(maxDT, on='protein_rank')
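    ## the block below parses the mutated position out of the protein name; mutant entries are
    ## assumed to carry that position in parentheses, e.g. '>HA-...(123)...' (format assumption)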
    ########## get the mutated pos if present
    res2['mut_pos'] = 0
    mut_pos = []
    for i in range(0, len(res2.protein_name)):
        q = re.findall(".+\\(([0-9]+)\\).+", res2.protein_name[i])
        mut_pos.append(''.join(q))
    new_list = [element or '0' for element in mut_pos]  ## empty matches become '0'
    res2.loc[:, ('mut_pos')] = new_list
    res2.loc[:, ('mut_pos')] = res2['mut_pos'].astype(int)
    #
    ########## remove all zero occurence positions to reduce data
    db1 = res2.loc[res2['total'] != 0]
    #### select only identified flu proteins
    flu_mut = db1[db1.protein_name.str.contains('>HA-|>NP-|>NA-|>MA1-|>PB2-')]
    flu_mut.reset_index(inplace=True, drop=True)
    flu_mut.loc[:, ('flu_mut_prot')] = [flu_mut.protein_name[i][0:4] for i in range(0, len(flu_mut.protein_name))]
    flu_mut.loc[:, ('protein_name_pos')] = [re.sub(".+\\(([0-9]+)\\).+", "\\1", flu_mut.protein_name[i]) for i in range(0, len(flu_mut.protein_name))]
    #### select the other proteins starting with >gi
    non_mut = db1[db1.protein_name.str.contains('>gi')]
    ### keep only positions that coincide with the mutated position
    flu_mut1 = flu_mut.loc[flu_mut['postion'] == flu_mut.mut_pos]
    flu_mut1.sort_values(['total'], ascending=False, inplace=True)  ## sort the values
    flu_mut1.reset_index(inplace=True, drop=True)
    flu_mut1.loc[:, ('pep_rank')] = list(range(1, len(flu_mut1)+1))  ## make a new col called pep_rank
    flu_mut2 = flu_mut1.drop(['templen'], axis=1)
    flu_mut2.reset_index(inplace=True, drop=True)
    flu_mut2.loc[:, ('flu_mut_prot')] = [flu_mut2.protein_name[i][0:4] for i in range(0, len(flu_mut2.protein_name))]
    flu_mut2.loc[:, ('protein_name_pos')] = [re.sub(".+\\(([0-9]+)\\).+", "\\1", flu_mut2.protein_name[i]) for i in range(0, len(flu_mut2.protein_name))]
    ## get unmutated occurence for each mutation to calculate the ratios
    for i in range(0, len(flu_mut2)):
        flu_mut2.loc[i, ('unmutated_occurence')] = np.sum(flu_mut.total[(flu_mut.flu_mut_prot == flu_mut2.flu_mut_prot[i]) & (flu_mut.protein_name_pos != flu_mut2.protein_name_pos[i]) & (flu_mut.postion == flu_mut2.postion[i])])
    ### take the non_mut wild type entries and assign protein IDs
    non_mut.loc[:, ('id1')] = 'NaN'
    non_mut.reset_index(drop=True, inplace=True)
    non_mut.loc[non_mut.protein_name.str.contains('hemagglutinin'), ('id1')] = ">HA"
    non_mut.loc[non_mut.protein_name.str.contains('nucleocapsid protein'), ('id1')] = ">NP"
    non_mut.loc[non_mut.protein_name.str.contains('neuraminidase'), ('id1')] = ">NA"
    non_mut.loc[non_mut.protein_name.str.contains('polymerase PB2'), ('id1')] = ">PB2"
    non_mut.loc[non_mut.protein_name.str.contains('matrix protein 1'), ('id1')] = ">MA1"
    ## remove others that are not needed
    non_mut1 = non_mut.loc[non_mut['id1'] != 'NaN']
    non_mut1.reset_index(drop=True, inplace=True)
    ############ add matching wild type flu peptide counts to the unmutated occurrences
    for i in range(0, len(flu_mut2.protein_name)):
        for j in range(0, len(non_mut1.id1)):
            ## strip the trailing '-' so the 4-character prefix (e.g. '>HA-') matches the id1 label (e.g. '>HA')
            if non_mut1.id1[j] == flu_mut2.flu_mut_prot[i].rstrip('-') and non_mut1.postion[j] == flu_mut2.postion[i]:
                flu_mut2.loc[i, ('unmutated_occurence')] = non_mut1.loc[j, ('total')] + flu_mut2.loc[i, ('unmutated_occurence')]
    flu_mut2.loc[:, ('Ratio_mut_vs_unmut')] = flu_mut2.total / flu_mut2.unmutated_occurence
    flu_mut2.loc[:, ('Description')] = [re.sub(' .*$', '', i) for i in flu_mut2.protein_name]
    flu_mut2.to_csv(outfile, index=False)
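## driver: 'vacc_list' is expected to be a plain-text file with one semicolon-delimited spectra
## export per line; each output CSV name is built from the first two '_'-separated tokens of the
## input file name plus the '_TRYP_May17_db.csv' suffix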
vacc_list = [i.replace('\n', '') for i in open('vacc_list', 'r')]
for files in vacc_list:
    spectra = pd.read_csv(files, header=0, sep=";")
    outfile = '_'.join(files.split('_')[0:2]) + '_TRYP_May17_db.csv'
    process_raw_spectra(spectra, outfile)
    print(outfile)
### make a db file containing all the analysis files
dblist = [i.replace('\n', '') for i in open('dblist', 'r')]
vacc_ids = [dblist[i].split('_')[1] + str(i+1) for i in range(0, len(dblist))]
vacc_mut = []
for i in range(0, len(vacc_ids)):
    tempfile = pd.read_csv(dblist[i], header=0)
    tempfile['Description'] = [re.sub('>', '', d) for d in tempfile.Description]  ## drop the leading '>'
    vacc_mut.append(tempfile)
## per-sample data frames (assumes dblist is ordered ARP1-ARP4, PAN5-PAN8, ARP9)
ARP1 = vacc_mut[0]
ARP2 = vacc_mut[1]
ARP3 = vacc_mut[2]
ARP4 = vacc_mut[3]
PAN5 = vacc_mut[4]
PAN6 = vacc_mut[5]
PAN7 = vacc_mut[6]
PAN8 = vacc_mut[7]
ARP9 = vacc_mut[8]
dbscols = []
for i in range(0, len(vacc_ids)):
    db_ids = ((vacc_ids[i] + ' ') * 4).split(' ')[:4]
    print(db_ids)
    db_ids[0] = db_ids[0] + '_M'
    db_ids[1] = db_ids[1] + '_I'
    db_ids[2] = db_ids[2] + '_P_Rank'
    db_ids[3] = db_ids[3] + '_Ratio_M_vs_unmut'
    dbscols.extend(db_ids)
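## each sample id contributes four columns to the database, e.g. for id 'ARP1':
## ARP1_M (mutated-position count), ARP1_I (unmutated occurrence), ARP1_P_Rank (peptide rank)
## and ARP1_Ratio_M_vs_unmut (ratio of mutated to unmutated counts)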
## read in the database file, modify and initialize
db = pd.read_csv('/home/labcomp/Desktop/databasefinale.csv', header=0)
database = db[['Description', 'Position', 'MutatedAA', 'InitialAA']]
or_cols = list(database.columns.values)  ## extract existing DB columns to re-index the DB frame with the new columns
or_cols.extend(dbscols)
database = database.reindex(columns=or_cols)  ## reassign index
########################################################################
for i in range(0, len(ARP1)):
    database.loc[(database['Description'] == ARP1.Description[i]), ('ARP1_M')] = ARP1.total[i]
    database.loc[(database['Description'] == ARP1.Description[i]), ('ARP1_I')] = ARP1.unmutated_occurence[i]
    database.loc[(database['Description'] == ARP1.Description[i]), ('ARP1_P_Rank')] = ARP1.pep_rank[i]
    database.loc[(database['Description'] == ARP1.Description[i]), ('ARP1_Ratio_M_vs_unmut')] = ARP1.Ratio_mut_vs_unmut[i]
for i in range(0, len(ARP2)):
    database.loc[(database['Description'] == ARP2.Description[i]), ('ARP2_M')] = ARP2.total[i]
    database.loc[(database['Description'] == ARP2.Description[i]), ('ARP2_I')] = ARP2.unmutated_occurence[i]
    database.loc[(database['Description'] == ARP2.Description[i]), ('ARP2_P_Rank')] = ARP2.pep_rank[i]
    database.loc[(database['Description'] == ARP2.Description[i]), ('ARP2_Ratio_M_vs_unmut')] = ARP2.Ratio_mut_vs_unmut[i]
for i in range(0, len(ARP3)):
    database.loc[(database['Description'] == ARP3.Description[i]), ('ARP3_M')] = ARP3.total[i]
    database.loc[(database['Description'] == ARP3.Description[i]), ('ARP3_I')] = ARP3.unmutated_occurence[i]
    database.loc[(database['Description'] == ARP3.Description[i]), ('ARP3_P_Rank')] = ARP3.pep_rank[i]
    database.loc[(database['Description'] == ARP3.Description[i]), ('ARP3_Ratio_M_vs_unmut')] = ARP3.Ratio_mut_vs_unmut[i]
for i in range(0, len(ARP4)):
    database.loc[(database['Description'] == ARP4.Description[i]), ('ARP4_M')] = ARP4.total[i]
    database.loc[(database['Description'] == ARP4.Description[i]), ('ARP4_I')] = ARP4.unmutated_occurence[i]
    database.loc[(database['Description'] == ARP4.Description[i]), ('ARP4_P_Rank')] = ARP4.pep_rank[i]
    database.loc[(database['Description'] == ARP4.Description[i]), ('ARP4_Ratio_M_vs_unmut')] = ARP4.Ratio_mut_vs_unmut[i]
for i in range(0, len(ARP9)):
    database.loc[(database['Description'] == ARP9.Description[i]), ('ARP9_M')] = ARP9.total[i]
    database.loc[(database['Description'] == ARP9.Description[i]), ('ARP9_I')] = ARP9.unmutated_occurence[i]
    database.loc[(database['Description'] == ARP9.Description[i]), ('ARP9_P_Rank')] = ARP9.pep_rank[i]
    database.loc[(database['Description'] == ARP9.Description[i]), ('ARP9_Ratio_M_vs_unmut')] = ARP9.Ratio_mut_vs_unmut[i]
for i in range(0, len(PAN5)):
    database.loc[(database['Description'] == PAN5.Description[i]), ('PAN5_M')] = PAN5.total[i]
    database.loc[(database['Description'] == PAN5.Description[i]), ('PAN5_I')] = PAN5.unmutated_occurence[i]
    database.loc[(database['Description'] == PAN5.Description[i]), ('PAN5_P_Rank')] = PAN5.pep_rank[i]
    database.loc[(database['Description'] == PAN5.Description[i]), ('PAN5_Ratio_M_vs_unmut')] = PAN5.Ratio_mut_vs_unmut[i]
for i in range(0, len(PAN6)):
    database.loc[(database['Description'] == PAN6.Description[i]), ('PAN6_M')] = PAN6.total[i]
    database.loc[(database['Description'] == PAN6.Description[i]), ('PAN6_I')] = PAN6.unmutated_occurence[i]
    database.loc[(database['Description'] == PAN6.Description[i]), ('PAN6_P_Rank')] = PAN6.pep_rank[i]
    database.loc[(database['Description'] == PAN6.Description[i]), ('PAN6_Ratio_M_vs_unmut')] = PAN6.Ratio_mut_vs_unmut[i]
for i in range(0, len(PAN7)):
    database.loc[(database['Description'] == PAN7.Description[i]), ('PAN7_M')] = PAN7.total[i]
    database.loc[(database['Description'] == PAN7.Description[i]), ('PAN7_I')] = PAN7.unmutated_occurence[i]
    database.loc[(database['Description'] == PAN7.Description[i]), ('PAN7_P_Rank')] = PAN7.pep_rank[i]
    database.loc[(database['Description'] == PAN7.Description[i]), ('PAN7_Ratio_M_vs_unmut')] = PAN7.Ratio_mut_vs_unmut[i]
for i in range(0, len(PAN8)):
    database.loc[(database['Description'] == PAN8.Description[i]), ('PAN8_M')] = PAN8.total[i]
    database.loc[(database['Description'] == PAN8.Description[i]), ('PAN8_I')] = PAN8.unmutated_occurence[i]
    database.loc[(database['Description'] == PAN8.Description[i]), ('PAN8_P_Rank')] = PAN8.pep_rank[i]
    database.loc[(database['Description'] == PAN8.Description[i]), ('PAN8_Ratio_M_vs_unmut')] = PAN8.Ratio_mut_vs_unmut[i]
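## NOTE: the nine per-sample blocks above are identical apart from the frame and column prefix.
## A minimal sketch of an equivalent single loop (assuming vacc_ids is ordered to match vacc_mut,
## i.e. ARP1-ARP4, PAN5-PAN8, ARP9) would be:
#   for sample_id, frame in zip(vacc_ids, vacc_mut):
#       for i in range(0, len(frame)):
#           hit = database['Description'] == frame.Description[i]
#           database.loc[hit, sample_id + '_M'] = frame.total[i]
#           database.loc[hit, sample_id + '_I'] = frame.unmutated_occurence[i]
#           database.loc[hit, sample_id + '_P_Rank'] = frame.pep_rank[i]
#           database.loc[hit, sample_id + '_Ratio_M_vs_unmut'] = frame.Ratio_mut_vs_unmut[i]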