Created
May 18, 2017 19:24
-
-
Save adiamb/86bbace828e5234a223280729cb14d08 to your computer and use it in GitHub Desktop.
MS_Single_Mutations_code_trial
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
###PYTHON############# | |
#Purpose : Replication of MS spectra analysis | |
#Author : Aditya Ambati | |
#date : May 14 2017 | |
#update : version 1 | |
#####import libraries | |
from itertools import chain | |
import pandas as pd | |
import numpy as np | |
%matplotlib | |
import re | |
spectra = pd.read_csv('AFLSA208A_PAN_TRYPSIN.csv', header = 0, sep = ";") ## read in the file | |
### take only selected columns | |
## | |
#array(['Query #:z', 'Protein\nRank','Peptide\n< ProteinMetrics Confidential >', 'Glycans\nNHFAGNa','Modification Type(s)', 'Observed\nm/z', 'z', 'Observed\n(M+H)', | |
#'Calc.\nmass (M+H)', 'Off-by-x\nerror', 'Mass error\n(ppm)', | |
#'Starting\nposition', 'Cleavage', 'Score', 'Delta', 'Delta\nMod', | |
#'|Log Prob|', '# of unique\npeptides', 'Protein Name', | |
# 'Protein\nDB number', 'Comment', 'Scan #', 'Scan Time'], dtype=object) | |
## subset the df | |
spectra2=spectra[['Protein\nRank','Peptide\n< ProteinMetrics Confidential >','Modification Type(s)', 'Starting\nposition', 'Protein Name']] | |
## rename the columns | |
spectra2.columns = ['protein_rank', 'peptide', 'modtype', 'start_pos', 'protein_name'] | |
## define a function to return only alphabets | |
def peponly(pepseq): | |
return(re.sub(pattern='[^A-Z]*',repl='', string=pepseq)) | |
spectra2.loc[:, ('pep')] = [peponly(i) for i in spectra2.peptide] ### assign to a new column the raw peptide | |
spectra2.loc[:, ('len1')] = [len(i)-1 for i in spectra2.pep] | |
untagged=[] | |
for i in range(len(spectra2.pep)): | |
line = spectra2.pep[i] | |
untagged.append(line[1:spectra2.len1[i]]) ## remove first and last character | |
spectra2.loc[:, ('tag_rem_pep')] = untagged ## add this into the df | |
spectra2.loc[:, ('tag_rm_len')] = [len(i) for i in spectra2.tag_rem_pep] | |
spectra2.loc[:, ('tag_rm_len')]=spectra2['tag_rm_len'].astype(int) | |
spectra2.loc[:, ('templen')] = spectra2.tag_rm_len + spectra2.start_pos ### add the startingpos + untagged peptide length | |
spectra2.loc[:, 'org_length'] = [len(i) for i in spectra2.peptide] ## get the orginal length of the ms raw peptide | |
import numpy | |
maxDT=spectra2.groupby(['protein_rank', 'protein_name']).agg({'templen':'max'}) ## group by the protein name and get max length of each uniq protein | |
maxDT = maxDT.reset_index() ## reset the index | |
emp =[] | |
for q, w, in zip(maxDT.protein_rank, maxDT.templen): | |
z=pd.DataFrame(np.repeat(q, w), np.arange(1, w+1)) | |
z = z.reset_index() | |
emp.append(z) | |
res = pd.concat(emp) | |
res.columns = ['postion', 'protein_rank'] | |
res=res.reset_index() | |
res.drop('index', axis =1, inplace=True) | |
### make a db of every peptide in the ms file | |
res['total'] = 0 | |
res['pep'] = 0 | |
for i in range(0, len(spectra2.protein_rank)): | |
index_pos = np.arange(spectra2.start_pos[i], spectra2.templen[i]) | |
index_pos2=res[res.postion.isin(index_pos)] | |
res.loc[index_pos2.index[index_pos2.protein_rank == spectra2.protein_rank[i]], ('total')] = res.loc[index_pos2.index[index_pos2.protein_rank == spectra2.protein_rank[i]], ('total')]+1 | |
res.loc[index_pos2.index[index_pos2.protein_rank == spectra2.protein_rank[i]], ('pep')] = list(("".join(spectra2.tag_rem_pep[i]))) | |
#re.search("\\[[^A-Z]*\\]", spectra2.peptide[1]) | |
#spec_uniq=spectra2[['protein_rank', 'tag_rem_pep', 'protein_name', 'start_pos', 'tag_rm_len']].drop_duplicates() | |
#res.loc[:, total[index_pos2.index[index_pos2.protein_rank == spectra2.protein_rank[i]]]] = res.total[index_pos2.index[index_pos2.protein_rank == spectra2.protein_rank[i]]] | |
#[m.start() for m in re.finditer("\\[[^A-Z]*\\]", q)] | |
##############get the peptide with mods | |
for i in range(0, len(spectra2.peptide)): | |
spectra2.loc[i, ('pep_with_tag')] = spectra2.peptide[i][2:spectra2.org_length[i]-2] | |
spectra2.loc[:, 'after_tag_len'] = [len(i) for i in spectra2.pep_with_tag] | |
#[m.start() for m in re.finditer("\\[[^A-Z]*\\]", spectra2.pep_with_tag[1145])] | |
modslen =[] | |
for i in range(0, len(spectra2.pep_with_tag)): | |
modslen.append([m.start() for m in re.finditer("\\[[^A-Z]*\\]", spectra2.pep_with_tag[i])]) | |
mods_no =[] | |
for i in range(0, len(spectra2.pep_with_tag)): | |
mods_no.append(re.findall("\\[[^A-Z]*\\]", spectra2.pep_with_tag[i])) | |
offset_mod =[] | |
for mods in mods_no: | |
mods_temp = [len(i) for i in mods] | |
offset_mod.append(mods_temp) | |
res['mod'] = 0 | |
n =0 | |
d =0 | |
test =[] | |
for i, j, k, l, rank in zip(modslen, offset_mod, spectra2.pep_with_tag, spectra2.start_pos, spectra2.protein_rank): | |
if len(i) !=0 and i[0] >= 1: | |
n += 1 | |
#print i, j, k, l, rank | |
ptms =[] | |
for s in range(len(i)): | |
if len(i) == 1: | |
ptms.append([i[s]]) | |
break | |
elif len(i) == 2: | |
ptms.append([i[s], i[s+1]-j[s]]) | |
break | |
elif len(i) == 3: | |
ptms.append([i[s], i[s+1]-j[s], i[s+2]-np.sum(j[s:s+2])]) | |
break | |
elif len(i) == 4: | |
ptms.append([i[s], i[s+1]-j[s], i[s+2]-np.sum(j[s:s+2]), i[s+3]-np.sum(j[s:s+3])]) | |
break | |
elif len(i) == 5: | |
ptms.append([i[s], i[s+1]-j[s], i[s+2]-np.sum(j[s:s+2]), i[s+3]-np.sum(j[s:s+3]), i[s+4]-np.sum(j[s:s+4])]) | |
break | |
elif len(i) == 6: | |
ptms.append([i[s], i[s+1]-j[s], i[s+2]-np.sum(j[s:s+2]), i[s+3]-np.sum(j[s:s+3]), i[s+4]-np.sum(j[s:s+4]), i[s+5]-np.sum(j[s:s+5])]) | |
break | |
elif len(i) == 7: | |
ptms.append([i[s], i[s+1]-j[s], i[s+2]-np.sum(j[s:s+2]), i[s+3]-np.sum(j[s:s+3]), i[s+4]-np.sum(j[s:s+4]), i[s+5]-np.sum(j[s:s+5]), i[s+6]-np.sum(j[s:s+6])]) | |
break | |
ptms1 = list(chain.from_iterable(ptms)) | |
ptms2 = np.array(ptms1)+l-1 | |
print ptms2 | |
test.append(ptms2.tolist()) | |
index_mod=res[res.postion.isin(ptms2)] | |
res.loc[index_mod.index[index_mod.protein_rank == rank], ('mod')] = res.loc[index_mod.index[index_mod.protein_rank == rank], ('mod')]+1 | |
#print np.array(ptms)[1:]+l-1 | |
else: | |
d+=1 | |
test.append(0) | |
spectra2['mod_pos'] = test ## add in the PTM positions into the main df | |
## merge the res with protein names | |
res2=res.merge(maxDT, on='protein_rank') | |
########## get the mutated pos if present | |
res2['mut_pos'] = 0 | |
mut_pos =[] | |
for i in range(0, len(res2.protein_name)): | |
q=re.findall(".+\\(([0-9]+)\\).+", res2.protein_name[i]) | |
mut_pos.append(''.join(q)) | |
new_list = [element or '0' for element in mut_pos] | |
res2.loc[:, ('mut_pos')]= new_list | |
res2.loc[:, ('mut_pos')] = res2['mut_pos'].astype(int) | |
# | |
########## remove all zero occurence positions to reduce data | |
db1=res2.loc[res2['total'] !=0] | |
#### select only identified flu proteins | |
flu_mut=db1[db1.protein_name.str.contains('>HA-|>NP-|>NA-|>MA1-|>PB2-')] | |
flu_mut.reset_index(inplace =True, drop =True) | |
flu_mut.loc[:, ('flu_mut_prot')]=[flu_mut.protein_name[i][0:4] for i in range(0, len(flu_mut.protein_name))] | |
flu_mut.loc[:, ('protein_name_pos')]= [re.sub(".+\\(([0-9]+)\\).+","\\1", flu_mut.protein_name[i]) for i in range(0, len(flu_mut.protein_name))] | |
#### select the others proteins starting with >gi | |
non_mut = db1[db1.protein_name.str.contains('>gi')] | |
### TAKE ONLY PEPTIDE THAT HAVE SAME POSITION AS THE MUTATED POSITION | |
flu_mut1=flu_mut.loc[flu_mut['postion'] == flu_mut.mut_pos] | |
flu_mut1.sort_values(['total'], ascending=False, inplace=True) ## sort the values | |
flu_mut1.reset_index(inplace=True, drop =True) | |
flu_mut1.loc[:, ('pep_rank')] = range(1, len(flu_mut1)+1) ## make a new col called rank | |
flu_mut2 = flu_mut1.drop(['templen'], axis=1) | |
flu_mut2.reset_index(inplace=True, drop=True) | |
flu_mut2.loc[:, ('flu_mut_prot')]=[flu_mut2.protein_name[i][0:4] for i in range(0, len(flu_mut2.protein_name))] | |
flu_mut2.loc[:, ('protein_name_pos')]= [re.sub(".+\\(([0-9]+)\\).+","\\1", flu_mut2.protein_name[i]) for i in range(0, len(flu_mut2.protein_name))] | |
## get unmutated occurence for each mutation to calculate the ratios | |
for i in range(0, len(flu_mut2)): | |
flu_mut2.loc[i, ('unmutated_occurence')]=np.sum(flu_mut.total[(flu_mut.flu_mut_prot == flu_mut2.flu_mut_prot[i]) & (flu_mut.protein_name_pos != flu_mut2.protein_name_pos[i]) & (flu_mut.postion == flu_mut2.postion[i])]) | |
###take the non_mut wild type files and assign protein IDs | |
non_mut.loc[:, ('id1')] = 'NaN' | |
non_mut.reset_index(drop=True, inplace=True) | |
non_mut.loc[non_mut.protein_name.str.contains('hemagglutinin'), ('id1')] = ">HA" | |
non_mut.loc[non_mut.protein_name.str.contains('nucleocapsid protein'), ('id1')] = ">NP" | |
non_mut.loc[non_mut.protein_name.str.contains('neuraminidase'), ('id1')] = ">NA" | |
non_mut.loc[non_mut.protein_name.str.contains('polymerase PB2'), ('id1')] = ">PB2" | |
non_mut.loc[non_mut.protein_name.str.contains('matrix protein 1'), ('id1')] = ">MA1" | |
## remove others that are not necessary | |
non_mut1=non_mut.loc[non_mut['id1'] != 'NaN'] | |
non_mut1.reset_index(drop=True, inplace=True) | |
############################################################################## to match other wild type flu peptides | |
for i in range(0, len(flu_mut2.protein_name)): | |
for j in range(0, len(non_mut1.id1)): | |
if non_mut1.id1[j] == flu_mut2.flu_mut_prot[i] and non_mut1.postion[j] == flu_mut2.postion[i]: | |
flu_mut2.loc[i, ('unmutated_occurence')]=non_mut1.loc[j, ('total')] + flu_mut2.loc[i, ('unmutated_occurence')] | |
flu_mut2.loc[:, ('Ratio_mut_vs_unmut')]=flu_mut2.total/flu_mut2.unmutated_occurence | |
flu_mut2.loc[:, ('Description')] = [re.sub(' .*$','', i) for i in flu_mut2.protein_name] | |
flu_mut2.to_csv('AFLSA208A_PAN_TRY_DB.csv', index=False) | |
### match to the database | |
flu_mut = pd.read_csv('AFLSA096AA_PAN_TRYP_May17_db.csv', header=0) | |
flu_mut['Description'] = [re.sub('>', '', flu_mut.Description[i]) for i in range(0, len(flu_mut.Description))] | |
## read in the database file | |
db = pd.read_csv('/home/labcomp/Desktop/databasefinale.csv', header=0) | |
db.drop(['Unnamed: 0'], axis=1, inplace=True) | |
db.columns=[re.sub('[.]', '_', db.columns.values[i]) for i in range(0, len(db.columns.values))] | |
######################################################################## | |
for i in range(0, len(flu_mut.Description)): | |
db.loc[(db['Description'] == flu_mut.Description[i]), ('T_1_M')] = flu_mut.total[i] | |
db.loc[(db['Description'] == flu_mut.Description[i]), ('T_1_I')] = flu_mut.unmutated_occurence[i] | |
db.loc[(db['Description'] == flu_mut.Description[i]), ('T_1_Rank')] = flu_mut.pep_rank[i] | |
db.loc[(db['Description'] == flu_mut.Description[i]), ('Ratio_mut_vs_unmut')] = flu_mut.Ratio_mut_vs_unmut[i] | |
################### boolean indexing | |
res2.loc[(res2['protein_rank'] == 4)] | |
n=0 | |
for lines in spectra2.tag_rem_pep: | |
a=re.findall('KTSSWP*', lines) | |
n += 1 | |
if len(a) > 0: | |
break | |
res2['mut_pos'] = 0 | |
mut_pos =[] | |
for i in range(0, len(res2.protein_name)): | |
q=re.findall(".+\\(([0-9]+)\\).+", res2.protein_name[i]) | |
mut_pos.append(''.join(q)) | |
new_list = [element or '0' for element in mut_pos] | |
res2.loc['mut_pos']= new_list | |
res2['mut_pos'] = res2['mut_pos'].astype(int) | |
res3=res2[res2['postion'] == res2.mut_pos] | |
res3.sort_values(['total'], ascending=False, inplace=True) | |
for i in maxDT.protein_name: | |
print re.findall('hemagglutinin', i) | |
res4=res3.loc[(res3['total']) !=0] | |
[1, 14, 24] | |
1 9 14 24 9 | |
C[+71.037]DNTC[+71.037]M[+15.995]ESVK | |
12345678910 | |
CDNTCMESVK | |
for i, j, k in zip(modslen, offset_mod, spectra2.pep_with_tag): | |
if len(i) > 1 and i[0] >= 1: | |
print i, j, k | |
for nos in range(0, len(i)-1): | |
print i[nos], | |
print i[nos+1] - j[nos] | |
elif len(i) >2: | |
#print i[nos] | |
#print i[nos+1] - j[nos] | |
print i[nos+2] - np.sum(j[nos:nos+1]) | |
print i[nos+2] - j[nos+nos+1] | |
for i, j, k in zip(modslen, offset_mod, spectra2.pep_with_tag): | |
if len(i) > 1 and i[0] >= 1: | |
i = [6, 18, 27, 42] | |
j = [8, 8, 9, 9] | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment