Created
October 14, 2017 01:26
-
-
Save adiamb/971433fe5ddfa4d37ad43b969945ddbc to your computer and use it in GitHub Desktop.
update protein counts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import time | |
import subprocess | |
from subprocess import PIPE | |
import re | |
from itertools import chain | |
import pandas as pd | |
import numpy as np | |
import copy | |
## List files to process; each one names one result file per line.
f_names = ['ar_chy_list', 'ar_try_list', 'pan_chy_list', 'pan_try_list']

## Layout of a result file, as relied on by the parsing below.
HEADER_LINES = 13    # leading lines of every result file that are never parsed
PROTEIN_FIELD = 18   # 0-based index of the protein-name field in a ';'-split row

## Rows whose protein name matches this are left out of every count.
SKIP_NAME_RE = re.compile(r'(>Reverse|Common contaminant protein)')
## Rows whose protein name matches this are keyed by the text before the first '-'.
GROUPED_NAME_RE = re.compile(r'(>PB2-|>HA-|>NA-|>MA1-|>NP-)')


def protein_key(line):
    """Return the key one result row is counted under, or None to skip the row.

    The row is split on ';' and the protein-name field is inspected:

    * rows with too few fields (e.g. a blank trailing line) are skipped
      instead of raising IndexError;
    * names matching SKIP_NAME_RE are skipped;
    * names matching GROUPED_NAME_RE are shortened to the text before the
      first '-', so all such entries share one key (e.g. '>HA');
    * any other name is used unchanged.
    """
    fields = line.split(';')
    if len(fields) <= PROTEIN_FIELD:
        return None
    name = fields[PROTEIN_FIELD]
    if SKIP_NAME_RE.search(name):
        return None
    if GROUPED_NAME_RE.search(name):
        return name.split('-')[0]
    return name


def read_result_paths(list_file):
    """Return the non-empty lines of list_file with line endings removed.

    Both '\\r' and '\\n' are stripped so that list files saved with CRLF line
    endings yield usable paths; empty lines are ignored.
    """
    with open(list_file) as handle:
        return [path for path in (row.strip('\r\n') for row in handle) if path]


def count_proteins(result_path, echo=False):
    """Return {protein key: number of rows} for one result file.

    The first HEADER_LINES lines are ignored. Every remaining row that
    protein_key() accepts is counted, including the first one. When echo is
    true each counted key is printed, one per row.
    """
    counts = {}
    with open(result_path) as ms_in:
        for line_number, line in enumerate(ms_in, 1):
            if line_number <= HEADER_LINES:
                continue
            key = protein_key(line)
            if key is None:
                continue
            if echo:
                print(key)  # single-argument form runs on Python 2 and 3
            counts[key] = counts.get(key, 0) + 1
    return counts


def build_counts(list_files, echo=False):
    """Read every result file named in list_files and count proteins per file.

    Returns a tuple (filelist, temp_dic, main_counts):

    * filelist    -- {result path: 1} for every path that was read;
    * temp_dic    -- {protein key: 0} for every key found in any file;
    * main_counts -- {result path: {protein key: count}}, where each inner
      dict has an entry (possibly 0) for every key of temp_dic.

    A path listed more than once has its rows added to the same inner dict.
    """
    filelist = {}
    temp_dic = {}
    found = {}
    for list_file in list_files:
        for path in read_result_paths(list_file):
            filelist[path] = 1
            per_file = found.setdefault(path, {})
            for key, hits in count_proteins(path, echo).items():
                per_file[key] = per_file.get(key, 0) + hits
                temp_dic.setdefault(key, 0)

    # Start every file from the full zeroed template so that all files share
    # the same set of keys, then overlay what was actually observed.
    main_counts = {}
    for path, per_file in found.items():
        row = dict(temp_dic)
        row.update(per_file)
        main_counts[path] = row
    return filelist, temp_dic, main_counts


# Guarded so the helpers above can be imported without touching the disk;
# running the file as a script still populates the same module-level names.
if __name__ == '__main__':
    filelist, temp_dic, main_counts = build_counts(f_names, echo=True)
    protein_dic = copy.deepcopy(temp_dic)  ### zeroed template, kept under its original name
def tabulate_counts(counts_by_file):
    """Pivot per-file protein counts into one ';'-joined row per protein.

    counts_by_file maps a result-file path to {protein key: count}.

    Returns (file_headers, final_dic):

    * file_headers -- the paths joined with ';', in iteration order;
    * final_dic    -- {protein key: counts joined with ';'}, one count per
      path in the same order as file_headers.

    Every value is a string even when there is only one file, and a protein
    missing from some file's dict contributes 0 for that file, so the columns
    always line up with the header.
    """
    paths = list(counts_by_file)

    # Union of protein keys, in first-seen order.
    proteins = []
    known = set()
    for path in paths:
        for protein in counts_by_file[path]:
            if protein not in known:
                known.add(protein)
                proteins.append(protein)

    final_dic = {}
    for protein in proteins:
        cells = [str(counts_by_file[path].get(protein, 0)) for path in paths]
        final_dic[protein] = ';'.join(cells)
    return ';'.join(paths), final_dic


def write_counts_csv(out_path, file_headers, final_dic):
    """Write the pivoted table to out_path as ';'-separated text.

    The first line is 'PROTEIN;' followed by file_headers; each following
    line is a protein key, ';', and that protein's joined counts.
    """
    with open(out_path, 'w') as f_out:
        f_out.write('PROTEIN' + ';' + file_headers + '\n')
        for key, value in final_dic.items():
            f_out.write(key + ';' + value + '\n')


# Guarded so the helpers above can be imported without writing a file;
# running the file as a script still produces the same output file.
if __name__ == '__main__':
    file_headers, final_dic = tabulate_counts(main_counts)  ### make file headers
    write_counts_csv('COUNTS_PROTEIN_CONTENT_Oct13.csv', file_headers, final_dic)
## Scratch checks from interactive exploration, kept as small helpers.
## The original statements tested `'Reverse' or ... in t1` (always true,
## because the non-empty string 'Reverse' is truthy), read a variable `t1`
## that this file never assigns, and threw their results away.


def is_contaminant(name):
    """Return True if name contains 'Reverse' or 'Common contaminant protein'."""
    return 'Reverse' in name or 'Common contaminant protein' in name


def peptide_core(line):
    """Return the dot-delimited part of the third ';'-separated field of line.

    The match is greedy: it runs from the first '.' to the last '.' of the
    field, both included. An empty string is returned when the field holds
    fewer than two dots.
    """
    return ''.join(re.findall(r'\..*\.', line.split(';')[2]))


def letters_only(text):
    """Return text with every character other than A-Z removed."""
    # presumably used to strip modification annotations from a peptide
    # string -- TODO confirm against the intended input
    return re.sub(pattern='[^A-Z]*', repl='', string=text)
## Column index and name of each ';'-separated field in a result row:
##  0  Query
##  1  ProteinRank
##  2  Peptide
##  3  GlycansNHFAGNa
##  4  Modification
##  5  Observed
##  6  z
##  7  Observed
##  8  Calc.mass
##  9  Off-by-xerror
## 10  Mass error(ppm)
## 11  Startingposition
## 12  Cleavage
## 13  Score
## 14  Delta
## 15  DeltaMod
## 16  |Log Prob|
## 17  # of uniquepeptides
## 18  Protein Name
## 19  ProteinDB number
## 20  Comment
## 21  Scan #
## 22  Scan Time
## Column names of a result row, in field order: headers[i] names the i-th
## ';'-separated field, so index 18 ("Protein Name") is the field the
## counting code in this file reads.
## NOTE(review): "Observed" appears at both index 5 and index 7; the two
## columns presumably differ in the source export -- confirm before using
## these names as unique keys (e.g. as DataFrame columns).
headers = [
    "Query",
    "ProteinRank",
    "Peptide",
    "GlycansNHFAGNa",
    "Modification",
    "Observed",
    "z",
    "Observed",
    "Calc.mass",
    "Off-by-xerror",
    "Mass error(ppm)",
    "Startingposition",
    "Cleavage",
    "Score",
    "Delta",
    "DeltaMod",
    "|Log Prob|",
    "# of uniquepeptides",
    "Protein Name",
    "ProteinDB number",
    "Comment",
    "Scan #",
    "Scan Time",
]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment