adiamb/COUNTS_PROTEIN_CONTENT_Oct13.py

## COUNTS_PROTEIN_CONTENT_Oct13.py
import sys
import time
import subprocess
from subprocess import PIPE
import re
from itertools import chain
import pandas as pd
import numpy as np
import copy
## names of the list files; further down each one is read line by line and
## every line is opened as a result file
f_names = [
	'ar_chy_list',
	'ar_try_list',
	'pan_chy_list',
	'pan_try_list',
]

## make a template dictionary for all possible proteins found:
## every protein key seen in any result file maps to 0, so the dictionary can
## be copied as the zeroed starting counts of each file
temp_dic = {}
filelist = {}  ### result-file paths seen; has to exist before it is written to below
for list_name in f_names:
	with open(list_name) as f_:
		for item in f_:
			result_path = item.strip('\r\n')  ### drop '\r' too, so CRLF list files still open
			if not result_path:
				continue  ### blank line in the list file, nothing to open
			filelist[result_path] = 1
			with open(result_path) as ms_in:
				for n, line in enumerate(ms_in, 1):
					if n <= 13:
						continue  ### the first 13 lines are not treated as data rows
					parse_line = line.split(';')
					if len(parse_line) <= 18:
						continue  ### row too short to carry field 18
					protein_name = str(parse_line[18])
					if re.search(r'(>Reverse|Common contaminant protein)', protein_name):
						continue  ### remove contaminants
					if re.search(r'(>PB2-|>HA-|>NA-|>MA1-|>NP-)', protein_name):
						### these names are keyed by the part before the first '-'
						makekey = protein_name.split('-')[0]
					else:
						makekey = protein_name
					if makekey not in temp_dic:
						temp_dic[makekey] = 0


protein_dic = copy.deepcopy(temp_dic)  ### make a copy of the template (every protein key -> 0)
## make a dic with files:
## main_counts[result file][protein key] -> number of kept rows carrying that key
main_counts = {}
for list_name in f_names:
	with open(list_name) as f_:
		for item in f_:
			parse_item = item.strip('\r\n')
			if not parse_item:
				continue  ### blank line in the list file, nothing to open
			with open(parse_item) as ms_in:
				for n, line in enumerate(ms_in, 1):
					if n <= 13:
						continue  ### the first 13 lines are not treated as data rows
					parse_line = line.split(';')
					if len(parse_line) <= 18:
						continue  ### row too short to carry field 18
					protein_name = str(parse_line[18])
					if re.search(r'(>Reverse|Common contaminant protein)', protein_name):
						continue  ### remove contaminants
					if re.search(r'(>PB2-|>HA-|>NA-|>MA1-|>NP-)', protein_name):
						### these names are keyed by the part before the first '-'
						makekey = protein_name.split('-')[0]
					else:
						makekey = protein_name
					print(makekey)

					### the per-file dict is created on the first kept row of a file;
					### that row has to be counted as well, otherwise every file
					### ends up one count short
					if parse_item not in main_counts:
						main_counts[parse_item] = copy.deepcopy(protein_dic)
					file_counts = main_counts[parse_item]
					file_counts[makekey] = int(file_counts.get(makekey, 0)) + 1

## make file headers; the same file order is reused for the count columns so
## every count lands under its own file name
file_order = list(main_counts.keys())
file_headers = ';'.join(file_order)

## final_dic[protein key] -> 'count;count;...' with one count per file.
## The value is always a string (also when there is a single file), and a
## protein that a file never recorded is written as 0 so columns stay aligned.
all_proteins = set()
for per_file_counts in main_counts.values():
	all_proteins.update(per_file_counts)

final_dic = {}
for protein in all_proteins:
	final_dic[protein] = ';'.join(
		str(main_counts[file_name].get(protein, 0)) for file_name in file_order
	)


## write the table: one header row, then one 'protein;count;count;...' row per
## protein, in sorted protein order so the output is reproducible
with open('COUNTS_PROTEIN_CONTENT_Oct13.csv', 'w') as f_out:
	f_out.write('PROTEIN'+';'+file_headers+'\n')
	for key in sorted(final_dic):
		### str(): a stored value can be a plain int rather than a joined string
		f_out.write(key+';'+str(final_dic[key])+'\n')


## The scratch lines that stood here referred to an undefined name (t1) and
## tested `'Reverse' or ... in t1`, which is always true. They are kept as
## small helpers that take their input explicitly.

def is_decoy_or_contaminant(protein_name):
	"""Return True if protein_name contains 'Reverse' or 'Common contaminant protein'."""
	### each substring needs its own `in` test
	return 'Reverse' in protein_name or 'Common contaminant protein' in protein_name


def peptide_core(peptide):
	"""Return the joined matches of r'\\..*\\.' in peptide.

	For a single-line string this is the text from the first '.' to the
	last '.', dots included, or '' when there are fewer than two dots.
	"""
	return ''.join(re.findall(r'\..*\.', peptide))


def strip_non_uppercase(text):
	"""Return text with every character outside A-Z removed."""
	return re.sub(pattern='[^A-Z]*', repl='', string=text)


## column index -> column name of a ';'-separated result-file row
# 0 Query
# 1 ProteinRank
# 2 Peptide
# 3 GlycansNHFAGNa
# 4 Modification
# 5 Observed
# 6 z
# 7 Observed
# 8 Calc.mass
# 9 Off-by-xerror
# 10 Mass error(ppm)
# 11 Startingposition
# 12 Cleavage
# 13 Score
# 14 Delta
# 15 DeltaMod
# 16 |Log Prob|
# 17 # of uniquepeptides
# 18 Protein Name
# 19 ProteinDB number
# 20 Comment
# 21 Scan #
# 22 Scan Time

## column names of a result-file row, in column order (index in the trailing comment)
headers = [
	"Query",                # 0
	"ProteinRank",          # 1
	"Peptide",              # 2
	"GlycansNHFAGNa",       # 3
	"Modification",         # 4
	"Observed",             # 5
	"z",                    # 6
	"Observed",             # 7 (same label as column 5)
	"Calc.mass",            # 8
	"Off-by-xerror",        # 9
	"Mass error(ppm)",      # 10
	"Startingposition",     # 11
	"Cleavage",             # 12
	"Score",                # 13
	"Delta",                # 14
	"DeltaMod",             # 15
	"|Log Prob|",           # 16
	"# of uniquepeptides",  # 17
	"Protein Name",         # 18
	"ProteinDB number",     # 19
	"Comment",              # 20
	"Scan #",               # 21
	"Scan Time",            # 22
]
	## NOTE: this part of the file lost its indentation structure; the statements
	## below are restored to top level so they parse again
import sys
import time
import subprocess
from subprocess import PIPE
import re
from itertools import chain
import pandas as pd
import numpy as np
import copy
## import filenames as a list; further down each one is read line by line and
## every line is opened as a result file
f_names = ['ar_chy_list', 'ar_try_list', 'pan_chy_list', 'pan_try_list']

	## make a template dictionary for all possible proteins found
## (block indentation restored; the '\|' in the patterns was a literal pipe and
## is turned back into regex alternation)
## every protein key seen in any result file maps to 0, so the dictionary can
## be copied as the zeroed starting counts of each file
temp_dic = {}
filelist = {}  ### result-file paths seen; has to exist before it is written to below
for list_name in f_names:
	with open(list_name) as f_:
		for item in f_:
			result_path = item.strip('\r\n')  ### drop '\r' too, so CRLF list files still open
			if not result_path:
				continue  ### blank line in the list file, nothing to open
			filelist[result_path] = 1
			with open(result_path) as ms_in:
				for n, line in enumerate(ms_in, 1):
					if n <= 13:
						continue  ### the first 13 lines are not treated as data rows
					parse_line = line.split(';')
					if len(parse_line) <= 18:
						continue  ### row too short to carry field 18
					protein_name = str(parse_line[18])
					if re.search(r'(>Reverse|Common contaminant protein)', protein_name):
						continue  ### remove contaminants
					if re.search(r'(>PB2-|>HA-|>NA-|>MA1-|>NP-)', protein_name):
						### these names are keyed by the part before the first '-'
						makekey = protein_name.split('-')[0]
					else:
						makekey = protein_name
					if makekey not in temp_dic:
						temp_dic[makekey] = 0


	## per-file counting pass (block indentation restored; the '\|' in the
	## patterns was a literal pipe and is turned back into regex alternation)
protein_dic = copy.deepcopy(temp_dic)  ### make a copy of the template (every protein key -> 0)
## make a dic with files:
## main_counts[result file][protein key] -> number of kept rows carrying that key
main_counts = {}
for list_name in f_names:
	with open(list_name) as f_:
		for item in f_:
			parse_item = item.strip('\r\n')
			if not parse_item:
				continue  ### blank line in the list file, nothing to open
			with open(parse_item) as ms_in:
				for n, line in enumerate(ms_in, 1):
					if n <= 13:
						continue  ### the first 13 lines are not treated as data rows
					parse_line = line.split(';')
					if len(parse_line) <= 18:
						continue  ### row too short to carry field 18
					protein_name = str(parse_line[18])
					if re.search(r'(>Reverse|Common contaminant protein)', protein_name):
						continue  ### remove contaminants
					if re.search(r'(>PB2-|>HA-|>NA-|>MA1-|>NP-)', protein_name):
						### these names are keyed by the part before the first '-'
						makekey = protein_name.split('-')[0]
					else:
						makekey = protein_name
					print(makekey)

					### the per-file dict is created on the first kept row of a file;
					### that row has to be counted as well, otherwise every file
					### ends up one count short
					if parse_item not in main_counts:
						main_counts[parse_item] = copy.deepcopy(protein_dic)
					file_counts = main_counts[parse_item]
					file_counts[makekey] = int(file_counts.get(makekey, 0)) + 1

	## make file headers (block indentation restored)
## the same file order is reused for the count columns so every count lands
## under its own file name
file_order = list(main_counts.keys())
file_headers = ';'.join(file_order)

## final_dic[protein key] -> 'count;count;...' with one count per file.
## The value is always a string (also when there is a single file), and a
## protein that a file never recorded is written as 0 so columns stay aligned.
all_proteins = set()
for per_file_counts in main_counts.values():
	all_proteins.update(per_file_counts)

final_dic = {}
for protein in all_proteins:
	final_dic[protein] = ';'.join(
		str(main_counts[file_name].get(protein, 0)) for file_name in file_order
	)


	## write the table (block indentation restored): one header row, then one
	## 'protein;count;count;...' row per protein, in sorted protein order
with open('COUNTS_PROTEIN_CONTENT_Oct13.csv', 'w') as f_out:
	f_out.write('PROTEIN'+';'+file_headers+'\n')
	for key in sorted(final_dic):
		### str(): a stored value can be a plain int rather than a joined string
		f_out.write(key+';'+str(final_dic[key])+'\n')



	## The scratch lines that stood here referred to an undefined name (t1) and
	## tested `'Reverse' or ... in t1`, which is always true. They are kept as
	## small helpers that take their input explicitly.

def is_decoy_or_contaminant(protein_name):
	"""Return True if protein_name contains 'Reverse' or 'Common contaminant protein'."""
	### each substring needs its own `in` test
	return 'Reverse' in protein_name or 'Common contaminant protein' in protein_name


def peptide_core(peptide):
	"""Return the joined matches of r'\\..*\\.' in peptide.

	For a single-line string this is the text from the first '.' to the
	last '.', dots included, or '' when there are fewer than two dots.
	"""
	return ''.join(re.findall(r'\..*\.', peptide))


def strip_non_uppercase(text):
	"""Return text with every character outside A-Z removed."""
	return re.sub(pattern='[^A-Z]*', repl='', string=text)



	## column index -> column name of a ';'-separated result-file row
	# 0 Query
	# 1 ProteinRank
	# 2 Peptide
	# 3 GlycansNHFAGNa
	# 4 Modification
	# 5 Observed
	# 6 z
	# 7 Observed
	# 8 Calc.mass
	# 9 Off-by-xerror
	# 10 Mass error(ppm)
	# 11 Startingposition
	# 12 Cleavage
	# 13 Score
	# 14 Delta
	# 15 DeltaMod
	# 16 |Log Prob|
	# 17 # of uniquepeptides
	# 18 Protein Name
	# 19 ProteinDB number
	# 20 Comment
	# 21 Scan #
	# 22 Scan Time

	## column names of a result-file row, in column order (index in the trailing
	## comment); the assignment is back at top level and the stray backslashes
	## around the pipes of column 16 are removed
headers = [
	"Query",                # 0
	"ProteinRank",          # 1
	"Peptide",              # 2
	"GlycansNHFAGNa",       # 3
	"Modification",         # 4
	"Observed",             # 5
	"z",                    # 6
	"Observed",             # 7 (same label as column 5)
	"Calc.mass",            # 8
	"Off-by-xerror",        # 9
	"Mass error(ppm)",      # 10
	"Startingposition",     # 11
	"Cleavage",             # 12
	"Score",                # 13
	"Delta",                # 14
	"DeltaMod",             # 15
	"|Log Prob|",           # 16
	"# of uniquepeptides",  # 17
	"Protein Name",         # 18
	"ProteinDB number",     # 19
	"Comment",              # 20
	"Scan #",               # 21
	"Scan Time",            # 22
]