Created
June 30, 2017 17:38
-
-
Save adiamb/9c6b821b3694226ce9afc7ec2ae5392a to your computer and use it in GitHub Desktop.
Frameshift python script june 30 2017
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
###PYTHON############# | |
#Purpose : Analyze frameshifts in the MS-tryspin/chemotrypsin digests | |
#Author : Aditya Ambati | |
#date : June7 2017 | |
#update : version 1 | |
#####import libraries | |
from itertools import chain | |
import pandas as pd | |
import numpy as np | |
%matplotlib | |
import re | |
from Bio import Entrez | |
from Bio import SeqIO | |
from Bio.Seq import Seq | |
from Bio.SeqRecord import SeqRecord | |
from Bio.Alphabet import IUPAC | |
################################################ define helper function ############################################ | |
################ this function will take a list with filenames and return a MS output spectra file with trimmed columns and calculated peptide length | |
def peponly(pepseq): | |
return(re.sub(pattern='[^A-Z]*',repl='', string=pepseq)) | |
def spectra_processing(spectra): | |
dfs =[] | |
for i in range(0, len(spectra)): | |
temp = spectra[i]#pd.read_csv(spectra[i], header = 0, sep = ";") | |
temp1 = temp[['Protein\nRank','Peptide\n< ProteinMetrics Confidential >','Modification Type(s)', 'Starting\nposition', 'Protein Name', '|Log Prob|', 'file_id']] | |
## rename the columns | |
temp1.columns = ['protein_rank', 'peptide', 'modtype', 'start_pos', 'protein_name', 'log_prob', 'file_id'] | |
temp2=temp1[~temp1['protein_name'].str.contains('Reverse')] | |
temp2.reset_index(drop=True, inplace=True) | |
#temp2.loc[:, ('filename')] = spectra[i] | |
temp2.loc[:, ('pep_wt_tags')]=[re.findall('\..*\.', i) for i in temp2.peptide] ## take only characters between the first and the last dot character | |
temp2.loc[:, ('tag_rem_pep')] = [peponly(i) for i in temp2.pep_wt_tags] ### assign to a new column the raw peptide without tags | |
temp2.loc[:, ('tag_rm_len')] = [len(i) for i in temp2.tag_rem_pep] ## get the raw peptide length | |
temp2.loc[:, ('templen')] = temp2.tag_rm_len + temp2.start_pos ### add the startingpos + untagged peptide length | |
#temp2.loc[:, 'org_length'] = [len(i) for i in temp2.peptide] | |
dfs.append(temp2) | |
return(dfs) | |
##### read in the 1 shift file | |
file_list =[] ## make a file list | |
for line in open('vacc_names_batches_matched', 'r'): | |
t1=line.replace('\n', '') | |
filename=t1.split('\t')[0] | |
vaccine = t1.split('\t')[1] | |
batch = t1.split('\t')[2] | |
out = [filename, vaccine, batch] | |
file_list.append(out) | |
filelist=pd.DataFrame(file_list, columns=['file', 'vacc', 'batch']) | |
filelist.loc[:, ('file_id')]=filelist.vacc +'_TRY_'+ filelist.batch ## add in an extra colum sepcifying fileID | |
shiftdfs=[pd.read_excel(i, sheetname='Spectra') for i in filelist.file] ## read in the spectra files in a loop. | |
shiftdf1 = shiftdfs[:] | |
for i in range(0, len(shiftdfs)): #### add in the file and lot ID into a column | |
shiftdfs[i].loc[:, ('file_id')] = filelist.file_id[i] | |
############# call for spectra processing fucntion | |
shiftdf_proc = spectra_processing(shiftdf1) | |
all_1shift=pd.concat(shiftdf_proc, ignore_index=True) | |
onlyflu=all_1shift[all_1shift.protein_name.str.contains('Influenza')] | |
onlyflu.reset_index(drop=True, inplace=True) | |
onlyflu.loc[:, ('genbankID')]=[re.sub('>', '', i.split(' ')[0]) for i in onlyflu.protein_name] | |
################## read in the reference files needed to match the frameshifts | |
reference_ids = pd.read_csv('/media/labcomp/HDD/otherprojects/MS_Analysis/Frameshift/Frameshift_list_april18.csv', header=0) | |
reference_ids.columns=[re.sub(' ', '', i) for i in reference_ids.columns.values] | |
reference_gb = reference_ids.genbankID.tolist() | |
reference_pbid = reference_ids.proteinID.tolist() | |
reference_fasta = [] | |
for i in reference_pbid: | |
Entrez.email = "ambati@stanford.edu" | |
handle = Entrez.efetch(db="nucleotide", rettype="fasta", retmode="text", id=i) ## make a request to the ncbi | |
seq_record = SeqIO.read(handle, "fasta") | |
handle.close() | |
reference_fasta.append(seq_record) | |
#onlyflu=shiftdf1[shiftdf1.protein_name.str.contains('Influenza')] | |
#onlyflu.reset_index(drop=True, inplace=True) | |
#onlyflu.loc[:, ('genbankID')]=[re.sub('>', '', i.split(' ')[0]) for i in onlyflu.protein_name] | |
#onlyflumerge=pd.merge(onlyflu, reference_ids, left_on='genbankID', right_on='genbankID', how='inner') | |
##################### make a df of proteinID and the sequences asscoiated with them | |
db_dict ={} | |
for i in range(0, len(reference_fasta)): | |
seqid = reference_fasta[i].id | |
fastaseq = reference_fasta[i].seq.tostring() | |
db_dict[seqid] = fastaseq | |
ref_df_prots=pd.DataFrame.from_dict(db_dict.items()) | |
ref_df_prots.columns = ['PID', 'SEQ'] | |
########################## take only unique peptide to be tested against reference sequences | |
unique_peps=onlyflu.groupby(['tag_rem_pep', 'genbankID']).size() | |
uniques_df=pd.DataFrame(unique_peps) | |
uniques_df.reset_index(inplace=True) | |
### get a list of index where the peptide dont match | |
checkindexes=[] | |
for i in range(0, len(uniques_df.tag_rem_pep)): | |
testx=ref_df_prots[ref_df_prots.SEQ.str.contains(uniques_df.tag_rem_pep[i])] | |
if testx.shape[0] == 0: | |
checkindexes.append(i) | |
unmatchedpep=uniques_df.ix[checkindexes] | |
unmatchedpep.reset_index(drop=True, inplace=True) | |
#### merge the proteinids and genbank ids | |
unmatchedpeps_df=pd.merge(unmatchedpep, reference_ids, left_on='genbankID', right_on='genbankID', how='inner') | |
unmatchedpeps_df.loc[:, ('blast_out')] = 'NaN' | |
checkfastas=[] | |
for i in range(0, len(unmatchedpeps_df.tag_rem_pep)): | |
tempseq=SeqRecord(Seq(unmatchedpeps_df.tag_rem_pep[i], IUPAC.protein), id = unmatchedpeps_df.proteinID[i], name='index'+str(unmatchedpeps_df.strainname[i])) | |
checkfastas.append(tempseq) | |
#SeqIO.write(tempseq, '/home/labcomp/Desktop/Vaccine_MS_May16/1_shiftMS/Blast_files/' + tempseq.id.replace(' ', '')+'_'+str(i)+'.fasta', 'fasta') | |
fasta_x=tempseq.id.replace(' ', '')+'_'+str(i)+'.fasta' | |
#SeqIO.write(checkfastas, 'check.fasta', 'fasta') | |
#blast_hit =[] | |
#with open('fasta_list', 'r') as f: | |
#for line in f: | |
#fasta_x = line.strip() | |
#BLASTN_command_C = NcbiblastpCommandline(query='/home/labcomp/Desktop/Vaccine_MS_May16/1_shiftMS/Blast_files/ADE29085.1_467.fasta',db='/home/labcomp/frames_DB',num_threads=8,outfmt='"6 qseqid sseqid evalue bitscore length pident nident qframe sframe gaps sstart send sseq sstrand qstart qend qseq"',max_target_seqs=1) | |
#stdout_C, stderr_C = BLASTN_command_C() | |
BLASTN_command_C = NcbiblastpCommandline(query=fasta_x,db='/home/labcomp/frames_DB',num_threads=8,outfmt='"6 qseqid sseqid evalue bitscore length pident nident qframe sframe gaps sstart send sseq sstrand qstart qend qseq"',max_target_seqs=1) | |
stdout_C, stderr_C = BLASTN_command_C() | |
print stdout_C | |
#blast_hit.append(stdout_C) | |
unmatchedpeps_df.loc[i, ('blast_out')] = stdout_C | |
pot_frames=unmatchedpeps_df[unmatchedpeps_df.blast_out == ''] | |
pot_frames=pot_frames.sort_values([0], ascending=False) | |
pot_frames.reset_index(drop=True, inplace=True) | |
frameshifted_1=pot_frames.ix[:11, ('tag_rem_pep')] # 3 or more | |
### reach into main df and pull these frames out | |
frames_1=onlyflu[onlyflu.tag_rem_pep.isin(frameshifted_1)] | |
frames_1[['tag_rem_pep', 'file_id']].groupby(['tag_rem_pep', 'file_id']).size() | |
###################################################################### Debugging ##################################################################################################### | |
########################### prepare for local blast make a fasta list | |
from Bio.Seq import Seq | |
from Bio.SeqRecord import SeqRecord | |
from Bio.Alphabet import IUPAC | |
blastlist = [line.replace('\n', '') for line in open('blast_list', 'r')] | |
blastlist.remove('my_blast.xml') ### remove a test file from the list | |
n = 0 | |
for files in blastlist: | |
for record in NCBIXML.parse(open(files)): | |
if record.alignments: | |
#print "QUERY: %s" % record.query[:60] | |
for align in record.alignments: | |
for hsp in align.hsps: | |
if hsp.expect > 0.5: | |
n += 1 | |
print "MATCH: %s " % align.title[:60] | |
print hsp.expect | |
print hsp.query, hsp.identities | |
checkpepsindex =[] | |
n = 0 | |
for files in blastlist: | |
for record in NCBIXML.parse(open(files)): | |
if len(record.alignments) == 0: | |
n += 1 | |
print record.query, files | |
checkpepsindex.append(files.split('.')[1]) | |
#print "QUERY: %s" % record.query[:60] | |
for align in record.alignments: | |
for hsp in align.hsps: | |
print hsp.expect | |
n += 1 | |
if hsp.expect < 1e-10: | |
n += 1 | |
print "MATCH: %s " % align.title[:60] | |
print hsp.expect | |
pep_frame_index=[int(i.split('_')[1]) for i in checkpepsindex] | |
frameshifts=unmatchedpeps_df.ix[pep_frame_index] | |
frameshifts.sort_values([0], ascending=False) | |
SeqIO.write(reference_fasta, '/home/labcomp/Desktop/frames_reference.fasta', 'fasta') | |
filename='fasta_list' | |
filelines=`cat $filename` | |
for line in $filelines ; do | |
#blastp -out $line.xml -outfmt 5 -query $line -db "nr" -entrez_query "Homo Sapiens[ORGN]" -remote | |
blastp -out $line.xml -outfmt 5 -query $line -db ~/frames_DB -num_threads 8 | |
#echo $line | |
done | |
selectedhu.loc[selectedhu.tag_rem_pep.str.contains('ALTLLEDWCKGMDMDPR'), ('filename')].value_counts() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment