Skip to content

Instantly share code, notes, and snippets.

@heliy
Last active December 31, 2015 00:38
Show Gist options
  • Save heliy/7908332 to your computer and use it in GitHub Desktop.
Save heliy/7908332 to your computer and use it in GitHub Desktop.
查找长序列里剪切识别位点
#coding:UTF-8
#!/usr/bin/python
#使用方法:
#python seqcutloc.py fasta文件 序列 输出文件
#./seqcutloc.py fasta文件 序列 输出文件
#算法是通过将长序列按碱基切开 在拼接回原序列
#eg seq:ATAATCGGCGATTAATCGAAT cut:AAT
#切完后得到子序列 AT CGGCGATT CG 长度为2 8 2
#拼接得到 第一个位点:2+1=3 第二个位点:2+3+8+1=14 第三个位点:2+3+8+3+2+1=19
import string
import sys
# fasta文件类
class fasta(object):
def __init__(self,head,seq):
self.head=head
self.seq=seq
#处理单个fasta文件数据
def sinfasta(sfas,target):
targloc=[]
name=sfas.head.split('|')[3]
sub=sfas.seq.split(target)
if len(sub)<1:
return #没有╮(╯_╰)╭
i=1 #找到的序号
before=1 #之前的位置标记
tarlen=len(target)
for s in sub[:-1]: #最末尾的不用考虑
before+=len(s) #该切点开始位置
targloc.append([name,str(i),str(before),str(before+tarlen)])
before+=tarlen #该切点结束位置
i+=1
return targloc
#将fna文件中的单个fasta分开
def file2singles(filename):
fl=[]
f=open(filename,'r')
fastalist=f.read().split('>')
f.close()
for fas in fastalist:
if len(fas)<10:
continue
head=fas.split("\n")[0]
seq=fas.replace(head,'').replace('\n','')
fa=fasta(head,seq)
fl.append(fa)
return fl
#主处理过程
def process(filename,target,tofile):
w=open(tofile,'w+')
falist=file2singles(filename)
for fa in falist:
result=sinfasta(fa,target)
lines=[string.join(item,'\t')+'\n' for item in result]
w.writelines(lines)
w.write('\n')
w.close()
return
if __name__=="__main__":
[filename,target,tofile]=sys.argv[1:]
process(filename,target,tofile)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment