Skip to content

Instantly share code, notes, and snippets.

@goldengrape
Created November 11, 2017 09:36
Show Gist options
  • Save goldengrape/f5e4cf9f7bf1d9d101a9808aaed98373 to your computer and use it in GitHub Desktop.
Save goldengrape/f5e4cf9f7bf1d9d101a9808aaed98373 to your computer and use it in GitHub Desktop.
# coding: utf-8
# # 读取并整理txt
# # 读取txt
# In[15]:
import re
import os
# import pandas as pd
# from pandas import DataFrame,Series
# from Bio.SeqUtils import seq1
# ## 指定文件路径和文件名
# In[16]:
if __name__ == "__main__":
    # Script-mode settings: input directory, input file name, and the
    # directory that receives the generated JSON.
    input_path, fname, output_path = "SeqCheck", "test.txt", "SeqCleaned"
# ## 打开文件, 读入文件
#
# In[17]:
def read_content_fromTXT(input_path, fname):
    """Read a text file and return its whole content as a single string.

    Args:
        input_path: Directory that contains the file.
        fname: Name of the file inside ``input_path``.

    Returns:
        The complete text of the file.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    filename = os.path.join(input_path, fname)
    # The context manager closes the handle even if read() raises.
    with open(filename, 'r') as txtfile:
        return txtfile.read()
# In[40]:
if __name__ == "__main__":
    # Load the raw text of the configured input file.
    sContent = read_content_fromTXT(input_path=input_path, fname=fname)
# # 转换成JSON
# ## 增加标签
# 使每一个条目都统一成为
# ```
# <标签名> 内容
# ```
#
# * 在SEQUENCE LISTING前增加标签000
# * 在400标签之后增加序列标签
#
# In[41]:
def add_000_to_title(sContent):
    """Return ``sContent`` with a ``<000>`` tag placed in front of it.

    The leading tag gives the untagged title at the start of the text the
    same ``<tag> content`` shape as every other entry.
    """
    return "".join(("<000>", sContent))
# In[58]:
if __name__ == "__main__":
    # Read the input file and put the <000> tag in front of its title.
    sContent = add_000_to_title(read_content_fromTXT(input_path, fname))
# In[60]:
def add_tag_after_tag(sContent, add_tag, target_tag):
    """Insert ``add_tag`` at the start of the line after each ``target_tag`` line.

    Every match of ``target_tag`` followed by the rest of its line (up to and
    including the newline) gets ``add_tag`` appended right after that newline.

    Args:
        sContent: Text to process.
        add_tag: Text to insert, e.g. ``"<seq>"``. Inserted literally.
        target_tag: Tag to look for, e.g. ``"<400>"``. It is used as a
            regular-expression fragment, exactly as given.

    Returns:
        The text with ``add_tag`` inserted after every matching line.
    """
    pattern = target_tag + ".*\n"
    # One pass with a callable replacement: the matched line is reused
    # verbatim instead of being re-interpreted as a pattern, so lines with
    # regex metacharacters are handled and repeated identical lines are
    # tagged exactly once each.
    return re.sub(pattern, lambda match: match.group(0) + add_tag, sContent)
# In[62]:
if __name__ == "__main__":
    # Read the input file and put the <000> tag in front of its title.
    sContent = add_000_to_title(read_content_fromTXT(input_path, fname))
    # Start the line that follows every <400> line with a <seq> tag.
    sContent = add_tag_after_tag(sContent, "<seq>", "<400>")
# ## 替换所有的换行
# 需要在增加了seq标签以后进行
# In[82]:
def sub_all_return(sContent):
    """Put each tagged entry on its own line, separated by ``",\\n"``.

    All existing newlines are removed, then ``",\\n"`` is inserted before
    every ``<``. Must run after the ``<seq>`` tags have been added, because
    the line structure is destroyed here.

    Args:
        sContent: Tagged text.

    Returns:
        The text with one entry per line and no leading separator.
    """
    flattened = sContent.replace("\n", "")
    separated = flattened.replace("<", ",\n<")
    # Text that starts with a tag now begins with a stray separator; drop
    # only that. Text that does not start with "<" is left intact instead of
    # losing its first two characters.
    if separated.startswith(",\n"):
        separated = separated[2:]
    return separated
# In[84]:
if __name__ == "__main__":
    # Read the input file and put the <000> tag in front of its title.
    sContent = add_000_to_title(read_content_fromTXT(input_path, fname))
    # Tag the line after each <400> line with <seq>, then collapse the text
    # to one entry per line.
    sContent = sub_all_return(add_tag_after_tag(sContent, "<seq>", "<400>"))
# ## 以210分界, 作为对象加大括号
# In[95]:
def add_to_obj(sContent):
    """Wrap the text in braces, starting a new object at every ``<210>``.

    Each ``<210>`` is preceded by ``"},\\n{\\n"`` to close the previous object
    and open the next one; the whole text is then enclosed in ``{`` ... ``}``.
    """
    split_at_210 = sContent.replace('<210>', '},\n{\n<210>')
    return "".join(('{\n', split_at_210, '\n}'))
# In[96]:
if __name__ == "__main__":
    # Read the input file and put the <000> tag in front of its title.
    sContent = add_000_to_title(read_content_fromTXT(input_path, fname))
    # Tag the line after each <400> line with <seq>, then collapse the text
    # to one entry per line.
    sContent = sub_all_return(add_tag_after_tag(sContent, "<seq>", "<400>"))
    # Split into brace-delimited objects at every <210>.
    sContent = add_to_obj(sContent)
    print(sContent)
# ## 加入方括号成为键值数组
#
# * 在首次出现2xx的标签时, 加入左方括号[
# * 在文件末尾加入右方括号]
# In[100]:
def add_square(sContent):
    """Return ``sContent`` enclosed in square brackets.

    The whole text is wrapped, so the sequence of objects becomes one array.
    """
    return "".join(("[", sContent, "]"))
# In[101]:
if __name__ == "__main__":
    # Read the input file and put the <000> tag in front of its title.
    sContent = add_000_to_title(read_content_fromTXT(input_path, fname))
    # Tag the line after each <400> line with <seq>, then collapse the text
    # to one entry per line.
    sContent = sub_all_return(add_tag_after_tag(sContent, "<seq>", "<400>"))
    # Split into brace-delimited objects at every <210> and wrap them all
    # in square brackets.
    sContent = add_square(add_to_obj(sContent))
# ## 重整所有的标签
# In[131]:
def sub_all_tag(sContent):
    """Turn every ``<tag> value`` entry into a quoted ``"<tag>" : " value"`` pair.

    Expects the bracketed, brace-delimited text produced by the earlier
    steps (one entry per line, entries separated by ``",\\n"``).

    Args:
        sContent: Structured but still unquoted text.

    Returns:
        The text with keys and values wrapped in double quotes.
    """
    # Escape characters that would otherwise end or corrupt a JSON string.
    # This must happen before any structural quotes are added.
    sContent = sContent.replace('\\', '\\\\').replace('"', '\\"')
    # Open the key quote before each tag ...
    sContent = sContent.replace('<', '"<')
    # ... close it after the tag and open the value quote.
    sContent = sContent.replace('>', '>" : " ')
    # Close the value quote at the end of every entry.
    sContent = sContent.replace(',\n', '",\n')
    # The previous step also hit the "},"-lines between objects: remove the
    # trailing comma of the last entry and the stray quote after "}".
    sContent = sContent.replace(',\n}",', '\n},')
    # Close the value quote of the very last entry.
    sContent = sContent.replace('\n}]', '"\n}]')
    return sContent
# In[132]:
if __name__ == "__main__":
    # Read the input file and put the <000> tag in front of its title.
    sContent = add_000_to_title(read_content_fromTXT(input_path, fname))
    # Tag the line after each <400> line with <seq>, then collapse the text
    # to one entry per line.
    sContent = sub_all_return(add_tag_after_tag(sContent, "<seq>", "<400>"))
    # Split into brace-delimited objects at every <210> and wrap them all
    # in square brackets.
    sContent = add_square(add_to_obj(sContent))
    # Quote every tag and value.
    sContent = sub_all_tag(sContent)
    print(sContent)
# # 保存成JSON
# In[124]:
def write_content_toJSON(output_path, fname, content):
    """Write ``content`` to ``<output_path>/<fname without extension>.json``.

    Args:
        output_path: Directory that receives the file.
        fname: Source file name; its extension is replaced by ``.json``.
        content: Text to write.

    Returns:
        The path of the file that was written.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    stem, _ = os.path.splitext(fname)
    output_filename = os.path.join(output_path, stem + '.json')
    # The context manager flushes and closes the file even if write() raises.
    with open(output_filename, 'w') as jsonfile:
        jsonfile.write(content)
    return output_filename
# In[125]:
if __name__ == "__main__":
    # Read the input file and put the <000> tag in front of its title.
    sContent = add_000_to_title(read_content_fromTXT(input_path, fname))
    # Tag the line after each <400> line with <seq>, then collapse the text
    # to one entry per line.
    sContent = sub_all_return(add_tag_after_tag(sContent, "<seq>", "<400>"))
    # Split into brace-delimited objects at every <210> and wrap them all
    # in square brackets.
    sContent = add_square(add_to_obj(sContent))
    # Quote every tag and value.
    sContent = sub_all_tag(sContent)
    # Save the result as a .json file in the output directory.
    output_filename = write_content_toJSON(output_path, fname, sContent)
# # 整合封装
# In[126]:
def patentIn_to_json(input_path, fname, output_path):
    """Run the whole text-to-JSON pipeline for one file.

    Reads ``fname`` from ``input_path``, applies every transformation step in
    order, and writes the result as a ``.json`` file into ``output_path``.

    Args:
        input_path: Directory that contains the input file.
        fname: Name of the input file.
        output_path: Directory that receives the ``.json`` file.

    Returns:
        The path of the written file.
    """
    text = read_content_fromTXT(input_path, fname)
    # Put <000> in front of the title, then start the line after each <400>
    # line with <seq>; both must happen while the newlines still exist.
    text = add_tag_after_tag(add_000_to_title(text), "<seq>", "<400>")
    # Remaining single-argument steps, in order: one entry per line, objects
    # split at <210>, array brackets, quoting of tags and values.
    for step in (sub_all_return, add_to_obj, add_square, sub_all_tag):
        text = step(text)
    return write_content_toJSON(output_path, fname, text)
# In[ ]:
# In[ ]:
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment