Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Python script for merging optical map xml files (for different scaffolds) into one large xml file
import sys
import os
from xml.etree import ElementTree as ET
import AvrilFileUtils
class Error (Exception): pass
#====================================================================#
# define a function to merge optical map xml files for different scaffolds.
def merge_optical_map_xml_files(input_file_list, output_xml_file):
"""Merge the optical map xml files in a directory into one output xml file"""
# get a list of all the input files:
filenames = []
fileObj = open(input_file_list, "r")
for line in fileObj:
line = line.rstrip()
temp = line.split()
filename = temp[0]
filenames.append(filename)
fileObj.close()
# combine all the xml files:
# copied code from https://gist.github.com/cgoldberg/4320815:
seen_experiment = False
cases = []
for filename in filenames:
print("reading filename %s" % filename)
tree = ET.parse(filename)
root = tree.getroot() # 'aligned_maps_document'
for child in root:
if child.tag == 'experiment':
if seen_experiment == True:
root.remove(child)
else:
# keep one 'experiment' child. Note: I think this is necessary to write out the data with ElementTree
seen_experiment = True
cases.append(root.getchildren())
# write out the new tree:
new_root = ET.Element('aligned_maps_document')
for case in cases:
new_root.extend(case)
new_tree = ET.ElementTree(new_root)
# see comments at http://stackoverflow.com/questions/12612648/python-element-tree-is-removing-the-xml-declaration
new_tree.write(output_xml_file, encoding="iso-8859-1", xml_declaration=True, method="xml")
# open the output xml file, so we can append a new line character (for some reason the
# ElementTree object does not seem to end the file with a new line character):
outputfileObj = open(output_xml_file, "a")
outputfileObj.write('\n')
outputfileObj.close()
return
#====================================================================#
# define a function to rewrite the header of 'output_xml_file', so that
# it can be read by MapSolver
def rewrite_optical_map_xml_header(temp_output_xml_file, output_xml_file):
"""rewrite the header of the optical map xml file, so that it can be read by MapSolver"""
# open the output file:
outputfileObj = open(output_xml_file, "w")
# write the header in the output file:
outputfileObj.write('<?xml version="1.0" encoding="iso-8859-1"?>\n')
outputfileObj.write('<!-- @version:0.1 -->\n\n')
outputfileObj.write('<!-- this is a comment -->\n')
outputfileObj.write('<aligned_maps_document version="0.5">\n\n')
outputfileObj.write('<experiment>\n')
# read in temp_output_xml_file:
found_start = False
fileObj = open(temp_output_xml_file, "r")
for line in fileObj:
if '<creator>gentig2</creator>' in line:
found_start = True
if found_start == True:
outputfileObj.write(line)
fileObj.close()
outputfileObj.close()
#====================================================================#
def main():
# check the command-line arguments:
if len(sys.argv) != 3 or os.path.exists(sys.argv[1]) == False:
print("Usage: %s input_file_list output_xml_file" % sys.argv[0])
sys.exit(1)
input_file_list = sys.argv[1]
output_xml_file = sys.argv[2]
# merge the optical map xml files for different scaffolds:
temp_output_xml_file = AvrilFileUtils.make_filename(os.getcwd())
merge_optical_map_xml_files(input_file_list, temp_output_xml_file)
# rewrite the header of the 'output_xml_file' file so that it can be read by MapSolver:
rewrite_optical_map_xml_header(temp_output_xml_file, output_xml_file)
# delete temporary files:
os.unlink(temp_output_xml_file)
#====================================================================#
if __name__=="__main__":
main()
#====================================================================#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment