Last active
October 8, 2022 06:49
-
-
Save avrilcoghlan/8687938 to your computer and use it in GitHub Desktop.
Python script for merging optical map xml files (for different scaffolds) into one large xml file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import os | |
from xml.etree import ElementTree as ET | |
import AvrilFileUtils | |
class Error (Exception): pass | |
#====================================================================# | |
# define a function to merge optical map xml files for different scaffolds. | |
def merge_optical_map_xml_files(input_file_list, output_xml_file): | |
"""Merge the optical map xml files in a directory into one output xml file""" | |
# get a list of all the input files: | |
filenames = [] | |
fileObj = open(input_file_list, "r") | |
for line in fileObj: | |
line = line.rstrip() | |
temp = line.split() | |
filename = temp[0] | |
filenames.append(filename) | |
fileObj.close() | |
# combine all the xml files: | |
# copied code from https://gist.github.com/cgoldberg/4320815: | |
seen_experiment = False | |
cases = [] | |
for filename in filenames: | |
print("reading filename %s" % filename) | |
tree = ET.parse(filename) | |
root = tree.getroot() # 'aligned_maps_document' | |
for child in root: | |
if child.tag == 'experiment': | |
if seen_experiment == True: | |
root.remove(child) | |
else: | |
# keep one 'experiment' child. Note: I think this is necessary to write out the data with ElementTree | |
seen_experiment = True | |
cases.append(root.getchildren()) | |
# write out the new tree: | |
new_root = ET.Element('aligned_maps_document') | |
for case in cases: | |
new_root.extend(case) | |
new_tree = ET.ElementTree(new_root) | |
# see comments at http://stackoverflow.com/questions/12612648/python-element-tree-is-removing-the-xml-declaration | |
new_tree.write(output_xml_file, encoding="iso-8859-1", xml_declaration=True, method="xml") | |
# open the output xml file, so we can append a new line character (for some reason the | |
# ElementTree object does not seem to end the file with a new line character): | |
outputfileObj = open(output_xml_file, "a") | |
outputfileObj.write('\n') | |
outputfileObj.close() | |
return | |
#====================================================================# | |
# define a function to rewrite the header of 'output_xml_file', so that | |
# it can be read by MapSolver | |
def rewrite_optical_map_xml_header(temp_output_xml_file, output_xml_file): | |
"""rewrite the header of the optical map xml file, so that it can be read by MapSolver""" | |
# open the output file: | |
outputfileObj = open(output_xml_file, "w") | |
# write the header in the output file: | |
outputfileObj.write('<?xml version="1.0" encoding="iso-8859-1"?>\n') | |
outputfileObj.write('<!-- @version:0.1 -->\n\n') | |
outputfileObj.write('<!-- this is a comment -->\n') | |
outputfileObj.write('<aligned_maps_document version="0.5">\n\n') | |
outputfileObj.write('<experiment>\n') | |
# read in temp_output_xml_file: | |
found_start = False | |
fileObj = open(temp_output_xml_file, "r") | |
for line in fileObj: | |
if '<creator>gentig2</creator>' in line: | |
found_start = True | |
if found_start == True: | |
outputfileObj.write(line) | |
fileObj.close() | |
outputfileObj.close() | |
#====================================================================# | |
def main(): | |
# check the command-line arguments: | |
if len(sys.argv) != 3 or os.path.exists(sys.argv[1]) == False: | |
print("Usage: %s input_file_list output_xml_file" % sys.argv[0]) | |
sys.exit(1) | |
input_file_list = sys.argv[1] | |
output_xml_file = sys.argv[2] | |
# merge the optical map xml files for different scaffolds: | |
temp_output_xml_file = AvrilFileUtils.make_filename(os.getcwd()) | |
merge_optical_map_xml_files(input_file_list, temp_output_xml_file) | |
# rewrite the header of the 'output_xml_file' file so that it can be read by MapSolver: | |
rewrite_optical_map_xml_header(temp_output_xml_file, output_xml_file) | |
# delete temporary files: | |
os.unlink(temp_output_xml_file) | |
#====================================================================# | |
if __name__=="__main__": | |
main() | |
#====================================================================# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment