Created
August 5, 2013 10:33
-
-
Save afrendeiro/6154959 to your computer and use it in GitHub Desktop.
Blastx xml output parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/awk -f
# Author: Laurent Manchon (lmanchon@univ-montp2.fr)
# Split a big BLAST output in XML format into several files.
# Type split_xml_blast without parameters to see usage.
# Validate the command line and initialise the splitter state.
#
# Expected invocation: split_xml_blast <nb> <input_filename>
#   <nb>  number of sequences (iterations) written to each output file;
#         must be a strictly positive integer.
#
# Globals set here and used by the other rules:
#   max     sequences per output file
#   cpt     1-based position of the next iteration inside the current file
#   nb      1-based index of the current output file
#   suffix  extension appended to every output file name
#   begin   XML header written at the top of every output file
#   end     XML footer written when an output file is full
BEGIN {
    # ARGC counts the program name, so two user arguments give ARGC == 3.
    # Accept only a positive integer: values such as 0, -5, 2.5 or "abc1"
    # can never satisfy the "cpt == max+1" test used to close a file, which
    # would silently put every iteration into a single, unterminated file.
    if (ARGC == 3 && ARGV[1] ~ /^[0-9]+$/ && ARGV[1] + 0 > 0) {
        max = ARGV[1] + 0
        # Blank the slot so awk does not treat <nb> as an input file name.
        ARGV[1] = ""
    } else {
        # Tell the END rule that we are leaving because of a usage error.
        assert_exit = 1
        usage()
    }

    cpt = nb = 1
    suffix = ".xml"
    end = "</BlastOutput_iterations>\n</BlastOutput>"

    # NOTE(review): the header below is a fixed blastx 2.2.18 template; it is
    # not copied from the input file, so program/db/parameter fields in the
    # output files may not match the original run.
    begin="<?xml version=\"1.0\"?>\n<!DOCTYPE BlastOutput PUBLIC \"-//NCBI//NCBI BlastOutput/EN\" \"http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd\">\n"
    begin=begin "<BlastOutput>\n<BlastOutput_program>blastx</BlastOutput_program>\n<BlastOutput_version>blastx 2.2.18 [Mar-02-2008]</BlastOutput_version>\n"
    begin=begin "<BlastOutput_reference></BlastOutput_reference>\n<BlastOutput_db>/home/data/blastdb/nr</BlastOutput_db>\n<BlastOutput_query-ID>lcl|1_0</BlastOutput_query-ID>\n"
    begin=begin "<BlastOutput_query-def></BlastOutput_query-def>\n<BlastOutput_query-len></BlastOutput_query-len>\n<BlastOutput_param>\n<Parameters>\n<Parameters_matrix>BLOSUM62</Parameters_matrix>\n"
    begin=begin "<Parameters_expect>0.1</Parameters_expect>\n<Parameters_gap-open>11</Parameters_gap-open>\n<Parameters_gap-extend>1</Parameters_gap-extend>\n<Parameters_filter>F</Parameters_filter>\n"
    begin=begin "</Parameters>\n</BlastOutput_param>\n<BlastOutput_iterations>"
}
# Print the usage banner on standard output and terminate with status 1.
# Takes no arguments; "rule" and "blank" are awk-style local variables
# (extra parameters that callers never pass).
function usage(    rule, blank)
{
    rule  = "###################################################################################"
    blank = "# #"

    # Title box.
    print rule
    print "# split_xml_blast -- split big blast output in xml format into severals files. #"
    print "# Performed in Awk v3.1 A.V. Aho, P.J. Weinberger, and B.W. Kernighan #"
    print "# OS supported: *nix, Windows9x/NT #"
    print rule

    # Author / contact box.
    print "# Author: Laurent Manchon #"
    print "# If you have comments or questions, send to the author at: #"
    print "# lmanchon@univ-montp2.fr #"
    print rule

    # Synopsis box.
    print blank
    print "# This program takes a file containing blast result in XML format and split #"
    print "# it into severals small files, as: split_xml_blast <nb> <input_filename> #"
    print "# with <nb>: Number of sequences per output file #"
    print blank
    print rule

    exit 1
}
# Opening tag of an iteration: work out which output file receives it,
# switch copying on, and emit the XML header if this is the first iteration
# of that file.
/<Iteration>/ {
    # Build the output prefix by removing only the last extension of the
    # input name. Cutting at the FIRST "." (as split(FILENAME, ..., ".") did)
    # yields an empty prefix for paths like "./in.xml" and makes
    # "sample.1.xml" and "sample.2.xml" collide on the same output names.
    base = FILENAME
    sub(/\.[^.\/\\]*$/, "", base)

    # "file" is also read by the END rule, so it must stay a global.
    file = base "_"
    output_file = file nb suffix

    # i == 1 means "inside a chunk": following lines are copied to output_file.
    i = 1

    # cpt == 1 marks the first iteration of a fresh output file.
    if (cpt == 1)
        print begin >> output_file
    print $0 >> output_file
    next
}
# While a chunk is in progress (i is 1), copy each input line to the
# current output file. This rule runs before the closing-tag rule below, so
# the "</Iteration>" line itself is written before any footer.
i == 1 {
    print >> output_file
}

# Closing tag of an iteration: count it and, once "max" iterations have been
# written to the current file, append the XML footer, close the file and
# reset the state for the next output file.
/<\/Iteration>/ {
    if (++cpt == max + 1) {
        print end >> output_file
        close(output_file)
        i = 0       # stop copying until the next <Iteration> line
        cpt = 1     # next iteration is the first of a new file
        nb++        # advance to the next output file index
        next
    }
}
# Final report: print how many files were produced and list their names.
END {
    # usage() already printed the help text; just propagate the failure.
    if (assert_exit) exit 1

    # nb is the index of the file currently being (or about to be) written.
    # If the last file was closed exactly full, nb has already been advanced
    # and i was reset to 0, so the real number of files is nb - 1. The same
    # holds when no iteration was seen at all (i never set, nb still 1).
    total = (i == 1) ? nb : nb - 1

    print "\nYour input file",FILENAME,"has just been splitted into",total,"files with",max,"sequences per file:\n"

    # List the generated files directly instead of shelling out to "ls":
    # no shell interpretation of the input name, no unrelated files that
    # merely match the glob, and no dependency on a Unix "ls" command.
    for (k = 1; k <= total; k++)
        print file k suffix
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment