Skip to content

Instantly share code, notes, and snippets.

@porterjamesj
Last active August 29, 2015 13:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save porterjamesj/9946927 to your computer and use it in GitHub Desktop.
Save porterjamesj/9946927 to your computer and use it in GitHub Desktop.
Horrible regex abuse I used to get the analysis IDs corresponding to pipeline outputs.
import os
import re
import sys
import xml.etree.ElementTree as ET

# Script prologue: the first command-line argument is the path of the XML
# document to load (sys.argv[1] raises IndexError when no argument is given).
# `root` is the root element of the parsed document; presumably it is what
# gets handed to munge() and pair() as their `root` argument -- the calls are
# not part of this file.
xml_file = sys.argv[1]
tree = ET.parse(xml_file)
root = tree.getroot()
def munge(root):
    """Collect the third dot-separated field of every listed filename.

    Visits each ``Result`` element below *root*, iterates over the children
    of its ``files`` element, reads each child's ``filename`` text and keeps
    the piece at index 2 after splitting that text on ``"."``.

    Returns the pieces as a list, in document order.
    """
    return [
        entry.find("filename").text.split(".")[2]
        for result in root.iter("Result")
        for entry in result.find("files")
    ]
def pair(root, files):
    """Match output names to the analysis ids of ``Result`` elements.

    For every name in *files* a regular expression is assembled from, in
    order: the name with everything from ``XX_`` onward removed, ``.*``,
    the name's final digit, and -- when the name contains a run of six
    characters drawn from ``ACGT`` -- an underscore plus that run.  The
    pattern is searched against the third dot-separated field of the first
    filename listed in each ``Result``; the first ``Result`` that matches
    supplies the analysis id for that name.

    Args:
        root: element whose ``Result`` descendants each have an
            ``analysis_id`` child and a ``files`` child.
        files: iterable of output-name strings.

    Returns:
        A tuple ``(ffiles, ids)`` of parallel lists: the names that were
        matched and the corresponding ``analysis_id`` texts.  Processing
        stops at the first name for which no ``Result`` matches, and the
        pairs collected up to that point are returned.
        NOTE(review): the stop-at-first-miss behaviour is kept from the
        original; confirm it is intended rather than "skip and continue".

    Raises:
        AttributeError: if a name does not end in a digit (the trailing
            digit search finds nothing).
    """
    ffiles = []
    ids = []
    for f in files:
        print(f)
        num = re.search("([0-9])$", f).groups()[0]

        # The six-letter ACGT run is optional: test the match explicitly
        # instead of hiding every possible error behind a bare ``except``.
        barcode = re.search("([ACGT]{6,6})", f)
        bases = ("_" + barcode.groups()[0]) if barcode is not None else ""

        newre = re.sub("XX_.*", "", f) + ".*" + num + bases
        print(newre)
        pattern = re.compile(newre)

        # Stays None when there are no Result elements at all, so the
        # diagnostic print below can never hit an unbound name.
        munged = None
        for result in root.iter("Result"):
            aid = result.find("analysis_id").text
            first_file = result.find("files")[0]
            munged = first_file.find("filename").text.split(".")[2]
            if pattern.search(munged) is not None:
                print((f, aid))
                ffiles.append(f)
                ids.append(aid)
                break
        else:
            # No Result matched this name: report it (with the last
            # candidate examined) and give up on the remaining names.
            print("no matching analysis found for " + f)
            print("%s %s %s" % (munged, newre, f))
            return ffiles, ids
    return ffiles, ids
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment