Skip to content

Instantly share code, notes, and snippets.

@alexstorer
Last active December 31, 2015 04:39
Show Gist options
  • Save alexstorer/7935681 to your computer and use it in GitHub Desktop.
Save alexstorer/7935681 to your computer and use it in GitHub Desktop.
Sloppy undocumented way to maybe get column names from an XML file.
import urllib, os, sys, re, glob, pickle
import xml.etree.ElementTree as ET
import csv
# Each Atom entry is flattened into a dictionary: the keys are the dotted XML tag paths that lead to each leaf element, and the values are the text of those leaves.
def main(argv):
    """Collect every flattened column name found in a set of Atom XML files.

    Every file matching ``argv[1] + '*.xml'`` is parsed, each Atom ``entry``
    element is flattened with ``parseSingle``, and the union of all resulting
    keys is pickled to ``colnames.pkl`` in the current directory.

    Args:
        argv: command-line argument list; ``argv[1]`` is the path prefix that
            is glob-expanded with ``*.xml`` appended.

    Returns:
        ``1`` when the path prefix argument is missing, otherwise ``None``.
    """
    print(argv)
    if len(argv) < 2:
        print("Expected the path prefix of the XML files as the first argument.")
        return 1

    allNames = set()
    for xmlname in glob.glob(argv[1] + '*.xml'):
        print(xmlname)
        # Binary mode lets the XML parser honour the encoding declared in the
        # document; the context manager closes the file on every path.
        with open(xmlname, 'rb') as x:
            try:
                print("---building parse tree...")
                tree = ET.parse(x)
                print("---complete!")
                for entry in tree.findall(".//{http://www.w3.org/2005/Atom}entry"):
                    d = parseSingle(entry)
                    for thiskey in d:
                        if thiskey not in allNames:
                            print("New Key:")
                            print(thiskey)
                    allNames.update(d)
            except Exception as err:
                # Best effort: report the failing file and its cause, then
                # move on to the next file instead of aborting the run.
                print("Problem parsing  %s" % xmlname)
                print(err)
                print("------------------------------------")
                # Seek to the end so tell() reports the file size in bytes.
                x.seek(0, 2)
                print(x.tell())
                print("------------------------------------")

    print(allNames)
    fname = 'colnames.pkl'
    print(fname)
    with open(fname, 'wb') as f:
        pickle.dump(allNames, f)
# parse a single element.
def parseSingle(elt):
    """Flatten one element into a ``{tag path: text}`` dictionary.

    The text-bearing leaves under ``elt`` are gathered with ``getLeaves``.
    Each leaf's tag has every ``{...}`` namespace prefix removed and is used
    as the key; the leaf's text is the value. When several leaves end up with
    the same key, the last one wins.

    Args:
        elt: the element to flatten.

    Returns:
        A new dictionary mapping stripped leaf tags to leaf text.
    """
    d = {}
    for leaf in getLeaves(elt, []):
        # Raw string: "\{" is not a valid string escape, so a plain literal
        # triggers invalid-escape warnings on newer interpreters.
        newtag = re.sub(r"\{.*?\}", "", leaf.tag)
        d[newtag] = leaf.text
    return d
# return list of leaves
def getLeaves(elt, allLeaves):
    """Collect the text-bearing leaf elements at or below ``elt``.

    A leaf is an element without children. Leaves whose text is ``None`` or
    only whitespace are skipped. While descending, every child's ``tag`` is
    rewritten in place to ``<parent tag>.<child tag>``, so each returned leaf
    carries the dotted path from ``elt`` down to itself. Namespace prefixes
    in ``{...}`` form stay in the tag for the caller to strip. Because the
    tags are mutated, walking the same element twice compounds the prefixes.

    Args:
        elt: the element to walk; the tags of its descendants are modified.
        allLeaves: list that the discovered leaves are appended to.

    Returns:
        The same ``allLeaves`` list, extended with the leaves found.
    """
    children = list(elt)
    if not children:
        newtag = re.sub(r"\{.*?\}", "", elt.tag)
        if newtag == 'entry.content.award.awardID.awardContractID':
            # Debug dump kept from the original for this one specific field.
            print("\n\n\n--------------------------------------------------------")
            print('Tag:')
            print(newtag)
            print('Text:')
            print(elt.text)
            print('Children:')
            print(children)
            print("--------------------------------------------------------\n\n\n")
        if elt.text is not None and elt.text.strip():
            allLeaves.append(elt)
    else:
        for child in children:
            # Prefix the child's tag with its parent's (already prefixed) tag
            # so the leaf ends up holding its full dotted path.
            child.tag = elt.tag + "." + child.tag
            allLeaves.extend(getLeaves(child, []))
    return allLeaves
# Script entry point: run main() with the process arguments and hand its
# return value to sys.exit() as the exit status.
if __name__ == '__main__':
    sys.exit(main(sys.argv))
import urllib, os, sys, re, glob, pickle
import xml.etree.ElementTree as ET
import csv
# Each Atom entry is flattened into a dictionary: the keys are the dotted XML tag paths that lead to each leaf element, and the values are the text of those leaves.
def main(argv):
    """Convert Atom XML files to CSV using a pickled collection of column names.

    The column names are unpickled from ``argv[2]``. Every file matching
    ``argv[1] + '*.xml'`` is parsed, and each Atom ``entry`` element becomes
    one row in a CSV file written next to the XML file (same name, ``.csv``
    extension). Columns are the sorted pickled names, so every CSV shares the
    same header. Tags that are not among the pickled names are printed by
    ``parseSingle`` and left out of the CSV.

    Args:
        argv: command-line argument list; ``argv[1]`` is the XML path prefix
            and ``argv[2]`` is the path of the pickled column names.

    Returns:
        ``None``.
    """
    # retrieve pickled list of key entries
    try:
        fname = argv[2]
        # NOTE(security): pickle.load can execute arbitrary code; only point
        # this at a column-name file you produced yourself.
        with open(fname, 'rb') as f:
            print(fname)
            allNames = pickle.load(f)
        print("unpickled!")
        emptyDict = dict.fromkeys(allNames)
        print("We expect to have:  %d columns in our csv." % len(emptyDict))
    except Exception as err:
        print("Problem loading pickled column names.")
        print(err)
        return
    print(argv)

    # A fixed, sorted header keeps the column order identical across files.
    fieldnames = sorted(emptyDict)
    for xmlname in glob.glob(argv[1] + '*.xml'):
        print(xmlname)
        fname = os.path.splitext(xmlname)[0]
        floaded = False
        # Both handles are closed on every path, including parse failures.
        with open(xmlname, 'rb') as x, open(fname + '.csv', 'w') as c:
            # Unknown tags are ignored rather than raising, so one unexpected
            # field no longer costs the whole row.
            dw = csv.DictWriter(c, delimiter=',', fieldnames=fieldnames,
                                extrasaction='ignore')
            try:
                print("---building parse tree...")
                tree = ET.parse(x)
                print("---complete!")
                for entry in tree.findall(".//{http://www.w3.org/2005/Atom}entry"):
                    # Pass a fresh copy of the template so values from one
                    # entry can never leak into the next entry's row.
                    d = parseSingle(entry, dict(emptyDict))
                    print("Trying to write  %d  columns to csv" % len(d))
                    try:
                        if not floaded:
                            # write the header
                            print("Writing the header...")
                            dw.writeheader()
                            floaded = True
                        dw.writerow(d)
                    except Exception as err:
                        # Best effort: report the row failure and keep going.
                        print("Problem writing csv.")
                        print(err)
            except Exception as err:
                print("Problem parsing  %s" % xmlname)
                print(err)
                print("------------------------------------")
                # Seek to the end so tell() reports the file size in bytes.
                x.seek(0, 2)
                print(x.tell())
                print("------------------------------------")
# parse a single element.
def parseSingle(elt, d):
    """Flatten one element into a row dictionary based on the template ``d``.

    The text-bearing leaves under ``elt`` are gathered with ``getLeaves``.
    Each leaf's tag has every ``{...}`` namespace prefix removed and is used
    as the key; the leaf's text is the value. A key that is not already in
    the template is printed before being added.

    The template is copied rather than filled in place. Otherwise a template
    shared between calls would carry values from one element into the rows of
    later elements that lack those fields.

    Args:
        elt: the element to flatten.
        d: template dictionary whose keys are the expected column names; it
            is left unmodified.

    Returns:
        A new dictionary holding the template's entries overlaid with the
        values found under ``elt``.
    """
    row = dict(d)
    for leaf in getLeaves(elt, []):
        # Raw string: "\{" is not a valid string escape, so a plain literal
        # triggers invalid-escape warnings on newer interpreters.
        newtag = re.sub(r"\{.*?\}", "", leaf.tag)
        if newtag not in row:
            # Report a tag the template did not anticipate.
            print(newtag)
        row[newtag] = leaf.text
    return row
# return list of leaves
def getLeaves(elt, allLeaves):
    """Collect the text-bearing leaf elements at or below ``elt``.

    A leaf is an element without children. Leaves whose text is ``None`` or
    only whitespace are skipped. While descending, every child's ``tag`` is
    rewritten in place to ``<parent tag>.<child tag>``, so each returned leaf
    carries the dotted path from ``elt`` down to itself. Because the tags are
    mutated, walking the same element twice compounds the prefixes.

    Args:
        elt: the element to walk; the tags of its descendants are modified.
        allLeaves: list that the discovered leaves are appended to.

    Returns:
        The same ``allLeaves`` list, extended with the leaves found.
    """
    children = list(elt)
    if not children:
        if elt.text is not None and elt.text.strip():
            allLeaves.append(elt)
    else:
        for child in children:
            # Prefix the child's tag with its parent's (already prefixed) tag
            # so the leaf ends up holding its full dotted path.
            child.tag = elt.tag + "." + child.tag
            allLeaves.extend(getLeaves(child, []))
    return allLeaves
# Script entry point: run main() with the process arguments and hand its
# return value to sys.exit() as the exit status.
if __name__ == '__main__':
    sys.exit(main(sys.argv))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment