Skip to content

Instantly share code, notes, and snippets.

@ahlusar1989
Last active September 3, 2015 02:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ahlusar1989/de2381c1fb77e96ae601 to your computer and use it in GitHub Desktop.
Save ahlusar1989/de2381c1fb77e96ae601 to your computer and use it in GitHub Desktop.
from glob import iglob
import csv
from collections import *
import os, sys
import csv
import itertools
from csv import reader
import xml.etree.cElementTree as ElementTree
from xml.etree.ElementTree import XMLParser
def just_xml_data(xml_string):
    """Parse an XML string and return its contents as a flat dictionary.

    Every ``&`` character is removed before parsing so that bare ampersands
    do not make the parser fail.  NOTE: this is lossy -- entity references
    such as ``&amp;`` are reduced to the literal text ``amp;``.

    Args:
        xml_string: Text of the XML document.  ``None`` (which
            ``csv.DictReader`` yields for fields missing from a short row)
            is treated like an empty document instead of raising
            ``AttributeError``.

    Returns:
        ``dict(flatten_dict(root))`` for the parsed root element, or ``None``
        when the text cannot be parsed (the parse error is printed to
        stdout).
    """
    if xml_string is None:
        # Route a missing value through the normal parse-error path below so
        # it is reported and skipped like any other unparsable input.
        xml_string = ''
    sanitized = xml_string.replace('&', '')
    try:
        # Parse the text into an Element tree rooted at `root`.
        root = ElementTree.XML(sanitized)
    except ElementTree.ParseError as err:
        row, column = err.position
        print("error on row", row, "column", column, ":", err)
        return None
    # NOTE(review): flatten_dict is not defined in the visible part of this
    # file; presumably it yields (key, value) pairs for the element tree,
    # since its result is passed straight to dict() -- confirm.
    return dict(flatten_dict(root))
# Union of every column name seen across the kept rows: the original CSV
# columns plus the fields extracted from each row's XML.  It becomes the
# header row of the output file.
headers = set()
# One merged dictionary per input row whose XML parsed successfully.
rows = []
# The csv module expects files to be opened with newline="" so that line
# breaks embedded inside quoted fields reach the CSV parser untouched.
with open("output_5.csv", "r", newline="") as input_file:
    for row in csv.DictReader(input_file):
        # Convert this row's XML field into a flat dictionary.
        xml_data = just_xml_data(row["CLIENT_RESP_DATA"])
        if xml_data is None:
            # The XML could not be parsed, so the row is left out of the
            # output entirely.
            continue
        # Merge the XML data into the row.  NOTE that nothing prevents a
        # field name from the XML from matching (and overriding) a column
        # that is already present in the row.
        row.update(xml_data)
        # Make sure the headers include every field of this row.
        headers.update(row.keys())
        # Keep each row as its own dictionary rather than merging it into
        # one larger data set; every row then carries only its own fields.
        rows.append(row)
# Now we write out our CSV.  This is a _separate_ pass from loading the data,
# since it relies on global state (`headers` and `rows`) that is only
# complete once every input row has been read.
# newline="" is what the csv module requires for output files; without it,
# platforms that translate "\n" emit an extra blank line after every row.
with open("output_5_test.csv", "w", newline="") as output_file:
    wr = csv.writer(output_file)
    # `headers` is a set, whose iteration order can differ between runs;
    # sorting gives the output a stable column order.  key=str keeps the sort
    # from failing on the None key that csv.DictReader uses for surplus
    # fields in over-long rows.
    csv_headers = sorted(headers, key=str)
    # First, the header row -- since this is the global list, it includes the
    # columns added through the XML parsing of each input row.
    wr.writerow(csv_headers)
    # Now each data row.  Every row holds only a subset of the fields, so we
    # _always_ iterate over `csv_headers` to keep the values aligned with the
    # header row; a field that wasn't in the row yields None, which the csv
    # writer emits as an empty cell.
    for row in rows:
        wr.writerow([row.get(field) for field in csv_headers])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment