-
-
Save ahlusar1989/de2381c1fb77e96ae601 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import itertools
import os
import sys
from collections import *
from csv import reader
from glob import iglob
from xml.etree.ElementTree import XMLParser

try:
    import xml.etree.cElementTree as ElementTree
except ImportError:  # cElementTree was removed in Python 3.9
    import xml.etree.ElementTree as ElementTree
def just_xml_data(xml_string):
    """Parse an XML string and return its contents as a dictionary.

    Every ``&`` is removed from the input, the result is parsed with
    ``ElementTree.XML``, and the root element is handed to ``flatten_dict``
    (defined elsewhere); whatever it yields is passed to ``dict``.

    Args:
        xml_string: Text of an XML document, or ``None``.

    Returns:
        ``dict(flatten_dict(root))`` on success, or ``None`` when
        ``xml_string`` is ``None`` or is not well-formed XML. A parse
        failure is reported on standard output.
    """
    if xml_string is None:
        # A short CSV row read through csv.DictReader yields None for the
        # missing field; there is nothing to parse in that case.
        return None

    # Presumably ampersands are dropped so that a bare '&' does not abort the
    # parse -- TODO confirm. Note this also strips the '&' from valid entities
    # such as '&amp;', leaving 'amp;' in the text.
    cleaned = xml_string.replace('&', '')

    try:
        root = ElementTree.XML(cleaned)  # parse the text into an Element tree
    except ElementTree.ParseError as err:
        row, column = err.position
        print("error on row", row, "column", column, ":", err)
        return None

    return dict(flatten_dict(root))
# Every column name seen so far (for fast membership tests) ...
headers = set()
# ... and the same names in the order they were first encountered, so the
# output column order is stable from run to run instead of following set order.
header_order = []
# One dictionary per input row that was kept, each with its own subset of fields.
rows = []

# First pass: load the input and expand the XML column of each row.
# newline="" lets the csv module handle line endings itself, which is required
# for quoted fields that contain embedded newlines.
with open("output_5.csv", "r", newline="") as input_file:
    dr = csv.DictReader(input_file)
    for row in dr:
        # Convert the XML held in this column into a dictionary of fields.
        xml_data = just_xml_data(row["CLIENT_RESP_DATA"])

        # NOTE(review): the original indentation was lost; the trailing
        # `else: pass` is read as belonging to the `xml_data is not None`
        # check, so rows whose XML could not be parsed are skipped -- confirm.
        if xml_data is None:
            continue

        # Merge the XML fields into the row. Nothing prevents an XML field
        # from overriding a CSV column that has the same name.
        row.update(xml_data)

        # Record any column names this row introduces.
        for name in row:
            if name not in headers:
                headers.add(name)
                header_order.append(name)

        # Keep each row as its own entity rather than merging rows together.
        rows.append(row)

# Second pass: write the CSV. This is separate from loading because the full
# list of columns is only known once every input row has been read.
with open("output_5_test.csv", "w", newline="") as output_file:
    csv_headers = list(header_order)
    # DictWriter emits the fields of every row in `csv_headers` order and
    # writes an empty value for any field that a row does not have.
    wr = csv.DictWriter(output_file, fieldnames=csv_headers, restval="")
    wr.writeheader()
    wr.writerows(rows)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment