Skip to content

Instantly share code, notes, and snippets.

@rickythefox
Created December 2, 2021 15:47
Show Gist options
  • Save rickythefox/9fb2966878c852ce91729f960d48e1eb to your computer and use it in GitHub Desktop.
Save rickythefox/9fb2966878c852ce91729f960d48e1eb to your computer and use it in GitHub Desktop.
HSA XML to pandas DataFrame
from lxml import etree
import pandas as pd
def get_hsa_attribute_value(attribute_name, child_element):
    """Yield ``(column_name, text)`` pairs for one value element of an attribute.

    The local (namespace-free) tag name of ``child_element`` selects the shape:

    * ``Address``: one pair per child, keyed ``<attribute_name>_<index>``
      where the index is the child's zero-based position.
    * ``TimeSpan``, ``Coordinate``, ``BusinessClassificationType``: one pair
      per child, keyed ``<attribute_name>_<child local tag name>``.
    * anything else (including ``S``): a single pair keyed by
      ``attribute_name`` carrying the element's own text.
    """
    kind = etree.QName(child_element).localname

    if kind == 'Address':
        # Address lines are only distinguishable by their position.
        pairs = [
            (f'{attribute_name}_{position}', line.text)
            for position, line in enumerate(child_element.iterchildren())
        ]
    elif kind in ('TimeSpan', 'Coordinate', 'BusinessClassificationType'):
        # Structured values: each sub-element becomes its own column.
        pairs = [
            (f'{attribute_name}_{etree.QName(part).localname}', part.text)
            for part in child_element.iterchildren()
        ]
    else:
        # 'S' and every unrecognised tag are treated as a plain text value.
        pairs = [(attribute_name, child_element.text)]

    yield from pairs
def get_hsa_attribute(el):
    """Yield ``(column_name, text)`` pairs for one child of an HSA object.

    An element whose local tag name is not ``Attribute`` contributes a pair
    of its local tag name and its text. Every child of ``el`` is then
    expanded through ``get_hsa_attribute_value`` using ``el``'s ``name``
    XML attribute as the column-name base.

    NOTE(review): the child loop is taken to run for every element, not only
    for non-``Attribute`` ones -- confirm against the intended nesting.
    """
    tag = etree.QName(el).localname
    is_named_attribute = tag == 'Attribute'

    if not is_named_attribute:
        yield tag, el.text

    for value_element in el.iterchildren():
        for pair in get_hsa_attribute_value(el.get('name'), value_element):
            yield pair
def get_hsa_object_dict(hsa_object_element):
    """Flatten one HSA object element into a ``{column_name: text}`` dict.

    Each child element is expanded with ``get_hsa_attribute``; when the same
    column name is produced more than once, the last value wins.
    """
    return {
        column: text
        for child in hsa_object_element.iterchildren()
        for column, text in get_hsa_attribute(child)
    }
def parse_hsa_file(file_name):
    """Parse an HSA information-list XML file into a pandas DataFrame.

    Every child of the ``HsaObjects`` element (namespace
    ``urn:riv:hsa:HsaInformationList:2``) becomes one row, flattened by
    ``get_hsa_object_dict``. Columns that are entirely empty are dropped and
    the remaining columns are sorted by name. A few summary statistics of
    the resulting frame are printed to stdout.

    Args:
        file_name: Path of the XML file to read.

    Returns:
        The resulting ``pandas.DataFrame``.

    Raises:
        ValueError: If the document has no ``HsaObjects`` element.
    """
    # Open in binary mode so the XML parser picks the encoding from the
    # document itself; text mode would decode with the platform's default
    # encoding first, which can corrupt non-ASCII characters.
    with open(file_name, 'rb') as f:
        xml = etree.parse(f)

    hsa_objects = xml.find('{urn:riv:hsa:HsaInformationList:2}HsaObjects')
    if hsa_objects is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(f'No HsaObjects element found in {file_name!r}')

    hsa_dict_list = [
        get_hsa_object_dict(hsa_object)
        for hsa_object in hsa_objects.iterchildren()
    ]

    df = pd.DataFrame(hsa_dict_list).dropna(axis=1, how='all')
    df = df.reindex(sorted(df.columns), axis=1)

    # Quick diagnostics about the parsed frame.
    print(df.shape)
    print(df.describe())
    print(df.dtypes)
    print(df.memory_usage(index=True).sum())
    print(df.isna().sum().sum())
    return df
# df.to_excel('publicunits-1004-0900.xlsx')
if __name__ == '__main__':
    import sys

    # An optional first command-line argument overrides the default input
    # file; with no arguments the script behaves exactly as before.
    input_file = sys.argv[1] if len(sys.argv) > 1 else 'publicunits-1004-0900.xml'
    parse_hsa_file(input_file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment