Skip to content

Instantly share code, notes, and snippets.

@poltys
Created September 15, 2020 09:22
Show Gist options
  • Save poltys/bf9efe99e23df43dca2bfa85b6527af3 to your computer and use it in GitHub Desktop.
Save poltys/bf9efe99e23df43dca2bfa85b6527af3 to your computer and use it in GitHub Desktop.
For each given folder, read all XML files and map each file to the list of element tags it contains, collected in a DataFrame.
import pandas as pd
import os
import glob
import io
import xml.etree.ElementTree as ET
def read_path(path):
    """Map every XML file in *path* to the list of element tags it contains.

    Each ``*.xml`` file directly inside ``path`` is parsed and the local
    (namespace-stripped) tag name of every element is collected in document
    order. The resulting table is written to ``<path>-mapping.xlsx`` (a
    sibling of the folder, not a file inside it) and also returned.

    Args:
        path: Folder to scan. Sub-folders are not searched.

    Returns:
        A DataFrame with one row per XML file. The first column is named
        after ``path`` and holds the file path; ``list_column`` holds the
        list of tag names found in that file.

    Raises:
        xml.etree.ElementTree.ParseError: If a file is not well-formed XML.
    """
    # Sorted so the row order does not depend on the directory listing order.
    files = sorted(glob.glob(os.path.join(path, "*.xml")))

    result = []
    for filename in files:
        root = ET.parse(filename).getroot()
        # Namespaced tags look like "{uri}name"; plain tags contain no "}".
        # Taking the last piece handles both forms, whereas indexing [1]
        # raised IndexError for documents that declare no namespace.
        result.append([elem.tag.split("}", 1)[-1] for elem in root.iter()])

    df = pd.DataFrame.from_records(
        list(zip(files, result)),
        columns=[path, "list_column"],
    )
    df.to_excel(path + "-mapping.xlsx")
    return df
# Root folder whose sub-folders will be scanned; fill this in before running.
path = ""  # input your path
if not path:
    # The original placeholder (`path = # input your path`) was a syntax
    # error; fail with an explicit message instead.
    raise SystemExit("Set `path` to the folder you want to scan before running.")

# Collect every sub-folder below `path` (the root folder itself is not included).
folders = []
for root_dir, dir_names, _file_names in os.walk(path):
    folders.extend(os.path.join(root_dir, name) for name in dir_names)

# List everything that will be processed before doing any work.
for folder in folders:
    print(folder)

# Build one tag-mapping spreadsheet per sub-folder.
for folder in folders:
    read_path(folder)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment