Skip to content

Instantly share code, notes, and snippets.

@ryansmccoy
Last active October 26, 2019 05:44
Show Gist options
  • Save ryansmccoy/403182c0ba5c82a946553058e15bccdc to your computer and use it in GitHub Desktop.
Save ryansmccoy/403182c0ba5c82a946553058e15bccdc to your computer and use it in GitHub Desktop.
Read in XLS file formatted as XML
from bs4 import BeautifulSoup
def read_excel_xml(filepath):
"""
Read in XLS file formatted as XML
:param filepath:
:return:
"""
try:
file = open(filepath, encoding='utf-8-sig').read()
except:
file = open(filepath, encoding='utf-8').read()
soup = BeautifulSoup(file, 'xml')
workbook = []
for sheet in soup.findAll('Worksheet'):
sheet_as_list = []
for row in sheet.findAll('Row'):
row_as_list = []
for cell in row.findAll('Cell'):
try:
cell_data = cell.Data.text
except:
cell_data = ""
row_as_list.append(cell_data)
sheet_as_list.append(row_as_list)
workbook.append(sheet_as_list)
return pd.DataFrame(workbook[0])
folder = r'c:\\grid_files'
files = glob.glob(folder + "\\*.xls")
def main():
for idx, file in list(df_grids.iterrows()):
new_filepath = file.filepath
print(new_filepath)
if file.filepath.endswith('.xls') | file.filepath.endswith('.xlsx'):
try:
date_mod = os.path.getmtime(file.filepath)
# date_mod = os.path.getmtime(filepath)
date_mod_stf = datetime.fromtimestamp(date_mod).strftime('%Y-%m-%dT%H-%M-%SZ')
print(f"\nexcel file \t{file.filepath}\n")
df = pd.read_excel(file.filepath)
print(df.head())
columns = df.columns.tolist()
new_filepath = os.path.join(new_folder, date_mod_stf + "_" + os.path.basename(file.filepath))
except:
print(f"\nxml file \t{file.filepath}\n")
try:
date_mod = os.path.getmtime(file.filepath)
date_mod_stf = datetime.fromtimestamp(date_mod).strftime('%Y-%m-%dT%H-%M-%SZ')
new_filepath = os.path.join(new_folder, date_mod_stf + "_" + os.path.basename(file.filepath.replace(".xls", ".xlsx")))
workbook = read_excel_xml(file.filepath)
# workbook = read_excel_xml(filepath)
df = pd.DataFrame(workbook[0])
col_row = 0
col_row_found = False
num_unique_cols = 0
while col_row_found is False:
df.columns = df.iloc[0, :]
df = df.iloc[1:, :]
columns = df.columns
num_unique_cols = len(set(columns.drop_duplicates().dropna()))
print(columns)
if num_unique_cols < 4:
print("only found ", num_unique_cols, " columns")
else:
col_row_found = True
if col_row > 4:
col_row_found = True
print(df.head())
print("\n",new_filepath)
df.to_excel(new_filepath)
except Exception as e:
print(e)
columns = "ERROR"
error_files.append(file.filepath)
df_grids.iloc[idx,2:] = str(columns)
df_grids.iloc[idx,3:] = new_filepath
elif file.filepath.endswith('.csv'):
print(f"csv file \t{file.filepath}")
filepath = "grid1_vywyaf1x.xls"
df = read_excel_xml(filepath)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment