ryansmccoy/read_excel_xml.py

## read_excel_xml.py

from bs4 import BeautifulSoup

def read_excel_xml(filepath):
    """
    Read in XLS file formatted as XML

    :param filepath:
    :return:
    """
    try:
        file = open(filepath, encoding='utf-8-sig').read()
    except:
        file = open(filepath, encoding='utf-8').read()

    soup = BeautifulSoup(file, 'xml')
    workbook = []
    for sheet in soup.findAll('Worksheet'):
        sheet_as_list = []
        for row in sheet.findAll('Row'):
            row_as_list = []
            for cell in row.findAll('Cell'):
                try:
                    cell_data = cell.Data.text
                except:
                    cell_data = ""
                row_as_list.append(cell_data)
            sheet_as_list.append(row_as_list)
        workbook.append(sheet_as_list)
    return pd.DataFrame(workbook[0])

folder = r'c:\\grid_files'
files = glob.glob(folder + "\\*.xls")

def main():

    for idx, file in list(df_grids.iterrows()):
        new_filepath = file.filepath
        print(new_filepath)
        if file.filepath.endswith('.xls') | file.filepath.endswith('.xlsx'):
            try:
                date_mod = os.path.getmtime(file.filepath)
                # date_mod = os.path.getmtime(filepath)
                date_mod_stf = datetime.fromtimestamp(date_mod).strftime('%Y-%m-%dT%H-%M-%SZ')

                print(f"\nexcel file \t{file.filepath}\n")
                df = pd.read_excel(file.filepath)
                print(df.head())
                columns = df.columns.tolist()
                new_filepath = os.path.join(new_folder, date_mod_stf + "_" + os.path.basename(file.filepath))

            except:
                print(f"\nxml file \t{file.filepath}\n")
                try:
                    date_mod = os.path.getmtime(file.filepath)
                    date_mod_stf = datetime.fromtimestamp(date_mod).strftime('%Y-%m-%dT%H-%M-%SZ')
                    new_filepath = os.path.join(new_folder, date_mod_stf + "_" + os.path.basename(file.filepath.replace(".xls", ".xlsx")))
                    workbook = read_excel_xml(file.filepath)
                    # workbook = read_excel_xml(filepath)
                    df = pd.DataFrame(workbook[0])

                    col_row = 0
                    col_row_found = False
                    num_unique_cols = 0

                    while col_row_found is False:
                        df.columns = df.iloc[0, :]
                        df = df.iloc[1:, :]
                        columns = df.columns
                        num_unique_cols = len(set(columns.drop_duplicates().dropna()))
                        print(columns)

                        if num_unique_cols < 4:
                            print("only found ", num_unique_cols, " columns")
                        else:
                            col_row_found = True

                        if col_row > 4:
                            col_row_found = True

                    print(df.head())
                    print("\n",new_filepath)
                    df.to_excel(new_filepath)

                except Exception as e:
                    print(e)
                    columns = "ERROR"
                    error_files.append(file.filepath)

            df_grids.iloc[idx,2:] = str(columns)
            df_grids.iloc[idx,3:] = new_filepath

        elif file.filepath.endswith('.csv'):
            print(f"csv file \t{file.filepath}")


filepath = "grid1_vywyaf1x.xls"
df = read_excel_xml(filepath)

	from bs4 import BeautifulSoup

	def read_excel_xml(filepath):
	"""
	Read in XLS file formatted as XML

	:param filepath:
	:return:
	"""
	try:
	file = open(filepath, encoding='utf-8-sig').read()
	except:
	file = open(filepath, encoding='utf-8').read()

	soup = BeautifulSoup(file, 'xml')
	workbook = []
	for sheet in soup.findAll('Worksheet'):
	sheet_as_list = []
	for row in sheet.findAll('Row'):
	row_as_list = []
	for cell in row.findAll('Cell'):
	try:
	cell_data = cell.Data.text
	except:
	cell_data = ""
	row_as_list.append(cell_data)
	sheet_as_list.append(row_as_list)
	workbook.append(sheet_as_list)
	return pd.DataFrame(workbook[0])

	folder = r'c:\\grid_files'
	files = glob.glob(folder + "\\*.xls")

	def main():

	for idx, file in list(df_grids.iterrows()):
	new_filepath = file.filepath
	print(new_filepath)
	if file.filepath.endswith('.xls') \| file.filepath.endswith('.xlsx'):
	try:
	date_mod = os.path.getmtime(file.filepath)
	# date_mod = os.path.getmtime(filepath)
	date_mod_stf = datetime.fromtimestamp(date_mod).strftime('%Y-%m-%dT%H-%M-%SZ')

	print(f"\nexcel file \t{file.filepath}\n")
	df = pd.read_excel(file.filepath)
	print(df.head())
	columns = df.columns.tolist()
	new_filepath = os.path.join(new_folder, date_mod_stf + "_" + os.path.basename(file.filepath))

	except:
	print(f"\nxml file \t{file.filepath}\n")
	try:
	date_mod = os.path.getmtime(file.filepath)
	date_mod_stf = datetime.fromtimestamp(date_mod).strftime('%Y-%m-%dT%H-%M-%SZ')
	new_filepath = os.path.join(new_folder, date_mod_stf + "_" + os.path.basename(file.filepath.replace(".xls", ".xlsx")))
	workbook = read_excel_xml(file.filepath)
	# workbook = read_excel_xml(filepath)
	df = pd.DataFrame(workbook[0])

	col_row = 0
	col_row_found = False
	num_unique_cols = 0

	while col_row_found is False:
	df.columns = df.iloc[0, :]
	df = df.iloc[1:, :]
	columns = df.columns
	num_unique_cols = len(set(columns.drop_duplicates().dropna()))
	print(columns)

	if num_unique_cols < 4:
	print("only found ", num_unique_cols, " columns")
	else:
	col_row_found = True

	if col_row > 4:
	col_row_found = True

	print(df.head())
	print("\n",new_filepath)
	df.to_excel(new_filepath)

	except Exception as e:
	print(e)
	columns = "ERROR"
	error_files.append(file.filepath)

	df_grids.iloc[idx,2:] = str(columns)
	df_grids.iloc[idx,3:] = new_filepath

	elif file.filepath.endswith('.csv'):
	print(f"csv file \t{file.filepath}")



	filepath = "grid1_vywyaf1x.xls"
	df = read_excel_xml(filepath)