Skip to content

Instantly share code, notes, and snippets.

@beatmadsen
Last active August 15, 2020 18:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save beatmadsen/2479a713610bf5263721b0dd8da55528 to your computer and use it in GitHub Desktop.
Save beatmadsen/2479a713610bf5263721b0dd8da55528 to your computer and use it in GitHub Desktop.
python 3 parse badly encoded excel sheet
# Solution inspired by https://stackoverflow.com/a/33504236/2020801
from xml.sax import ContentHandler, parseString
class ExcelHandler(ContentHandler):
def __init__(self):
self.chars = []
self.cells = []
self.rows = []
self.table = []
self.sheet_name = None
self.worksheets = {}
def characters(self, content):
self.chars.append(content)
def startElement(self, name, attrs):
if name == "ss:Cell":
self.chars = []
elif name == "ss:Row":
self.cells = []
elif name == "ss:Table":
self.rows = []
elif name == "ss:Worksheet":
self.table = []
self.sheet_name = attrs.getValue("ss:Name")
def endElement(self, name):
if name == "ss:Cell":
self.cells.append(''.join(self.chars))
elif name == "ss:Row":
self.rows.append(self.cells)
elif name == "ss:Table":
self.table.append(self.rows)
elif name == "ss:Worksheet":
self.worksheets[self.sheet_name] = self.table
self.sheet_name = None
# Usage:
# 1. the file is corrupt in that it has two BOM characters instead of the expected one.
# by specifying encoding we'll at least get rid of one. The parser will be able to disregard the remaining one.
f = open('iShares-Russell-2000-ETF_fund.xls', encoding='utf-8-sig')
s = f.read()
# 2. instantiate parser handler and parse string
excel_handler = ExcelHandler()
parseString(s, excel_handler)
# 3. access data saved on handler after parsing.
# table in the worksheet with name 'Performance'
# NB: worksheets is a dict unlike in previous versions
table = excel_handler.worksheets['Performance']
# contents of second row, third cell
text_in_cell = table[1][2]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment