Last active
August 15, 2020 18:34
-
-
Save beatmadsen/2479a713610bf5263721b0dd8da55528 to your computer and use it in GitHub Desktop.
python 3 parse badly encoded excel sheet
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Solution inspired by https://stackoverflow.com/a/33504236/2020801 | |
from xml.sax import ContentHandler, parseString | |
class ExcelHandler(ContentHandler): | |
def __init__(self): | |
self.chars = [] | |
self.cells = [] | |
self.rows = [] | |
self.table = [] | |
self.sheet_name = None | |
self.worksheets = {} | |
def characters(self, content): | |
self.chars.append(content) | |
def startElement(self, name, attrs): | |
if name == "ss:Cell": | |
self.chars = [] | |
elif name == "ss:Row": | |
self.cells = [] | |
elif name == "ss:Table": | |
self.rows = [] | |
elif name == "ss:Worksheet": | |
self.table = [] | |
self.sheet_name = attrs.getValue("ss:Name") | |
def endElement(self, name): | |
if name == "ss:Cell": | |
self.cells.append(''.join(self.chars)) | |
elif name == "ss:Row": | |
self.rows.append(self.cells) | |
elif name == "ss:Table": | |
self.table.append(self.rows) | |
elif name == "ss:Worksheet": | |
self.worksheets[self.sheet_name] = self.table | |
self.sheet_name = None | |
# Usage: | |
# 1. the file is corrupt in that it has two BOM characters instead of the expected one. | |
# by specifying encoding we'll at least get rid of one. The parser will be able to disregard the remaining one. | |
f = open('iShares-Russell-2000-ETF_fund.xls', encoding='utf-8-sig') | |
s = f.read() | |
# 2. instantiate parser handler and parse string | |
excel_handler = ExcelHandler() | |
parseString(s, excel_handler) | |
# 3. access data saved on handler after parsing. | |
# table in the worksheet with name 'Performance' | |
# NB: worksheets is a dict unlike in previous versions | |
table = excel_handler.worksheets['Performance'] | |
# contents of second row, third cell | |
text_in_cell = table[1][2] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment