Skip to content

Instantly share code, notes, and snippets.

@doshiraki
Last active February 19, 2023 04:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save doshiraki/3f23cf44d41d50b2b60d9b966244360e to your computer and use it in GitHub Desktop.
Save doshiraki/3f23cf44d41d50b2b60d9b966244360e to your computer and use it in GitHub Desktop.
excel to tsv by python
import sys
from zipfile import ZipFile
from lxml import etree
import datetime
def loopxml(zr:ZipFile, name:str):
parser = etree.XMLPullParser(['start', 'end'])
level = 0
stack = []
with zr.open(zr.getinfo(name)) as reader:
while True:
buf = reader.read(8192)
if len(buf) <= 0:
break
parser.feed(buf)
for event, elem in parser.read_events():
if event == "end":
yield event, elem, stack
stack.pop()
if 1 <= len(stack) <= 2:
stack[-1].remove(elem)
level -= 1
else:
level += 1
stack.append(elem)
yield event, elem, stack
def main(argv):
fileName = argv[1]
zr = ZipFile(fileName)
sheets = []
for event, elem, stack in loopxml(zr, "xl/workbook.xml"):
if event == "start":
if elem.tag.endswith("}sheet"):
sheets.append(elem.attrib["sheetId"])
cellvalues = []
endhandler = None
for event, elem, stack in loopxml(zr, "xl/sharedStrings.xml"):
#print(elem)
if event == "end":
if endhandler and endhandler():
endhandler = None
else:
if elem.tag.endswith("}si"):
ww = [""]
elif elem.tag.endswith("}sst"):
def endhandler():
if elem.tag.endswith("}t"):
ww[0] += elem.text
elif elem.tag.endswith("}si"):
w = ww[0]
if any((x in "\n\r\t" or x.startswith("\"") and x.endswith("\"") for x in w)):
w = "\"{}\"".format(w.replace("\"", "\"\""))
cellvalues.append(w)
elif elem.tag.endswith("}sst"):
return True
return False
numFmtIDs = dict()
styles = []
endhandler = None
for event, elem, stack in loopxml(zr, "xl/styles.xml"):
if event == "end":
if endhandler and endhandler():
endhandler = None
else:
if elem.tag.endswith("}cellXfs"):
def endhandler():
if elem.tag.endswith("}xf"):
styles.append(numFmtIDs[elem.attrib["numFmtId"]])
else:
return elem.tag.endswith("}cellXfs")
return False
elif elem.tag.endswith("}numFmts"):
def endhandler():
if elem.tag.endswith("}numFmt"):
fc = elem.attrib["formatCode"]
fmt = "s"
if fc == "General":
pass
elif "0" in fc:
fmt = "n"
elif not "@" in fc:
fmt = "d"
numFmtIDs[elem.attrib["numFmtId"]] = (fmt, fc)
else:
return elem.tag.endswith("}numFmts")
return False
def coladdress(addr):
col = 0
for i, x in enumerate(addr):
w = ord(x) - 0x41
if 0 <= w < 26:
col = col * 26 + w
continue
break
ca = 0
for j in range(1, i):
ca *= 26
ca += 26
col += ca
return col
for sheetId in sheets:
row = 0
cells = []
endhandler = None
emptyrow = [None]
for event, elem, stack in loopxml(zr, "xl/worksheets/sheet{}.xml".format(sheetId)):
if event == "end":
if endhandler and endhandler():
endhandler = None
else:
if elem.tag.endswith("}row"):
row += 1
nextrow = int(elem.attrib["r"])
for i in range(row, nextrow):
print("\t".join(emptyrow[0]))
row = nextrow
cells = emptyrow[0].copy()
elif elem.tag.endswith("}dimension"):
def endhandler():
if elem.tag.endswith("}dimension"):
ref = elem.attrib["ref"]
cells = ref.split(":") if ":" in ref else [ref, ref]
cols = coladdress(cells[1]) + 1
emptyrow[0] = [""] * cols
return True
elif elem.tag.endswith("}sheetData"):
def endhandler():
if elem.tag.endswith("}v"):
val = elem.text
cell = stack[-2].attrib
style = styles[int(cell["s"])]
if cell["t"] == "n":
if style[0] != "s":
if style[0] == "d":
val = datetime.datetime(1899, 12, 30) + datetime.timedelta(days=float(val))
val -= datetime.timedelta(microseconds=val.microsecond)
if val == datetime.datetime(val.year, val.month, val.day):
val = "{:%Y/%m/%d}".format(val)
else:
val = "{:%Y/%m/%d %H:%M:%S}".format(val)
elif style[0] == "n":
if style[1].endswith("%"):
val = "{}%".format(float(val) * 100)
#val = "{}({})".format(val, style)
else:
val = cellvalues[int(val)]
cells[coladdress(cell["r"])] = val
elif elem.tag.endswith("}row"):
print("\t".join(cells))
elif elem.tag.endswith("}sheetData"):
return True
return False
if __name__ == "__main__":
main(sys.argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment