Last active
February 19, 2023 04:33
-
-
Save doshiraki/3f23cf44d41d50b2b60d9b966244360e to your computer and use it in GitHub Desktop.
excel to tsv by python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
from zipfile import ZipFile | |
from lxml import etree | |
import datetime | |
def loopxml(zr:ZipFile, name:str): | |
parser = etree.XMLPullParser(['start', 'end']) | |
level = 0 | |
stack = [] | |
with zr.open(zr.getinfo(name)) as reader: | |
while True: | |
buf = reader.read(8192) | |
if len(buf) <= 0: | |
break | |
parser.feed(buf) | |
for event, elem in parser.read_events(): | |
if event == "end": | |
yield event, elem, stack | |
stack.pop() | |
if 1 <= len(stack) <= 2: | |
stack[-1].remove(elem) | |
level -= 1 | |
else: | |
level += 1 | |
stack.append(elem) | |
yield event, elem, stack | |
def main(argv): | |
fileName = argv[1] | |
zr = ZipFile(fileName) | |
sheets = [] | |
for event, elem, stack in loopxml(zr, "xl/workbook.xml"): | |
if event == "start": | |
if elem.tag.endswith("}sheet"): | |
sheets.append(elem.attrib["sheetId"]) | |
cellvalues = [] | |
endhandler = None | |
for event, elem, stack in loopxml(zr, "xl/sharedStrings.xml"): | |
#print(elem) | |
if event == "end": | |
if endhandler and endhandler(): | |
endhandler = None | |
else: | |
if elem.tag.endswith("}si"): | |
ww = [""] | |
elif elem.tag.endswith("}sst"): | |
def endhandler(): | |
if elem.tag.endswith("}t"): | |
ww[0] += elem.text | |
elif elem.tag.endswith("}si"): | |
w = ww[0] | |
if any((x in "\n\r\t" or x.startswith("\"") and x.endswith("\"") for x in w)): | |
w = "\"{}\"".format(w.replace("\"", "\"\"")) | |
cellvalues.append(w) | |
elif elem.tag.endswith("}sst"): | |
return True | |
return False | |
numFmtIDs = dict() | |
styles = [] | |
endhandler = None | |
for event, elem, stack in loopxml(zr, "xl/styles.xml"): | |
if event == "end": | |
if endhandler and endhandler(): | |
endhandler = None | |
else: | |
if elem.tag.endswith("}cellXfs"): | |
def endhandler(): | |
if elem.tag.endswith("}xf"): | |
styles.append(numFmtIDs[elem.attrib["numFmtId"]]) | |
else: | |
return elem.tag.endswith("}cellXfs") | |
return False | |
elif elem.tag.endswith("}numFmts"): | |
def endhandler(): | |
if elem.tag.endswith("}numFmt"): | |
fc = elem.attrib["formatCode"] | |
fmt = "s" | |
if fc == "General": | |
pass | |
elif "0" in fc: | |
fmt = "n" | |
elif not "@" in fc: | |
fmt = "d" | |
numFmtIDs[elem.attrib["numFmtId"]] = (fmt, fc) | |
else: | |
return elem.tag.endswith("}numFmts") | |
return False | |
def coladdress(addr): | |
col = 0 | |
for i, x in enumerate(addr): | |
w = ord(x) - 0x41 | |
if 0 <= w < 26: | |
col = col * 26 + w | |
continue | |
break | |
ca = 0 | |
for j in range(1, i): | |
ca *= 26 | |
ca += 26 | |
col += ca | |
return col | |
for sheetId in sheets: | |
row = 0 | |
cells = [] | |
endhandler = None | |
emptyrow = [None] | |
for event, elem, stack in loopxml(zr, "xl/worksheets/sheet{}.xml".format(sheetId)): | |
if event == "end": | |
if endhandler and endhandler(): | |
endhandler = None | |
else: | |
if elem.tag.endswith("}row"): | |
row += 1 | |
nextrow = int(elem.attrib["r"]) | |
for i in range(row, nextrow): | |
print("\t".join(emptyrow[0])) | |
row = nextrow | |
cells = emptyrow[0].copy() | |
elif elem.tag.endswith("}dimension"): | |
def endhandler(): | |
if elem.tag.endswith("}dimension"): | |
ref = elem.attrib["ref"] | |
cells = ref.split(":") if ":" in ref else [ref, ref] | |
cols = coladdress(cells[1]) + 1 | |
emptyrow[0] = [""] * cols | |
return True | |
elif elem.tag.endswith("}sheetData"): | |
def endhandler(): | |
if elem.tag.endswith("}v"): | |
val = elem.text | |
cell = stack[-2].attrib | |
style = styles[int(cell["s"])] | |
if cell["t"] == "n": | |
if style[0] != "s": | |
if style[0] == "d": | |
val = datetime.datetime(1899, 12, 30) + datetime.timedelta(days=float(val)) | |
val -= datetime.timedelta(microseconds=val.microsecond) | |
if val == datetime.datetime(val.year, val.month, val.day): | |
val = "{:%Y/%m/%d}".format(val) | |
else: | |
val = "{:%Y/%m/%d %H:%M:%S}".format(val) | |
elif style[0] == "n": | |
if style[1].endswith("%"): | |
val = "{}%".format(float(val) * 100) | |
#val = "{}({})".format(val, style) | |
else: | |
val = cellvalues[int(val)] | |
cells[coladdress(cell["r"])] = val | |
elif elem.tag.endswith("}row"): | |
print("\t".join(cells)) | |
elif elem.tag.endswith("}sheetData"): | |
return True | |
return False | |
if __name__ == "__main__": | |
main(sys.argv) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment