Last active
December 29, 2021 22:11
-
-
Save Gribouillis/e4360ea3b0dbd0e37f4c13fa276adc8e to your computer and use it in GitHub Desktop.
Spreadsheet to dictionary
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import namedtuple | |
import pathlib | |
import pickle | |
import subprocess as sp | |
import tempfile | |
__version__ = '2019.01.16.2' | |
__all__ = ['spreadsheet_to_dict',] | |
WorksheetKey = namedtuple('WorksheetKey', "index name") | |
def spreadsheet_to_dict(filename, cachename=None): | |
"""Return a dict: {(index, name): <list of tuples>} | |
Arguments | |
filename: a path to a .ods or .xlsx file. It may also work | |
with .csv and .xls and others | |
cachename: if not None, a path to a pickle file where the | |
python dictionary will be stored. If this file exists | |
and its modification time is more recent than that of | |
the spreadsheet file, it is read directly instead of | |
the spreadsheet. | |
Worksheet keys are namedtuples with members .index and .name | |
see also: libreoffice, openpyxl | |
""" | |
filename = pathlib.Path(filename) | |
if cachename: | |
cachename = pathlib.Path(cachename) | |
if cachename.is_file() and ( | |
cachename.stat().st_mtime > filename.stat().st_mtime): | |
return load_cache(cachename) | |
if filename.suffix == '.xlsx': | |
return _xlsx_to_dict(filename, cachename) | |
with tempfile.TemporaryDirectory() as tmpd: | |
tmp = pathlib.Path(tmpd) | |
userdir = tmp/'.user' | |
userdir.mkdir() | |
convert_to_xlsx = [ | |
'soffice', | |
# avoid possible interaction with running instance of libreoffice | |
'-env:UserInstallation={}'.format(userdir.as_uri()), | |
'--headless', | |
'--convert-to', | |
# 'xlsx:"Calc MS Excel 2007 XML"', # <- doesn't work | |
# 'xlsx', # <- works | |
'xlsx:Calc MS Excel 2007 XML', # <- also works | |
'--outdir', | |
str(tmp), | |
str(filename)] | |
sp.run(convert_to_xlsx, check=True, stdout=sp.DEVNULL) | |
return _xlsx_to_dict( | |
(tmp/filename.name).with_suffix('.xlsx'), cachename) | |
def _xlsx_to_dict(filename, cachename): | |
db = {} | |
from openpyxl.reader.excel import load_workbook | |
wb=load_workbook(str(filename)) | |
for i, (ws, name) in enumerate(zip(wb.worksheets, wb.sheetnames)): | |
db[WorksheetKey(i, name)] = list(ws.values) | |
if cachename: | |
dump_cache(db, cachename) | |
return db | |
def load_cache(cachename): | |
with cachename.open('rb') as ifh: | |
return pickle.load(ifh) | |
def dump_cache(obj, cachename): | |
try: | |
exc_occurred = False | |
try: | |
with cachename.open('wb') as ofh: | |
pickle.dump(obj, ofh) | |
except Exception: | |
exc_occurred = True | |
raise | |
finally: | |
if exc_occurred: | |
try: | |
cachename.unlink() | |
except OSError: | |
pass | |
if __name__ == '__main__': | |
with tempfile.TemporaryDirectory() as tmp: | |
fn = pathlib.Path(tmp)/'tmp_example.csv' | |
with fn.open('w') as ofh: | |
print("""\ | |
Spam,maps,32 | |
Eggs,sgge,64 | |
Bacon,nocab,128""", file=ofh) | |
d = spreadsheet_to_dict(str(fn)) | |
print(d) |
@BlacksheepVMF-214 No, not at all. The data contained in the csv file (or ods or xls) does not need to be related to python, nor to this script. Create a simple spreadsheet with Excel or LibreOffice and try to load it in Python with this code. For each sheet in the workbook, a pair (key, value) will be created.
Just run the code and see what it produces.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thank you for your prompt response, and its content.
My best guess as to how the csv file should look is this:
dict_name,,
,key1,def1
,key2,def2
,key3,def3
...
Is this correct?