-
-
Save Gribouillis/e4360ea3b0dbd0e37f4c13fa276adc8e to your computer and use it in GitHub Desktop.
from collections import namedtuple | |
import pathlib | |
import pickle | |
import subprocess as sp | |
import tempfile | |
__version__ = '2019.01.16.2' | |
__all__ = ['spreadsheet_to_dict',] | |
WorksheetKey = namedtuple('WorksheetKey', "index name") | |
def spreadsheet_to_dict(filename, cachename=None): | |
"""Return a dict: {(index, name): <list of tuples>} | |
Arguments | |
filename: a path to a .ods or .xlsx file. It may also work | |
with .csv and .xls and others | |
cachename: if not None, a path to a pickle file where the | |
python dictionary will be stored. If this file exists | |
and its modification time is more recent than that of | |
the spreadsheet file, it is read directly instead of | |
the spreadsheet. | |
Worksheet keys are namedtuples with members .index and .name | |
see also: libreoffice, openpyxl | |
""" | |
filename = pathlib.Path(filename) | |
if cachename: | |
cachename = pathlib.Path(cachename) | |
if cachename.is_file() and ( | |
cachename.stat().st_mtime > filename.stat().st_mtime): | |
return load_cache(cachename) | |
if filename.suffix == '.xlsx': | |
return _xlsx_to_dict(filename, cachename) | |
with tempfile.TemporaryDirectory() as tmpd: | |
tmp = pathlib.Path(tmpd) | |
userdir = tmp/'.user' | |
userdir.mkdir() | |
convert_to_xlsx = [ | |
'soffice', | |
# avoid possible interaction with running instance of libreoffice | |
'-env:UserInstallation={}'.format(userdir.as_uri()), | |
'--headless', | |
'--convert-to', | |
# 'xlsx:"Calc MS Excel 2007 XML"', # <- doesn't work | |
# 'xlsx', # <- works | |
'xlsx:Calc MS Excel 2007 XML', # <- also works | |
'--outdir', | |
str(tmp), | |
str(filename)] | |
sp.run(convert_to_xlsx, check=True, stdout=sp.DEVNULL) | |
return _xlsx_to_dict( | |
(tmp/filename.name).with_suffix('.xlsx'), cachename) | |
def _xlsx_to_dict(filename, cachename): | |
db = {} | |
from openpyxl.reader.excel import load_workbook | |
wb=load_workbook(str(filename)) | |
for i, (ws, name) in enumerate(zip(wb.worksheets, wb.sheetnames)): | |
db[WorksheetKey(i, name)] = list(ws.values) | |
if cachename: | |
dump_cache(db, cachename) | |
return db | |
def load_cache(cachename): | |
with cachename.open('rb') as ifh: | |
return pickle.load(ifh) | |
def dump_cache(obj, cachename): | |
try: | |
exc_occurred = False | |
try: | |
with cachename.open('wb') as ofh: | |
pickle.dump(obj, ofh) | |
except Exception: | |
exc_occurred = True | |
raise | |
finally: | |
if exc_occurred: | |
try: | |
cachename.unlink() | |
except OSError: | |
pass | |
if __name__ == '__main__': | |
with tempfile.TemporaryDirectory() as tmp: | |
fn = pathlib.Path(tmp)/'tmp_example.csv' | |
with fn.open('w') as ofh: | |
print("""\ | |
Spam,maps,32 | |
Eggs,sgge,64 | |
Bacon,nocab,128""", file=ofh) | |
d = spreadsheet_to_dict(str(fn)) | |
print(d) |
@BlacksheepVMF-214 Hi, thank you for your interest in this snippet. Usage is quite simple, save it on the python path, then in a program write for example
from spreadsheet_to_dict import spreadsheet_to_dict
datadict = spreadsheet_to_dict('myfile.ods', cachename='mycache.pkl')
This call returns a Python dictionary with the contents obtained from the spreadsheet. Using a cache file is optional, but if you run your program several times and the spreadsheet was not modified between two runs, then it will be loaded very fast after the first run because a pickled version of the dictionary is stored in the cache. If the spreadsheet is changed on disk, it will be read again.
As for which spreadsheet this can load, well, the main tool is to call libreoffice to convert the spreadsheet to xlsx, so let's say it is every file for which this conversion is successful.
This code uses the 'soffice' command, but with today's version of the 'libreoffice' command, the call could probably be simplified.
Thank you for your prompt response, and its content.
My best guess as to how the csv file should look is this:
dict_name,,
,key1,def1
,key2,def2
,key3,def3
...
Is this correct?
@BlacksheepVMF-214 No, not at all. The data contained in the csv file (or ods or xls) does not need to be related to python, nor to this script. Create a simple spreadsheet with Excel or LibreOffice and try to load it in Python with this code. For each sheet in the workbook, a pair (key, value) will be created.
Just run the code and see what it produces.
I'm new to GitHub, so if I'm not doing things correctly, please let me know...
Can you give an example of (1) what the spreadsheet/csv looks like, and (2) an example of how to use this file?
Thanks.