Gribouillis/spreadsheet_to_dict.py

## spreadsheet_to_dict.py
from collections import namedtuple
import pathlib
import pickle
import subprocess as sp
import tempfile

__version__ = '2019.01.16.2'
__all__ = ['spreadsheet_to_dict',]

WorksheetKey = namedtuple('WorksheetKey', "index name")

def spreadsheet_to_dict(filename, cachename=None):
    """Return a dict: {(index, name): <list of tuples>}

    Arguments
        filename: a path to a .ods or .xlsx file. It may also work
                with .csv and .xls and others

        cachename: if not None, a path to a pickle file where the
                python dictionary will be stored. If this file exists
                and its modification time is more recent than that of
                the spreadsheet file, it is read directly instead of
                the spreadsheet.

    Worksheet keys are namedtuples with members .index and .name

    see also: libreoffice, openpyxl
    """
    filename = pathlib.Path(filename)
    if cachename:
        cachename = pathlib.Path(cachename)
        if cachename.is_file() and (
            cachename.stat().st_mtime > filename.stat().st_mtime):
            return load_cache(cachename)
    if filename.suffix == '.xlsx':
        return _xlsx_to_dict(filename, cachename)
    with tempfile.TemporaryDirectory() as tmpd:
        tmp = pathlib.Path(tmpd)
        userdir = tmp/'.user'
        userdir.mkdir()
        convert_to_xlsx = [
            'soffice',
            # avoid possible interaction with running instance of libreoffice
            '-env:UserInstallation={}'.format(userdir.as_uri()),
            '--headless',
            '--convert-to',
            # 'xlsx:"Calc MS Excel 2007 XML"', # <- doesn't work
            # 'xlsx', # <- works
            'xlsx:Calc MS Excel 2007 XML', # <- also works
            '--outdir',
            str(tmp),
            str(filename)]
        sp.run(convert_to_xlsx, check=True, stdout=sp.DEVNULL)
        return _xlsx_to_dict(
            (tmp/filename.name).with_suffix('.xlsx'), cachename)

def _xlsx_to_dict(filename, cachename):
    db = {}
    from openpyxl.reader.excel import load_workbook
    wb=load_workbook(str(filename))
    for i, (ws, name) in enumerate(zip(wb.worksheets, wb.sheetnames)):
        db[WorksheetKey(i, name)] = list(ws.values)
    if cachename:
        dump_cache(db, cachename)
    return db

def load_cache(cachename):
    with cachename.open('rb') as ifh:
        return pickle.load(ifh)

def dump_cache(obj, cachename):
    try:
        exc_occurred = False
        try:
            with cachename.open('wb') as ofh:
                pickle.dump(obj, ofh)
        except Exception:
            exc_occurred = True
            raise
    finally:
        if exc_occurred:
            try:
                cachename.unlink()
            except OSError:
                pass


if __name__ == '__main__':
    with tempfile.TemporaryDirectory() as tmp:
        fn = pathlib.Path(tmp)/'tmp_example.csv'
        with fn.open('w') as ofh:
            print("""\
Spam,maps,32
Eggs,sgge,64
Bacon,nocab,128""", file=ofh)
        d = spreadsheet_to_dict(str(fn))
    print(d)
	from collections import namedtuple
	import pathlib
	import pickle
	import subprocess as sp
	import tempfile

	__version__ = '2019.01.16.2'
	__all__ = ['spreadsheet_to_dict',]

	WorksheetKey = namedtuple('WorksheetKey', "index name")

	def spreadsheet_to_dict(filename, cachename=None):
	"""Return a dict: {(index, name): <list of tuples>}

	Arguments
	filename: a path to a .ods or .xlsx file. It may also work
	with .csv and .xls and others

	cachename: if not None, a path to a pickle file where the
	python dictionary will be stored. If this file exists
	and its modification time is more recent than that of
	the spreadsheet file, it is read directly instead of
	the spreadsheet.

	Worksheet keys are namedtuples with members .index and .name

	see also: libreoffice, openpyxl
	"""
	filename = pathlib.Path(filename)
	if cachename:
	cachename = pathlib.Path(cachename)
	if cachename.is_file() and (
	cachename.stat().st_mtime > filename.stat().st_mtime):
	return load_cache(cachename)
	if filename.suffix == '.xlsx':
	return _xlsx_to_dict(filename, cachename)
	with tempfile.TemporaryDirectory() as tmpd:
	tmp = pathlib.Path(tmpd)
	userdir = tmp/'.user'
	userdir.mkdir()
	convert_to_xlsx = [
	'soffice',
	# avoid possible interaction with running instance of libreoffice
	'-env:UserInstallation={}'.format(userdir.as_uri()),
	'--headless',
	'--convert-to',
	# 'xlsx:"Calc MS Excel 2007 XML"', # <- doesn't work
	# 'xlsx', # <- works
	'xlsx:Calc MS Excel 2007 XML', # <- also works
	'--outdir',
	str(tmp),
	str(filename)]
	sp.run(convert_to_xlsx, check=True, stdout=sp.DEVNULL)
	return _xlsx_to_dict(
	(tmp/filename.name).with_suffix('.xlsx'), cachename)

	def _xlsx_to_dict(filename, cachename):
	db = {}
	from openpyxl.reader.excel import load_workbook
	wb=load_workbook(str(filename))
	for i, (ws, name) in enumerate(zip(wb.worksheets, wb.sheetnames)):
	db[WorksheetKey(i, name)] = list(ws.values)
	if cachename:
	dump_cache(db, cachename)
	return db

	def load_cache(cachename):
	with cachename.open('rb') as ifh:
	return pickle.load(ifh)

	def dump_cache(obj, cachename):
	try:
	exc_occurred = False
	try:
	with cachename.open('wb') as ofh:
	pickle.dump(obj, ofh)
	except Exception:
	exc_occurred = True
	raise
	finally:
	if exc_occurred:
	try:
	cachename.unlink()
	except OSError:
	pass


	if __name__ == '__main__':
	with tempfile.TemporaryDirectory() as tmp:
	fn = pathlib.Path(tmp)/'tmp_example.csv'
	with fn.open('w') as ofh:
	print("""\
	Spam,maps,32
	Eggs,sgge,64
	Bacon,nocab,128""", file=ofh)
	d = spreadsheet_to_dict(str(fn))
	print(d)