Skip to content

Instantly share code, notes, and snippets.

@vst
Last active March 22, 2020 12:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vst/269aceb5a54de0adf2cb23e3482895f8 to your computer and use it in GitHub Desktop.
Save vst/269aceb5a54de0adf2cb23e3482895f8 to your computer and use it in GitHub Desktop.
.xlsx utilities
"""
This module provides various functions to work with XLSX file.
Reference: https://gist.github.com/vst/269aceb5a54de0adf2cb23e3482895f8
"""
__all__ = ["read_workbook_data", "read_worksheet_data"]
from typing import Any, Dict, Iterable
from openpyxl import load_workbook # type: ignore
from openpyxl.cell.cell import Cell # type: ignore
from openpyxl.worksheet.worksheet import Worksheet # type: ignore
def read_workbook_data(filepath: str) -> Iterable[Dict[str, Any]]:
"""
Attempts to read data from the workbook at the given ``filepath`` as an iterable of :py:class:`dict` instances.
The data is read from the very first worksheet.
:param filepath: Path to the workbook.
:return: an iterable of :py:class:`dict` instances.
"""
## Load the workbook:
workbook = load_workbook(filepath, data_only=True, read_only=True, keep_links=False, keep_vba=False)
## Read the first worksheet and return:
return read_worksheet_data(workbook.worksheets[0])
def read_worksheet_data(worksheet: Worksheet) -> Iterable[Dict[str, Any]]:
"""
Attempts to read data from the given :py:class:`Worksheet` as an iterable of :py:class:`dict` instances.
:param sheet: :py:class:`Worksheet` instance to read data from.
:return: an iterable of :py:class:`dict` instances.
"""
## If we do not have any data rows, return right here:
if worksheet.max_row < 2:
return iter([])
## Get the rows iterator:
rows = worksheet.rows
## Get the header:
header = []
for colno, cell in enumerate(next(rows), start=1):
## Attempt to get colname:
colname = str(cell.value).strip() if cell.value else f"Col{colno}"
## If the colname already exists in the header, append stuff:
while colname in header:
colname = f"{colname}_1"
## Add to header:
header.append(colname)
## Iterate over data rows and yield data values:
for row in rows:
yield dict(zip(header, (_cast(i) for i in row)))
def _cast(x: Cell) -> Any:
"""
Returns the cell value by performing required casts.
In particular, we want to distinguish in between ``date`` and ``datetime``
object. It is a bit ugly, but works fine (so far).
"""
if x.value and x.is_date and "hh:" not in x.number_format.lower():
return x.value.date()
return x.value
if __name__ == "__main__":
import json
import sys
json.dump(list(read_workbook_data(sys.argv[1])), fp=sys.stdout, indent=2, default=str)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment