@leonhard-s
Created March 27, 2021 20:22
A conversion script to turn text-mode Logger Pro CMBL files into Excel spreadsheets.
"""A script for loading Logger Pro *.cmbl files into an Excel workbook.
Every matching file will be loaded into a separate Excel worksheet.
Note that this converter only works with text-based cmbl files, binary
files are not supported.
"""
import argparse
import array
import collections
import contextlib
import math
import os
import sys
from xml.etree import ElementTree as xml
from typing import List, Tuple

import openpyxl
from typing_extensions import TypeAlias

# Type aliases
Array: TypeAlias = array.ArrayType
OrderedDict: TypeAlias = 'collections.OrderedDict'
DataSet: TypeAlias = 'OrderedDict[str, Array[float]]'
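
# For illustration, one element of the list returned by `_parse_cmbl`
# below is a (name, DataSet) tuple shaped like this (the names and
# values here are hypothetical):
#
#   ('Run 1', collections.OrderedDict([
#       ('Time', array.array('f', [0.0, 0.05, 0.1])),
#       ('Temperature', array.array('f', [21.3, 21.4, 21.6])),
#   ]))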


def _parse_cmbl(filepath: str, use_double: bool = False
                ) -> 'List[Tuple[str, DataSet]]':
    """Return a list of all datasets found in the file.

    The dataset display name and the column names specified in the
    file are extracted and used as the dataset names and the column
    keys of the returned datasets respectively.

    The dataset arrays will always contain Python floats, but the
    precision used in memory may differ. By default, this only uses C
    floats to store the dataset values (i.e. 32 bits per value).
    Setting the `use_double` flag to True will instead use C doubles
    (i.e. 64 bits per value) as the internal data representation. This
    increases representation accuracy, but also doubles memory usage.

    Args:
        filepath (str): The path of the file to load.
        use_double (bool, optional): Whether to use C doubles for
            stored data. This is what Python's `float` type uses. Only
            set this to True if you require the extra precision, as it
            doubles memory usage. Defaults to False.

    Returns:
        List[Tuple[str, DataSet]]: A list of dataset names and their
            corresponding datasets.

    """
    try:
        tree: xml.ElementTree = xml.parse(filepath)
    except xml.ParseError:
        print('Unable to parse file. Note that only text files are supported, '
              'the CMBL binary format is not')
        sys.exit(1)
    # Parse file root
    datasets: 'List[Tuple[str, DataSet]]' = []
    for l1_ele in tree.getroot():
        if l1_ele.tag != 'DataSet':
            continue
        # Parse dataset
        dataset_name = ''
        dataset_data: DataSet = collections.OrderedDict()
        for l2_ele in l1_ele:
            if l2_ele.tag == 'DataSetName':
                dataset_name = (l2_ele.text or '').strip()
            if l2_ele.tag != 'DataColumn':
                continue
            # Parse dataset columns
            col_name = ''
            col_data: Array[float] = array.array('d' if use_double else 'f')
            for l3_ele in l2_ele:
                if l3_ele.tag == 'DataObjectName':
                    col_name = (l3_ele.text or '').strip()
                if l3_ele.tag != 'ColumnCells':
                    continue
                # Populate array
                for line in (l3_ele.text or '').splitlines():
                    if line.strip():
                        col_data.append(float(line))
            # No column name specified; fall back to its 1-based index
            if not col_name:
                if not col_data:
                    continue
                col_name = f'Column #{len(dataset_data)+1}'
            dataset_data[col_name] = col_data
        # No dataset name specified; fall back to its 1-based index
        if not dataset_name:
            if not dataset_data:
                continue
            dataset_name = f'Dataset #{len(datasets)+1}'
        datasets.append((dataset_name, dataset_data))
    return datasets
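

# For reference, the parser above walks a text-mode CMBL file as plain
# XML. Based on the tags it matches, the expected layout is roughly the
# following (the element names are taken from the code; the root tag,
# names, and values are illustrative only):
#
#   <Document>
#     <DataSet>
#       <DataSetName>Run 1</DataSetName>
#       <DataColumn>
#         <DataObjectName>Time</DataObjectName>
#         <ColumnCells>
#           0.0
#           0.05
#         </ColumnCells>
#       </DataColumn>
#     </DataSet>
#   </Document>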


def _format_dataset_table(datasets: 'OrderedDict[str, DataSet]') -> List[str]:
    """Generate a list of fixed-width rows for the dataset table.

    This gets the name, column count, and row count for each dataset
    provided in the dictionary and returns a list of constant-width
    rows making up a table. The returned rows do not contain any
    leading or trailing whitespace.

    Args:
        datasets (OrderedDict[str, DataSet]): The datasets to list.

    Returns:
        List[str]: A list of lines to print. Does not include leading
            or trailing whitespace.

    """
    # Calculate the width of each column in the table
    max_name = max((len(n)+2 for n in datasets.keys()), default=0)  # +2 for quotes
    max_cols = max((len(d) for d in datasets.values()), default=0)
    max_rows = max(((len(next(iter(d.values()))) if d else 0)
                    for d in datasets.values()), default=0)
    cols_width = (int(math.log10(max_cols)) + 1) if max_cols > 0 else 1
    rows_width = (int(math.log10(max_rows)) + 1) if max_rows > 0 else 1
    # Populate the table
    table_rows: List[str] = []
    for name, dataset in datasets.items():
        num_rows = len(next(iter(dataset.values()))) if dataset else 0
        name = f'"{name}"'
        table_rows.append(f'{name:<{max_name}}: '
                          f'{len(dataset.values()):>{cols_width}} columns, '
                          f'{num_rows:>{rows_width}} rows')
    return table_rows
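

# For illustration, two hypothetical datasets would be rendered by the
# helper above as the following aligned rows (widths are derived from
# the longest name, column count, and row count):
#
#   "Run 1"     :  3 columns, 150 rows
#   "Long Run 2": 12 columns,  75 rows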


def main(in_file: str, out_file: str,
         overwrite: bool, use_double: bool) -> None:
    # Check input path
    if not os.path.exists(in_file):
        print(f'Invalid path: {in_file}')
        sys.exit(1)
    if not os.path.isfile(in_file):
        print(f'Path is not a file, use --dir/-d for directories: {in_file}')
        sys.exit(1)
    # Check output path
    if os.path.isfile(out_file):
        if not overwrite:
            print('The given output file already exists. Specify another name '
                  f'or set the --force/-f flag to overwrite it: {out_file}')
            sys.exit(1)
        try:
            os.remove(out_file)
        except FileNotFoundError:
            pass
    # Parse file
    print(f'Loading file \'{os.path.basename(in_file)}\'...')
    datasets: 'OrderedDict[str, DataSet]' = collections.OrderedDict(
        _parse_cmbl(in_file, use_double))
    # Display statistics
    dataset_table = _format_dataset_table(datasets)
    print(f'Datasets found: {len(datasets)}',
          *(f' {d}' for d in dataset_table),
          sep='\n')
    # Write Excel file
    workbook = openpyxl.Workbook()
    print('In-memory workbook created')
    for index, (name, dataset) in enumerate(datasets.items()):
        print(f'Populating worksheet {index+1} of {len(datasets)}...')
        # Create worksheet (Excel caps sheet names at 31 characters)
        ws = workbook.active if index == 0 else workbook.create_sheet()
        ws.title = name[:31]
        # Write column headers
        ws.append(list(dataset.keys()))
        # Write rows; zip() stops at the shortest column
        for row in zip(*dataset.values()):
            ws.append(list(row))
    print(f'Writing workbook: {os.path.basename(out_file)}')
    workbook.save(out_file)


if __name__ == '__main__':
    _parser = argparse.ArgumentParser()
    _parser.add_argument('path', help='Path to the file to convert')
    _parser.add_argument('--output', '-o', default='', help='Path of the '
                         'output workbook, defaults to <input>.xlsx')
    _parser.add_argument('--force', '-f', action='store_true',
                         help='If set, the output will overwrite any existing '
                         'files of the same name')
    _parser.add_argument('--dir', '-d', action='store_true',
                         help='If set, the input path is expected to be a '
                         'directory, and every *.cmbl file in it is '
                         'converted into its own Excel workbook')
    _parser.add_argument('--doubles', '-D', action='store_true',
                         help='Whether to use doubles to store the parsed '
                         'values, this effectively doubles memory usage')
    _args = _parser.parse_args()
    _filename = _args.output or os.path.basename(_args.path)
    if not _filename:
        # Use the folder name as a fallback
        _filename = os.path.split(os.path.split(_args.path)[0])[1]
    if not _filename.lower().endswith('.xlsx'):
        _filename += '.xlsx'
    if _args.dir:
        _files = [os.path.join(_args.path, f)
                  for f in os.listdir(_args.path)
                  if f.lower().endswith('.cmbl')]
        for _index, _file in enumerate(_files):
            print(f'Processing file {_index+1} of {len(_files)}...')
            _loop_filename = (
                f'{os.path.splitext(os.path.split(_file)[1])[0]}.xlsx')
            # Suppress per-file output in batch mode
            with contextlib.redirect_stdout(None):
                main(_file, _loop_filename, _args.force, _args.doubles)
    else:
        main(_args.path, _filename, _args.force, _args.doubles)
    print('done')
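
# Example invocations (the script and file names are hypothetical):
#
#   python cmbl2xlsx.py run1.cmbl                  # writes run1.xlsx
#   python cmbl2xlsx.py run1.cmbl -o out.xlsx -f   # custom name, overwrite
#   python cmbl2xlsx.py runs/ --dir -D             # one workbook per *.cmbl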