@leonhard-s
Created March 27, 2021 20:22
A conversion script to turn text-mode Logger Pro CMBL files into Excel spreadsheets.
"""A script for loading Logger Pro *.cmbl files into an Excel workbook.
Every matching file will be loaded into a separate Excel worksheet.
Note that this converter only works with text-based cmbl files, binary
files are not supported.
"""
import argparse
import array
import collections
import contextlib
import math
import os
import sys
from xml.etree import ElementTree as xml
from typing import List, Tuple

import openpyxl
from typing_extensions import TypeAlias

# Type aliases
Array: TypeAlias = array.ArrayType
OrderedDict: TypeAlias = 'collections.OrderedDict'
DataSet: TypeAlias = 'OrderedDict[str, Array[float]]'
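
# For illustration, one element of the list returned by `_parse_cmbl`
# below is a (name, DataSet) tuple shaped like this (the names and
# values here are hypothetical):
#
#   ('Run 1', collections.OrderedDict([
#       ('Time', array.array('f', [0.0, 0.05, 0.1])),
#       ('Temperature', array.array('f', [21.3, 21.4, 21.6])),
#   ]))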


def _parse_cmbl(filepath: str, use_double: bool = False
                ) -> 'List[Tuple[str, DataSet]]':
    """Return a list of all datasets found in the file.

    The dataset display name and the column names specified in the
    file are extracted and used as the dataset names and the column
    keys of the returned datasets respectively.

    The dataset arrays will always contain Python floats, but the
    precision used in memory may differ. By default, this only uses C
    floats to store the dataset values (i.e. 32 bits per value).
    Setting the `use_double` flag to True will instead use C doubles
    (i.e. 64 bits per value) as the internal data representation. This
    increases representation accuracy, but also doubles memory usage.

    Args:
        filepath (str): The path of the file to load.
        use_double (bool, optional): Whether to use C doubles for
            stored data. This is what Python's `float` type uses. Only
            set this to True if you require the extra precision, as it
            doubles memory usage. Defaults to False.

    Returns:
        List[Tuple[str, DataSet]]: A list of dataset names and their
            corresponding datasets.

    """
    try:
        tree: xml.ElementTree = xml.parse(filepath)
    except xml.ParseError:
        print('Unable to parse file. Note that only text files are supported, '
              'the CMBL binary format is not')
        sys.exit(1)
    # Parse file root
    datasets: 'List[Tuple[str, DataSet]]' = []
    for l1_ele in tree.getroot():
        if l1_ele.tag != 'DataSet':
            continue
        # Parse dataset
        dataset_name = ''
        dataset_data: DataSet = collections.OrderedDict()
        for l2_ele in l1_ele:
            if l2_ele.tag == 'DataSetName':
                dataset_name = (l2_ele.text or '').strip()
            if l2_ele.tag != 'DataColumn':
                continue
            # Parse dataset columns
            col_name = ''
            col_data: Array[float] = array.array('d' if use_double else 'f')
            for l3_ele in l2_ele:
                if l3_ele.tag == 'DataObjectName':
                    col_name = (l3_ele.text or '').strip()
                if l3_ele.tag != 'ColumnCells':
                    continue
                # Populate array
                for line in (l3_ele.text or '').splitlines():
                    if line.strip():
                        col_data.append(float(line))
            # No column name specified; fall back to its 1-based index
            if not col_name:
                if not col_data:
                    continue
                col_name = f'Column #{len(dataset_data)+1}'
            dataset_data[col_name] = col_data
        # No dataset name specified; fall back to its 1-based index
        if not dataset_name:
            if not dataset_data:
                continue
            dataset_name = f'Dataset #{len(datasets)+1}'
        datasets.append((dataset_name, dataset_data))
    return datasets
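

# For reference, the parser above walks a text-mode CMBL file as plain
# XML. Based on the tags it matches, the expected layout is roughly the
# following (the element names are taken from the code; the root tag,
# names, and values are illustrative only):
#
#   <Document>
#     <DataSet>
#       <DataSetName>Run 1</DataSetName>
#       <DataColumn>
#         <DataObjectName>Time</DataObjectName>
#         <ColumnCells>
#           0.0
#           0.05
#         </ColumnCells>
#       </DataColumn>
#     </DataSet>
#   </Document>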


def _format_dataset_table(datasets: 'OrderedDict[str, DataSet]') -> List[str]:
    """Generate a list of fixed-width rows for the dataset table.

    This gets the name, column count, and row count for each dataset
    provided in the dictionary and returns a list of constant-width
    rows making up a table. The returned rows do not contain any
    leading or trailing whitespace.

    Args:
        datasets (OrderedDict[str, DataSet]): The datasets to list.

    Returns:
        List[str]: A list of lines to print. Does not include leading
            or trailing whitespace.

    """
    # Calculate the width of each column in the table
    max_name = max((len(n)+2 for n in datasets.keys()), default=0)  # +2 for quotes
    max_cols = max((len(d) for d in datasets.values()), default=0)
    max_rows = max(((len(next(iter(d.values()))) if d else 0)
                    for d in datasets.values()), default=0)
    cols_width = (int(math.log10(max_cols)) + 1) if max_cols > 0 else 1
    rows_width = (int(math.log10(max_rows)) + 1) if max_rows > 0 else 1
    # Populate the table
    table_rows: List[str] = []
    for name, dataset in datasets.items():
        num_rows = len(next(iter(dataset.values()))) if dataset else 0
        name = f'"{name}"'
        table_rows.append(f'{name:<{max_name}}: '
                          f'{len(dataset.values()):>{cols_width}} columns, '
                          f'{num_rows:>{rows_width}} rows')
    return table_rows
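

# For illustration, two hypothetical datasets would be rendered by the
# helper above as the following aligned rows (widths are derived from
# the longest name, column count, and row count):
#
#   "Run 1"     :  3 columns, 150 rows
#   "Long Run 2": 12 columns,  75 rows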


def main(in_file: str, out_file: str,
         overwrite: bool, use_double: bool) -> None:
    # Check input path
    if not os.path.exists(in_file):
        print(f'Invalid path: {in_file}')
        sys.exit(1)
    if not os.path.isfile(in_file):
        print(f'Path is not a file, use --dir/-d for directories: {in_file}')
        sys.exit(1)
    # Check output path
    if os.path.isfile(out_file):
        if not overwrite:
            print('The given output file already exists. Specify another name '
                  f'or set the --force/-f flag to overwrite it: {out_file}')
            sys.exit(1)
        try:
            os.remove(out_file)
        except FileNotFoundError:
            pass
    # Parse file
    print(f'Loading file \'{os.path.basename(in_file)}\'...')
    datasets: 'OrderedDict[str, DataSet]' = collections.OrderedDict(
        _parse_cmbl(in_file, use_double))
    # Display statistics
    dataset_table = _format_dataset_table(datasets)
    print(f'Datasets found: {len(datasets)}',
          *(f' {d}' for d in dataset_table),
          sep='\n')
    # Write Excel file
    workbook = openpyxl.Workbook()
    print('In-memory workbook created')
    for index, (name, dataset) in enumerate(datasets.items()):
        print(f'Populating worksheet {index+1} of {len(datasets)}...')
        # Create worksheet (Excel caps sheet names at 31 characters)
        ws = workbook.active if index == 0 else workbook.create_sheet()
        ws.title = name[:31]
        # Write column headers
        ws.append(list(dataset.keys()))
        # Write rows; zip() stops at the shortest column
        for row in zip(*dataset.values()):
            ws.append(list(row))
    print(f'Writing workbook: {os.path.basename(out_file)}')
    workbook.save(out_file)


if __name__ == '__main__':
    _parser = argparse.ArgumentParser()
    _parser.add_argument('path', help='Path to the file to convert')
    _parser.add_argument('--output', '-o', default='', help='Path of the '
                         'output workbook, defaults to <input>.xlsx')
    _parser.add_argument('--force', '-f', action='store_true',
                         help='If set, the output will overwrite any existing '
                         'files of the same name')
    _parser.add_argument('--dir', '-d', action='store_true',
                         help='If set, the input path is expected to be a '
                         'directory, and every *.cmbl file in it is '
                         'converted into its own Excel workbook')
    _parser.add_argument('--doubles', '-D', action='store_true',
                         help='Whether to use doubles to store the parsed '
                         'values, this effectively doubles memory usage')
    _args = _parser.parse_args()
    _filename = _args.output or os.path.basename(_args.path)
    if not _filename:
        # Use the folder name as a fallback
        _filename = os.path.split(os.path.split(_args.path)[0])[1]
    if not _filename.lower().endswith('.xlsx'):
        _filename += '.xlsx'
    if _args.dir:
        _files = [os.path.join(_args.path, f)
                  for f in os.listdir(_args.path)
                  if f.lower().endswith('.cmbl')]
        for _index, _file in enumerate(_files):
            print(f'Processing file {_index+1} of {len(_files)}...')
            _loop_filename = (
                f'{os.path.splitext(os.path.split(_file)[1])[0]}.xlsx')
            # Suppress per-file output in batch mode
            with contextlib.redirect_stdout(None):
                main(_file, _loop_filename, _args.force, _args.doubles)
    else:
        main(_args.path, _filename, _args.force, _args.doubles)
    print('done')
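
# Example invocations (the script and file names are hypothetical):
#
#   python cmbl2xlsx.py run1.cmbl                  # writes run1.xlsx
#   python cmbl2xlsx.py run1.cmbl -o out.xlsx -f   # custom name, overwrite
#   python cmbl2xlsx.py runs/ --dir -D             # one workbook per *.cmbl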