xflr6/bob_ross.py

## bob_ross.py
"""Benchmark FCA concepts lattice generation with bob_ross.csv."""

from collections.abc import Iterable, Iterator, Sequence
import csv
import os
import pathlib
import time
from typing import NamedTuple, Optional
import urllib.request

import concepts

# Remote location of the raw episode/element table.
URL = ('https://raw.githubusercontent.com'
       '/fivethirtyeight/data/master'
       '/bob-ross/elements-by-episode.csv')

# Local copy of the download, named after the last URL path segment.
CSV = pathlib.Path(URL.rsplit('/', 1)[-1])

# Keyword arguments shared by the text-mode open() calls in this script.
OPEN_KWARGS = dict(encoding='ascii', newline='\n')

# Derived context file in CSV form: stem of CSV + '-cxt' + suffix of CSV.
CSV_CONTEXT = CSV.with_name(f'{CSV.stem}-cxt{CSV.suffix}')

# Derived context file in CXT form: same name as CSV with a '.cxt' suffix.
CXT_CONTEXT = CSV.with_suffix('.cxt')


def read_episodes(path: os.PathLike | str, *,
                  dialect: csv.Dialect | type[csv.Dialect] | str = csv.excel):
    """Yield one ``Episode`` named tuple per data row of the CSV file at *path*.

    The first row is taken as the header. Its first two columns become ``str``
    fields; every remaining column becomes a ``bool`` field decoded from the
    cell values ``'0'`` and ``'1'``. Any other value in those columns raises
    ``KeyError``. *dialect* is passed through to ``csv.reader``.
    """
    decode = {'0': False, '1': True}.__getitem__
    with open(path, **OPEN_KWARGS) as csv_file:
        rows = csv.reader(csv_file, dialect=dialect)
        column_names = next(rows)
        # Field order follows the header: two text columns, then the flags.
        field_types = {name: str for name in column_names[:2]}
        field_types |= {name: bool for name in column_names[2:]}
        Episode = NamedTuple('Episode', field_types.items())
        for number, title, *cells in rows:
            yield Episode._make([number, title, *map(decode, cells)])


def write_csv(path: os.PathLike | str, rows, *,
              header: Optional[Sequence[str]] = None,
              dialect: csv.Dialect | type[csv.Dialect] | str = csv.excel) -> None:
    """Write *rows* as CSV to *path*, replacing any existing file content.

    When *header* is not ``None`` it is written as the first row, ahead of
    *rows*. *dialect* is passed through to ``csv.writer``.
    """
    with open(path, mode='w', **OPEN_KWARGS) as csv_file:
        out = csv.writer(csv_file, dialect=dialect)
        has_header = header is not None
        if has_header:
            out.writerow(header)
        out.writerows(rows)


def iter_cxt_lines(objects: Sequence[str],
                   attributes: Sequence[str],
                   bools: Sequence[Sequence[bool]]) -> Iterator[str]:
    """Yield the text lines of a CXT context, without line terminators.

    Emitted in order: ``'B'``, an empty line, the object count, the attribute
    count, an empty line, every object name, every attribute name, and finally
    one line per row of *bools* using ``'X'`` for true and ``'.'`` for false.

    *bools* must hold one row per object and every row must have one value per
    attribute; this is asserted when the first line is requested, because the
    function is a generator.
    """
    object_count = len(objects)
    assert object_count == len(bools)
    attribute_count = len(attributes)
    assert {attribute_count} == {len(row) for row in bools}

    # Fixed preamble: marker, blank, the two dimensions, blank.
    preamble = ('B', '', f'{object_count:d}', f'{attribute_count:d}', '')
    yield from preamble

    for names in (objects, attributes):
        yield from names

    symbol = {False: '.', True: 'X'}.__getitem__
    for row in bools:
        yield ''.join(map(symbol, row))


def write_lines(path: os.PathLike | str, lines: Iterable[str]) -> None:
    """Write every item of *lines* to the text file at *path*, one per line.

    The file is opened for writing with ``OPEN_KWARGS``, so existing content
    is replaced. Each item is followed by a newline (the ``print`` default).

    The builtin ``open()`` is used rather than ``path.open()`` so that plain
    ``str`` paths and arbitrary ``os.PathLike`` objects work as the signature
    promises, not only ``pathlib.Path`` instances.
    """
    with open(path, 'w', **OPEN_KWARGS) as f:
        for line in lines:
            print(line, file=f)


# One-time download of the raw data set; an existing local copy is reused.
if not CSV.exists():
    urllib.request.urlretrieve(URL, CSV)
    assert CSV.stat().st_size  # fail if the downloaded file is empty

# Build both derived context files unless each of them is already present.
if not (CSV_CONTEXT.exists() and CXT_CONTEXT.exists()):
    episodes = list(read_episodes(CSV))

    # Column names are the episode field names without TITLE (index 1).
    header = list(episodes[0]._fields)
    del header[1]

    # CSV form: the episode column followed by '' or 'X' per element.
    flags = {False: '', True: 'X'}
    write_csv(CSV_CONTEXT,
              ([number, *(flags[mark] for mark in marks)]
               for number, _title, *marks in episodes),
              header=header)

    # CXT form: first column values are the objects, the rest attributes.
    lines = iter_cxt_lines(objects=[row[0] for row in episodes],
                           attributes=header[1:],
                           bools=[list(row[2:]) for row in episodes])
    write_lines(CXT_CONTEXT, lines)

# Timed section: everything between here and the ``duration`` computation.
start = time.perf_counter_ns()

context = concepts.load_cxt(CXT_CONTEXT)

# Sanity checks on the size of the loaded context.
assert len(context.objects) == 403
assert len(context.properties) == 67

# NOTE(review): accessing ``.lattice`` presumably triggers the lattice
# construction this script benchmarks -- confirm against the concepts docs.
lattice = context.lattice

assert len(lattice) == 3_463

# Elapsed time, converted from nanoseconds to seconds.
duration = (time.perf_counter_ns() - start) / 1_000_000_000
print(duration)

# concepts 0.9.2, 2.2 GHz Intel i3-2330M CPU, 4GB RAM: 189s (PY2), 132s (PY3)
# concepts 0.10.dev0, 2.2 GHz Intel i3-2330M CPU, 4GB RAM: 32s
	"""Benchmark FCA concepts lattice generation with bob_ross.csv."""

	from collections.abc import Iterable, Iterator, Sequence
	import csv
	import os
	import pathlib
	import time
	from typing import NamedTuple, Optional
	import urllib.request

	import concepts

	# Remote location of the raw episode/element table.
	URL = ('https://raw.githubusercontent.com'
	       '/fivethirtyeight/data/master'
	       '/bob-ross/elements-by-episode.csv')

	# Local copy of the download, named after the last URL path segment.
	CSV = pathlib.Path(URL.rsplit('/', 1)[-1])

	# Keyword arguments shared by the text-mode open() calls in this script.
	OPEN_KWARGS = dict(encoding='ascii', newline='\n')

	# Derived context file in CSV form: stem of CSV + '-cxt' + suffix of CSV.
	CSV_CONTEXT = CSV.with_name(f'{CSV.stem}-cxt{CSV.suffix}')

	# Derived context file in CXT form: same name as CSV with a '.cxt' suffix.
	CXT_CONTEXT = CSV.with_suffix('.cxt')


	def read_episodes(path: os.PathLike | str, *,
	                  dialect: csv.Dialect | type[csv.Dialect] | str = csv.excel):
	    """Yield one ``Episode`` named tuple per data row of the CSV at *path*.

	    The first row is taken as the header. Its first two columns become
	    ``str`` fields; every remaining column becomes a ``bool`` field decoded
	    from the cell values ``'0'`` and ``'1'`` (any other value raises
	    ``KeyError``). *dialect* is passed through to ``csv.reader``.
	    """
	    flags = {'0': False, '1': True}
	    with open(path, **OPEN_KWARGS) as f:
	        reader = csv.reader(f, dialect=dialect)
	        header = next(reader)
	        # Two text columns first, then one boolean column per element.
	        fields = (dict.fromkeys(header[:2], str)
	                  | dict.fromkeys(header[2:], bool))
	        make_episode = NamedTuple('Episode', fields.items())._make
	        for episode, title, *elements in reader:
	            yield make_episode([episode, title]
	                               + [flags[e] for e in elements])


	def write_csv(path: os.PathLike | str, rows, *,
	              header: Optional[Sequence[str]] = None,
	              dialect: csv.Dialect | type[csv.Dialect] | str = csv.excel) -> None:
	    """Write *rows* as CSV to *path*, replacing any existing file content.

	    When *header* is not ``None`` it is written as the first row, ahead of
	    *rows*. *dialect* is passed through to ``csv.writer``.
	    """
	    with open(path, 'w', **OPEN_KWARGS) as f:
	        writer = csv.writer(f, dialect=dialect)
	        if header is not None:
	            writer.writerow(header)
	        writer.writerows(rows)


	def iter_cxt_lines(objects: Sequence[str],
	                   attributes: Sequence[str],
	                   bools: Sequence[Sequence[bool]]) -> Iterator[str]:
	    """Yield the text lines of a CXT context, without line terminators.

	    Emitted in order: ``'B'``, an empty line, the object count, the
	    attribute count, an empty line, every object name, every attribute
	    name, and one line per row of *bools* using ``'X'`` for true and
	    ``'.'`` for false.

	    *bools* must hold one row per object and every row must have one value
	    per attribute; this is asserted when the first line is requested,
	    because the function is a generator.
	    """
	    assert len(objects) == len(bools)
	    assert {len(attributes)} == set(map(len, bools))

	    yield 'B'
	    yield ''
	    yield f'{len(objects):d}'
	    yield f'{len(attributes):d}'
	    yield ''

	    yield from objects
	    yield from attributes

	    flags = {False: '.', True: 'X'}
	    for row in bools:
	        yield ''.join(flags[value] for value in row)


	def write_lines(path: os.PathLike | str, lines: Iterable[str]) -> None:
	    """Write every item of *lines* to the text file at *path*, one per line.

	    The file is opened for writing with ``OPEN_KWARGS``, so existing
	    content is replaced. Each item is followed by a newline (the ``print``
	    default).

	    The builtin ``open()`` is used rather than ``path.open()`` so that
	    plain ``str`` paths and arbitrary ``os.PathLike`` objects work as the
	    signature promises, not only ``pathlib.Path`` instances.
	    """
	    with open(path, 'w', **OPEN_KWARGS) as f:
	        for line in lines:
	            print(line, file=f)


	# One-time download of the raw data set; an existing local copy is reused.
	if not CSV.exists():
	    urllib.request.urlretrieve(URL, CSV)
	    assert CSV.stat().st_size  # fail if the downloaded file is empty

	# Build both derived context files unless each of them is already present.
	if not all(path.exists() for path in (CSV_CONTEXT, CXT_CONTEXT)):
	    episodes = list(read_episodes(CSV))
	    header = list(episodes[0]._fields)
	    header.pop(1)  # omit TITLE column

	    # CSV form: the episode column followed by '' or 'X' per element.
	    flags = {False: '', True: 'X'}
	    write_csv(CSV_CONTEXT,
	              ([episode] + [flags[b] for b in bools]
	               for episode, _, *bools in episodes),
	              header=header)

	    # CXT form: first column values are the objects, the rest attributes.
	    lines = iter_cxt_lines(objects=[e[0] for e in episodes],
	                           attributes=header[1:],
	                           bools=[bools for _, _, *bools in episodes])
	    write_lines(CXT_CONTEXT, lines)

	# Timed section: everything between here and the ``duration`` computation.
	start = time.perf_counter_ns()

	context = concepts.load_cxt(CXT_CONTEXT)

	# Sanity checks on the size of the loaded context.
	assert len(context.objects) == 403
	assert len(context.properties) == 67

	# NOTE(review): accessing ``.lattice`` presumably triggers the lattice
	# construction this script benchmarks -- confirm against the concepts docs.
	lattice = context.lattice

	assert len(lattice) == 3_463

	# Elapsed time, converted from nanoseconds to seconds.
	duration = (time.perf_counter_ns() - start) / 1_000_000_000
	print(duration)

	# concepts 0.9.2, 2.2 GHz Intel i3-2330M CPU, 4GB RAM: 189s (PY2), 132s (PY3)
	# concepts 0.10.dev0, 2.2 GHz Intel i3-2330M CPU, 4GB RAM: 32s