Created
October 22, 2020 07:31
-
-
Save honno/baf7b8fe6e0bdd58b41ac792b4e7d5ea to your computer and use it in GitHub Desktop.
coinflip's old store module
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Store functionality for the CLI | |
Notes | |
----- | |
A store is an abstraction for a folder in the user's local data directory | |
which pertains to a specific dataset that comprises RNG output. The store can | |
subsequently store test results and report markup for said results. | |
""" | |
import pickle | |
import shelve | |
from contextlib import contextmanager | |
from dataclasses import dataclass | |
from datetime import datetime | |
from os import scandir | |
from pathlib import Path | |
from time import sleep | |
from typing import Dict | |
from warnings import warn | |
import numpy as np | |
import pandas as pd | |
from appdirs import AppDirs | |
from slugify import slugify | |
from coinflip._randtests.common.exceptions import NonBinarySequenceError | |
from coinflip._randtests.common.result import TestResult | |
# Public names of this module: these are what ``from <module> import *`` binds.
__all__ = [
    "TYPES",
    "data_dir",
    "DataParsingError",
    "parse_data",
    "StoreError",
    "init_store",
    "store_data",
    "NoLatestStoreRecordedError",
    "find_latest_store",
    "get_data",
    "drop",
    "list_stores",
    "store_result",
    "store_results",
    "open_results",
]
# Resolve the per-user data directory of the application through appdirs
dirs = AppDirs(appname="coinflip", appauthor="MatthewBarber")
data_dir = Path(dirs.user_data_dir)

# Create local data directory if it does not already exist.
# ``exist_ok=True`` tolerates an existing directory, but still raises
# FileExistsError when the path is occupied by something that is not a
# directory, instead of silently carrying on with an unusable data directory.
data_dir.mkdir(parents=True, exist_ok=True)

# File kept directly in ``data_dir`` recording the last initialised store's name
LATEST_STORE_FNAME = "latest_store.txt"
# File inside a store folder holding the pickled data Series
DATA_FNAME = "series.pickle"
# Base filename of a store's results shelf; the dbm backend used by shelve may
# append a suffix such as .db to the filepath
RESULTS_FNAME = "results"
# ------------------------------------------------------------------------------ | |
# Store initialisation | |
# Maps the dtype names accepted as ``dtype_str`` to their NumPy scalar types
TYPES = dict(
    bool=np.bool_,
    byte=np.byte,
    short=np.int16,
    int=np.int32,
    long=np.int64,
    float=np.float32,
    double=np.float64,
)
class DataParsingError(ValueError):
    """Common ancestor of the errors raised while parsing data"""
@dataclass
class TypeNotRecognizedError(DataParsingError):
    """Raised when a dtype string representation is not a key of ``TYPES``"""

    # The unrecognised dtype string that was supplied
    dtype: str

    def __str__(self):
        valid_names = ", ".join(TYPES)
        complaint = f"{self.dtype} is not a recognised data type\n"
        return complaint + f"Valid types: {valid_names}"
@dataclass
class MultipleColumnsError(DataParsingError):
    """Raised when parsed data has more columns than the single one expected"""

    # Number of columns that were actually found
    ncols: int

    def __str__(self):
        message = (
            f"Parsed data contains {self.ncols} columns, but only 1 column was expected"
        )
        return message
def parse_data(data_file, dtype_str=None) -> pd.Series:
    """Read a file of RNG output into a pandas Series

    The file is read as header-less CSV and must hold a single column. The
    dtype of the resulting Series is taken from ``dtype_str`` when given,
    otherwise it is inferred from the data itself.

    Parameters
    ----------
    data_file : file-like object
        File containing RNG output
    dtype_str : ``str``, optional
        String representation of the desired dtype (a key of ``TYPES``). If
        not supplied, the dtype is inferred from the data.

    Returns
    -------
    ``Series``
        A pandas ``Series`` which represents the data

    Raises
    ------
    MultipleColumnsError
        If the data contains multiple values per line
    NonBinarySequenceError
        If the sequence does not contain exactly 2 distinct values
    TypeNotRecognizedError
        If ``dtype_str`` is supplied but is not a recognised dtype name

    See Also
    --------
    pandas.read_csv : The pandas method for reading ``data_file``
    store_data : Calls this method, and handles subsequent storage of data
    """
    frame = pd.read_csv(data_file, header=None)

    column_count = len(frame.columns)
    if column_count > 1:
        raise MultipleColumnsError(column_count)

    values = frame.iloc[:, 0]
    if values.nunique() != 2:
        raise NonBinarySequenceError()

    # No explicit dtype requested: let pandas pick one
    if dtype_str is None:
        return values.infer_objects()

    try:
        requested_dtype = TYPES[dtype_str]
    except KeyError as e:
        raise TypeNotRecognizedError(dtype_str) from e

    return values.astype(requested_dtype)
@dataclass
class StoreError(Exception):
    """Common ancestor of store-related errors, carrying the store's name"""

    # Name of the store the error relates to
    store_name: str
class StoreExistsError(StoreError, FileExistsError):
    """Raised when a store was assumed not to exist but does"""

    def __str__(self):
        headline = f"'{self.store_name}' already exists\n"
        hint = "Use the --overwrite flag to write over this store"
        return headline + hint
class NameConflictError(StoreError, FileExistsError):
    """Raised when no unique store name could be generated"""

    def __str__(self):
        message = f"Generated name '{self.store_name}' conflicted with existing store"
        return message
def init_store(name=None, overwrite=False):
    """Creates store in local data

    A name supplied or generated is used to initialise a store. If supplied,
    the name is sanitised to remove invalid characters for filepaths. If
    generated (because no name was supplied, or the supplied name sanitises
    to nothing), the name will be a UTC timestamp of initialisation.

    Parameters
    ----------
    name : ``str``, optional
        Desired name of the store, which will be sanitised. If not supplied, a
        name is generated automatically.
    overwrite : ``boolean``, default ``False``
        If a name conflicts with an existing store, this decides whether to
        overwrite it.

    Returns
    -------
    store_name : ``str``
        Internal name of the initialised store
    store_path : ``Path``
        Path of the initialised store

    Raises
    ------
    NameConflictError
        If attempts at generating a unique name fails
    StoreExistsError
        If a store of the same name exists already (and overwrite is set to
        ``False``)

    Warns
    -----
    UserWarning
        If the supplied name had to be altered, or could not be used at all

    See Also
    --------
    store_data : Parses data and calls this method, to then save data in store
    """
    store_name = None

    if name:
        store_name = slugify(name, separator="_")

        if not store_name:
            # An empty slug would make the store path the data directory
            # itself; with ``overwrite`` set that would delete every store.
            warn(
                f"Name '{name}' has no usable characters, generating a name instead",
                UserWarning,
            )
        elif store_name != name:
            warn(f"Name encoded as {store_name}", UserWarning)

    if not store_name:
        store_name = _generate_store_name()
        print(f"Store name to be encoded as {store_name}")

    store_path = data_dir / store_name

    try:
        store_path.mkdir(parents=True)
    except FileExistsError as e:
        if not overwrite:
            raise StoreExistsError(store_name) from e

        # Start the store afresh: remove the old folder and recreate it empty
        rm_tree(store_path)
        store_path.mkdir()

    return store_name, store_path


def _generate_store_name(attempts=3, delay=1.5):
    """Generate a timestamp-based store name that is not already in use"""
    from datetime import timezone

    for attempt in range(attempts):
        # The trailing "Z" designates UTC, so the timestamp is taken in UTC
        timestamp = datetime.now(timezone.utc)
        iso8601 = timestamp.strftime("%Y%m%dT%H%M%SZ")
        store_name = f"store_{iso8601}"

        if store_name not in list_stores():
            return store_name

        # Names have one-second resolution, so wait for the clock to move on
        # before retrying; sleeping after the final attempt would be pointless
        if attempt < attempts - 1:
            sleep(delay)

    raise NameConflictError(store_name)
def store_data(data_file, name=None, dtype_str=None, overwrite=False):
    """Load and parse RNG output, serialised to a local data directory

    Reads from file containing RNG output and produces a representative pandas
    Series. The appropriate dtype is inferred from the data itself, or
    optionally from the supplied ``dtype_str``.

    A name supplied or generated is used to initialise a store. If supplied,
    the name is sanitised to remove invalid characters for filepaths. If
    generated, the name will be a timestamp of initialisation.

    The representative Series is renamed to the store's name and serialised
    using Python's pickle module, saved in the initialised store.

    The store's name is also written to a file in the user data directory, to
    be accessed later when identifying the last initialised store.

    Parameters
    ----------
    data_file : file-like object
        File containing RNG output
    name : ``str``, optional
        Desired name of the store, which will be sanitised. If not supplied, a
        name is generated automatically.
    dtype_str : ``str``, optional
        String representation of desired dtype. If not supplied, it is inferred
        from the data.
    overwrite : ``bool``, default ``False``
        If a name conflicts with an existing store, this decides whether to
        overwrite it.

    Raises
    ------
    TypeNotRecognizedError
        If supplied dtype_str does not recognise a dtype
    MultipleColumnsError
        If inputted data contains multiple values per line
    NonBinarySequenceError
        If sequence does not contain only 2 values
    NameConflictError
        If attempts at generating a unique name fails
    StoreExistsError
        If a store of the same name exists already (and overwrite is set to
        ``False``)

    See Also
    --------
    parse_data : Loads and parses ``data_file``
    init_store : Initialises the store
    find_latest_store : Accesses the name of the last initialised store
    """
    # Parse first, so that no store is created for data that cannot be used
    series = parse_data(data_file, dtype_str)

    store_name, store_path = init_store(name=name, overwrite=overwrite)
    series = series.rename(store_name)

    # The context manager closes (and flushes) the pickle file deterministically
    data_path = store_path / DATA_FNAME
    with open(data_path, "wb") as f:
        pickle.dump(series, f)

    latest_store_path = data_dir / LATEST_STORE_FNAME
    with open(latest_store_path, "w") as f:
        f.write(store_name)

    print("Data stored successfully!")
# ------------------------------------------------------------------------------ | |
# Store interaction | |
class NoLatestStoreRecordedError(LookupError):
    """Raised when the last initialised store cannot be identified"""

    def __str__(self):
        message = "No record of the last initialised store was found"
        return message
def find_latest_store() -> str:
    """Find out the last initialised store

    A file is kept in the root user data directory to record the last
    initialised store's name, which this method reads to identify the store.

    Returns
    -------
    store_name : ``str``
        Name of the last initialised store

    Raises
    ------
    NoLatestStoreRecordedError
        When no last initialised store is found, i.e. the record file is
        missing or empty, or the recorded store no longer exists
    """
    latest_store_path = data_dir / LATEST_STORE_FNAME

    try:
        with open(latest_store_path) as f:
            # readline() gives "" for an empty file rather than raising, and
            # strip() drops any trailing newline left in the record
            store_name = f.readline().strip()
    except FileNotFoundError:
        store_name = ""

    if store_name and store_name in list_stores():
        return store_name

    raise NoLatestStoreRecordedError()
class StoreNotFoundError(StoreError, FileNotFoundError):
    """Raised when the requested store does not exist"""

    def __str__(self):
        message = f"'{self.store_name}' does not exist"
        return message
class DataNotFoundError(StoreError, FileNotFoundError):
    """Raised when the requested store holds no data"""

    def __str__(self):
        message = f"'{self.store_name}' contains no data"
        return message
def get_data(store_name) -> pd.Series:
    """Load the data held by a store

    Parameters
    ----------
    store_name : ``str``
        Name of the store

    Returns
    -------
    ``Series``
        A pandas ``Series`` which represents the data

    Raises
    ------
    StoreNotFoundError
        If requested store does not exist
    DataNotFoundError
        If requested store has no data
    """
    store_dir = data_dir / store_name
    if not store_dir.exists():
        raise StoreNotFoundError(store_name)

    pickled_series_path = store_dir / DATA_FNAME

    # NOTE: unpickling executes whatever the file encodes, so the store
    # folder has to be trusted
    try:
        with open(pickled_series_path, "rb") as stream:
            return pickle.load(stream)
    except FileNotFoundError as e:
        raise DataNotFoundError(store_name) from e
def drop(store_name):
    """Remove store from local data

    Parameters
    ----------
    store_name : ``str``
        Name of store to remove

    Raises
    ------
    StoreNotFoundError
        If requested store does not exist. This is a ``FileNotFoundError``
        subclass, which is what removing a missing store raised previously.
    """
    # Only names reported by list_stores() are folders directly inside the
    # data directory. Checking against them rejects missing stores as well as
    # names such as "" or "..", which would otherwise resolve to the data
    # directory itself (or its parent) and be deleted wholesale.
    if store_name not in list_stores():
        raise StoreNotFoundError(store_name)

    store_path = data_dir / store_name
    rm_tree(store_path)
def list_stores():
    """List all stores in local data

    Yields
    ------
    ``str``
        Name of each directory found directly inside the data directory.
        Nothing is yielded if the data directory does not exist.
    """
    try:
        # Using scandir() as a context manager closes the directory handle
        # even when the caller stops iterating early (as a membership test
        # like ``name in list_stores()`` does on its first match)
        with scandir(data_dir) as entries:
            for entry in entries:
                if entry.is_dir():
                    yield entry.name
    except FileNotFoundError:
        pass
def store_result(store_name, randtest_name, result: TestResult):
    """Save the result of one statistical test in a store

    An existing result stored under the same test name is replaced.

    Parameters
    ----------
    store_name : ``str``
        Name of store to save result in
    randtest_name : ``str``
        Name of statistical test the result came from
    result : ``TestResult``
        Result of the statistical test

    See Also
    --------
    store_results : Store multiple results from multiple statistical tests
    """
    with open_results(store_name) as stored:
        stored[randtest_name] = result

    # TODO logging or warning for overwritten results
def store_results(store_name, results_dict: Dict[str, TestResult]):
    """Save the results of several statistical tests in a store

    The results of the store are opened once, and each entry of
    ``results_dict`` is written under its test name.

    Parameters
    ----------
    store_name : ``str``
        Name of store to save result in
    results_dict : ``Dict[str, TestResult]``
        Mapping of statistical tests to their respective results

    See Also
    --------
    store_result : Store a single result from a single statistical test
    """
    with open_results(store_name) as stored:
        for test_name, test_result in results_dict.items():
            stored[test_name] = test_result
@contextmanager
def open_results(store_name):
    """Context manager to read/write results of a store

    Parameters
    ----------
    store_name : ``str``
        Name of store to access results in

    Yields
    ------
    results : ``Dict[str, TestResult]``
        Previously stored results of statistical tests, as the dict-like
        shelf opened from the store's results file. The shelf is closed when
        the context exits.

    Raises
    ------
    StoreNotFoundError
        If requested store does not exist
    """
    store_path = data_dir / store_name
    if not store_path.exists():
        # StoreError requires the store name; omitting it raised a TypeError
        # instead of the documented StoreNotFoundError
        raise StoreNotFoundError(store_name)

    results_path = store_path / RESULTS_FNAME
    with open_shelve(results_path) as results:
        yield results
# ------------------------------------------------------------------------------ | |
# Helpers | |
def rm_tree(path: Path):
    """Recursively remove a directory along with the files and folders in it

    Symbolic links found inside the tree are removed themselves and never
    followed, so nothing outside of ``path`` gets deleted.

    Parameters
    ----------
    path : ``Path``
        Directory to remove
    """
    for child in path.iterdir():
        if child.is_dir() and not child.is_symlink():
            rm_tree(child)
        else:
            # Regular files, plus symlinks of any kind (including broken ones
            # and links to directories), are unlinked rather than descended into
            child.unlink()

    path.rmdir()
def open_shelve(path):
    """Open a shelf with ``shelve.open`` from a pathlib ``Path``

    The path is converted to its string form before being handed over.
    """
    return shelve.open(str(path))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment