# jlmelville/read-and-pickle.py

## read-and-pickle.py
import os
import pickle
from pathlib import Path


def read_data(dataset, suffix=None, dir=None, repickle=False):
    """Load a dataset, caching the parsed CSV as a pickle beside it.

    The file stem is `dataset`, or `{dataset}-{suffix}` when `suffix` is
    given. If `<stem>.pickle` exists in the dataset directory and `repickle`
    is false, the unpickled object is returned. Otherwise `<stem>.csv` is
    read with `pandas.read_csv`, written to `<stem>.pickle`, and returned.

    Set `repickle=True` to rewrite pickle file if CSV has changed.

    Args:
        dataset: Base name of the dataset files, without extension.
        suffix: Optional tag appended to the base name after a hyphen.
        dir: Directory holding the files; defaults to `~/dev/datasets`.
        repickle: When true, ignore any existing pickle and rebuild it
            from the CSV.

    Returns:
        The object loaded from the pickle, or the result of
        `pandas.read_csv` when the CSV had to be parsed.

    Raises:
        FileNotFoundError: If the CSV is needed but does not exist.
    """
    if dir is None:
        # Only resolve the home directory when the default location is used.
        dataset_path = os.path.join(str(Path.home()), "dev", "datasets")
    else:
        dataset_path = dir

    if suffix is not None:
        dataset_basename = f"{dataset}-{suffix}"
    else:
        dataset_basename = dataset
    pickle_path = os.path.join(dataset_path, f"{dataset_basename}.pickle")

    if os.path.exists(pickle_path) and not repickle:
        # NOTE: unpickling runs code embedded in the file; only point this at
        # cache files you created yourself.
        with open(pickle_path, "rb") as f:
            return pickle.load(f)

    csv_path = os.path.join(dataset_path, f"{dataset_basename}.csv")
    if not os.path.exists(csv_path):
        raise FileNotFoundError(csv_path)

    # pandas is not imported at file level, so bring it into scope here;
    # otherwise `pd` is an undefined name on this code path.
    import pandas as pd

    data = pd.read_csv(csv_path)
    with open(pickle_path, "wb") as f:
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
    return data


def read_dataxy(dataset, repickle=False, dir=None):
    """Read X and Y (label) data from separate files.

    Loads `{dataset}-x` and `{dataset}-y` through `read_data`. If loading
    the Y data raises `FileNotFoundError`, `range(x.shape[0])` is used as
    the labels instead.

    Args:
        dataset: Base name shared by the X and Y files.
        repickle: Forwarded to `read_data` for both files.
        dir: Directory holding the files, forwarded to `read_data`;
            `None` keeps that function's default location.

    Returns:
        A tuple `(x, y)`.

    Raises:
        FileNotFoundError: If the X data cannot be found.
    """
    x = read_data(dataset, "x", dir=dir, repickle=repickle)
    try:
        y = read_data(dataset, "y", dir=dir, repickle=repickle)
    except FileNotFoundError:
        # No label file: fall back to one index per row of x.
        y = range(x.shape[0])
    return (x, y)
	import os
	import pickle
	from pathlib import Path


	def read_data(dataset, suffix=None, dir=None, repickle=False):
	    """Load a dataset, caching the parsed CSV as a pickle beside it.

	    The file stem is `dataset`, or `{dataset}-{suffix}` when `suffix` is
	    given. If `<stem>.pickle` exists in the dataset directory and
	    `repickle` is false, the unpickled object is returned. Otherwise
	    `<stem>.csv` is read with `pandas.read_csv`, written to
	    `<stem>.pickle`, and returned.

	    Set `repickle=True` to rewrite pickle file if CSV has changed.

	    Args:
	        dataset: Base name of the dataset files, without extension.
	        suffix: Optional tag appended to the base name after a hyphen.
	        dir: Directory holding the files; defaults to `~/dev/datasets`.
	        repickle: When true, ignore any existing pickle and rebuild it
	            from the CSV.

	    Returns:
	        The object loaded from the pickle, or the result of
	        `pandas.read_csv` when the CSV had to be parsed.

	    Raises:
	        FileNotFoundError: If the CSV is needed but does not exist.
	    """
	    if dir is None:
	        # Only resolve the home directory when the default location is used.
	        dataset_path = os.path.join(str(Path.home()), "dev", "datasets")
	    else:
	        dataset_path = dir

	    if suffix is not None:
	        dataset_basename = f"{dataset}-{suffix}"
	    else:
	        dataset_basename = dataset
	    pickle_path = os.path.join(dataset_path, f"{dataset_basename}.pickle")

	    if os.path.exists(pickle_path) and not repickle:
	        # NOTE: unpickling runs code embedded in the file; only point this
	        # at cache files you created yourself.
	        with open(pickle_path, "rb") as f:
	            return pickle.load(f)

	    csv_path = os.path.join(dataset_path, f"{dataset_basename}.csv")
	    if not os.path.exists(csv_path):
	        raise FileNotFoundError(csv_path)

	    # pandas is not imported at file level, so bring it into scope here;
	    # otherwise `pd` is an undefined name on this code path.
	    import pandas as pd

	    data = pd.read_csv(csv_path)
	    with open(pickle_path, "wb") as f:
	        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
	    return data


	def read_dataxy(dataset, repickle=False, dir=None):
	    """Read X and Y (label) data from separate files.

	    Loads `{dataset}-x` and `{dataset}-y` through `read_data`. If loading
	    the Y data raises `FileNotFoundError`, `range(x.shape[0])` is used as
	    the labels instead.

	    Args:
	        dataset: Base name shared by the X and Y files.
	        repickle: Forwarded to `read_data` for both files.
	        dir: Directory holding the files, forwarded to `read_data`;
	            `None` keeps that function's default location.

	    Returns:
	        A tuple `(x, y)`.

	    Raises:
	        FileNotFoundError: If the X data cannot be found.
	    """
	    x = read_data(dataset, "x", dir=dir, repickle=repickle)
	    try:
	        y = read_data(dataset, "y", dir=dir, repickle=repickle)
	    except FileNotFoundError:
	        # No label file: fall back to one index per row of x.
	        y = range(x.shape[0])
	    return (x, y)