makmanalp/data_store.py

## data_store.py
"""
Simple filesystem organization scheme. You have:

    - Objects: A logical "thing", e.g. a document or a page, with unique IDs
    - Keys: A type of data that we're storing about the object, like the
    location of margins on a page, or the locations of each text box.
    - Files: For a specific object under a specific key, you can have multiple
    files, e.g. image files for each column in the page

Generally you might want to store data in a specific object's key:

  get_path("text-locations", object_id="page-771").
  # which is the same as:
  get_path("text-locations", object_id="page-771", file_name="DATA").

If you have a single file you don't have to specify the name, it will just get
that. If you have multiple files, you can refer to them specifically:

  get_path("text-locations", object_id="page-771", file_name="word-locations.csv").
  get_path("text-locations", object_id="page-771", file_name="line-locations.csv").

You might sometimes want to have a key that stores all the data for every
object in one single file together for performance reasons, e.g. one big csv
file. We provide this as a convenience:

  get_path("page-data", object_id=None, file_name="all-word-locations.csv")
  get_path("page-data", object_id=None)
  # which is the same as
  get_path("page-data", None, "DATA")

While it seems more convenient, the reason you might not want this latter case
is e.g. concurrency: if you're running 30 parallel tasks at the same time,
(especially on a network drive) it's easier to write results to 30 different
files than to manage read / write consistency for 30 threads in one single file
without some complicated architecture - especially for long running tasks
generating large files, often on network file systems where locking can be
problematic. Another reason is that there's no generic way to enumerate the
objects in that key (with `list_objects_having_key`), as the internal format of
the file is up to you.

One task is more often going to want to access many objects relating to one
key, (e.g. the raw text for all documents) so we store them together, rather
than storing all the keys of one object together, e.g.:

store/
    - locks/
    - data/
        - key1/
            - obj1/
                - a.txt
                - b.txt
            - obj2/
                - a.txt
                - b.txt
        - key2/
            - obj2/
                - xyz.hdf
            - obj3/
                - xyz.hdf
        - key3/
            - all.hdf


This also makes e.g. renaming or deleting a whole analysis step very easy -
just rename or delete the key folder.

TODO:
    - Is it better to have to remember file names for each key, or to default
    to DATA?? pros / cons. If we don't have one, how much more annoying is it?
    Then we also need an additional kwarg to handle getting directory path,
    which maybe is actually clearer. Having one makes it feel more like a k/v
    store though.

"""

import os


def get_path(key, object_id=None, file_name="DATA", store_path=None,
             makedirs=False):
    """Build the path of a key, an object, or a file inside the store.

    The full layout is ``store_path/key/object_id/file_name``; passing
    ``None`` for ``object_id`` and/or ``file_name`` leaves that component
    out:

    - ``object_id=None, file_name=None``: the directory of the key.
    - ``object_id=X, file_name=None``: the directory of object X in the key.
    - ``object_id=None, file_name=F``: file F directly under the key (the
      special case where all objects share one file instead of having
      their own sub-directories).
    - ``object_id=X, file_name=F``: file F of object X in the key.

    Args:
        key: Name of the key (first directory level under ``store_path``).
        object_id: ID of the object, or None for a key-level path.
        file_name: Name of the file, or None to get a directory path.
        store_path: Root directory of the store. Must be provided; the
            ``None`` default only exists so it can be passed by keyword.
        makedirs: If true, create the returned directory (directory
            request) or the parent directory of the returned file (file
            request) when missing. The file itself is never created.

    Returns:
        The joined path.

    Raises:
        TypeError: If ``store_path`` is None.
    """
    if store_path is None:
        # os.path.join would raise TypeError here anyway; fail with a message
        # that tells the caller what is actually missing.
        raise TypeError(
            "store_path is required: pass the root directory of the store"
        )

    parts = [store_path, key]
    if object_id is not None:
        parts.append(object_id)
    if file_name is not None:
        parts.append(file_name)
    path = os.path.join(*parts)

    if makedirs:
        # For a file request only the containing directory is created.
        directory = path if file_name is None else os.path.dirname(path)
        os.makedirs(directory, exist_ok=True)

    return path


def list_objects_having_key(key, store_path=None):
    """Return the IDs of all objects that have data stored under ``key``.

    Each sub-directory of the key's directory counts as one object, e.g.
    this gets all the object ids that have the style-data key. Plain files
    directly under the key (the shared-file case) are not objects and are
    skipped.

    Args:
        key: Name of the key to inspect.
        store_path: Root directory of the store, forwarded to ``get_path``.

    Returns:
        A list of object IDs (directory names) in the order ``os.scandir``
        yields them, or None if the key's directory does not exist.
    """
    key_path = get_path(key, file_name=None, store_path=store_path)

    # Must be os.path.exists: there is no os.exists, so the previous check
    # raised AttributeError on every call.
    if not os.path.exists(key_path):
        return None

    with os.scandir(key_path) as entries:
        return [entry.name for entry in entries if entry.is_dir()]
	"""
	Simple filesystem organization scheme. You have:

	- Objects: A logical "thing", e.g. a document or a page, with unique IDs
	- Keys: A type of data that we're storing about the object, like the
	location of margins on a page, or the locations of each text box.
	- Files: For a specific object under a specific key, you can have multiple
	files, e.g. image files for each column in the page

	Generally you might want to store data in a specific object's key:

	get_path("text-locations", object_id="page-771").
	# which is the same as:
	get_path("text-locations", object_id="page-771", file_name="DATA").

	If you have a single file you don't have to specify the name, it will just get
	that. If you have multiple files, you can refer to them specifically:

	get_path("text-locations", object_id="page-771", file_name="word-locations.csv").
	get_path("text-locations", object_id="page-771", file_name="line-locations.csv").

	You might sometimes want to have a key that stores all the data for every
	object in one single file together for performance reasons, e.g. one big csv
	file. We provide this as a convenience:

	get_path("page-data", object_id=None, file_name="all-word-locations.csv")
	get_path("page-data", object_id=None)
	# which is the same as
	get_path("page-data", None, "DATA")

	While it seems more convenient, the reason you might not want this latter case
	is e.g. concurrency: if you're running 30 parallel tasks at the same time,
	(especially on a network drive) it's easier to write results to 30 different
	files than to manage read / write consistency for 30 threads in one single file
	without some complicated architecture - especially for long running tasks
	generating large files, often on network file systems where locking can be
	problematic. Another reason is that there's no generic way to enumerate the
	objects in that key (with `list_objects_having_key`), as the internal format of
	the file is up to you.

	One task is more often going to want to access many objects relating to one
	key, (e.g. the raw text for all documents) so we store them together, rather
	than storing all the keys of one object together, e.g.:

	store/
	    - locks/
	    - data/
	        - key1/
	            - obj1/
	                - a.txt
	                - b.txt
	            - obj2/
	                - a.txt
	                - b.txt
	        - key2/
	            - obj2/
	                - xyz.hdf
	            - obj3/
	                - xyz.hdf
	        - key3/
	            - all.hdf


	This also makes e.g. renaming or deleting a whole analysis step very easy -
	just rename or delete the key folder.

	TODO:
	- Is it better to have to remember file names for each key, or to default
	to DATA?? pros / cons. If we don't have one, how much more annoying is it?
	Then we also need an additional kwarg to handle getting directory path,
	which maybe is actually clearer. Having one makes it feel more like a k/v
	store though.

	"""

	import os


	def get_path(key, object_id=None, file_name="DATA", store_path=None,
	             makedirs=False):
	    """Build the path of a key, an object, or a file inside the store.

	    The full layout is ``store_path/key/object_id/file_name``; passing
	    ``None`` for ``object_id`` and/or ``file_name`` leaves that component
	    out:

	    - ``object_id=None, file_name=None``: the directory of the key.
	    - ``object_id=X, file_name=None``: the directory of object X in the key.
	    - ``object_id=None, file_name=F``: file F directly under the key (the
	      special case where all objects share one file instead of having
	      their own sub-directories).
	    - ``object_id=X, file_name=F``: file F of object X in the key.

	    Args:
	        key: Name of the key (first directory level under ``store_path``).
	        object_id: ID of the object, or None for a key-level path.
	        file_name: Name of the file, or None to get a directory path.
	        store_path: Root directory of the store. Must be provided; the
	            ``None`` default only exists so it can be passed by keyword.
	        makedirs: If true, create the returned directory (directory
	            request) or the parent directory of the returned file (file
	            request) when missing. The file itself is never created.

	    Returns:
	        The joined path.

	    Raises:
	        TypeError: If ``store_path`` is None.
	    """
	    if store_path is None:
	        # os.path.join would raise TypeError here anyway; fail with a
	        # message that tells the caller what is actually missing.
	        raise TypeError(
	            "store_path is required: pass the root directory of the store"
	        )

	    parts = [store_path, key]
	    if object_id is not None:
	        parts.append(object_id)
	    if file_name is not None:
	        parts.append(file_name)
	    path = os.path.join(*parts)

	    if makedirs:
	        # For a file request only the containing directory is created.
	        directory = path if file_name is None else os.path.dirname(path)
	        os.makedirs(directory, exist_ok=True)

	    return path


	def list_objects_having_key(key, store_path=None):
	    """Return the IDs of all objects that have data stored under ``key``.

	    Each sub-directory of the key's directory counts as one object, e.g.
	    this gets all the object ids that have the style-data key. Plain files
	    directly under the key (the shared-file case) are not objects and are
	    skipped.

	    Args:
	        key: Name of the key to inspect.
	        store_path: Root directory of the store, forwarded to ``get_path``.

	    Returns:
	        A list of object IDs (directory names) in the order ``os.scandir``
	        yields them, or None if the key's directory does not exist.
	    """
	    key_path = get_path(key, file_name=None, store_path=store_path)

	    # Must be os.path.exists: there is no os.exists, so the previous check
	    # raised AttributeError on every call.
	    if not os.path.exists(key_path):
	        return None

	    with os.scandir(key_path) as entries:
	        return [entry.name for entry in entries if entry.is_dir()]