Skip to content

Instantly share code, notes, and snippets.

@yatharthb97
Last active April 12, 2023 23:05
Show Gist options
  • Save yatharthb97/1110a1924d8d70ef8ad444f12a952de6 to your computer and use it in GitHub Desktop.
Save yatharthb97/1110a1924d8d70ef8ad444f12a952de6 to your computer and use it in GitHub Desktop.
Object that is specialized to collect, store, and export metadata for any generic application.
#!/usr/bin/env python3
#!/usr/bin/env python
"""
Metadata aggregator
Object that is specialized to collect, store, and export metadata for any generic application.
Author : *Yatharth Bhasin* (Github → yatharthb97)
License: *MIT open-source license* (https://opensource.org/licenses/mit-license.php)
This piece of software was released on GistHub : TODO
"""
def DescriptorGenerator():
    """
    Endless generator of fallback metadata keys.

    Yields 'descriptor-1', 'descriptor-2', ... in sequence. Used to name a
    metadatum when the user did not supply a key for it.
    """
    counter = 1
    while True:
        yield f"descriptor-{counter}"
        counter += 1
class MetaData:
    """
    Collects, stores, and exports metadata for a generic application.

    Metadata is gathered with `collect()` (optionally through an interactive
    Q & A on the terminal) and with `add()`. It can then be condensed into a
    file-system-friendly name with `file_descriptor()` or written to disk
    with `metadata_file()`.
    """

    default_header = "| meta-data collector >"  # Default header
    default_no_input_msg = "Okay! Keep your secrets!"  # Default no input message

    def __init__(self):
        """
        Constructor. Creates an empty metadata structure with default settings.
        """
        self.metadata = {}           # Stores metadata
        self.datetime = {}           # Stores datetime metadata
        self.questions = []          # List of extra questions to be asked to the user
        self.include_weekday = True  # Include the weekday explicitly in the metadata
        self.fd_restrict = None      # Max number of metadata values used by `file_descriptor()`
        self.collected = False       # Indicates whether the collection cycle is complete
        self.header = self.default_header              # Prefix of every printed line / prompt
        self.no_input_msg = self.default_no_input_msg  # Message when no input is given
        self.no_input_str = f"{self.header} {self.no_input_msg}"  # Header + no input message
        self.descriptor = DescriptorGenerator()        # Supplies keys for key-less entries

    def set_header(self, hstring="", name=None):
        """
        Changes the header string to the passed `hstring`.

        hstring (optional) : New header text. If empty, the text of the current
                             header (without its "| " / " >" decoration) is reused.
        name (optional)    : If given, "<name>'s " is put in front of the header text.
        """
        if hstring == "":
            hstring = self.header.replace("| ", "")
            hstring = hstring.replace(" >", "")
        suffix = ""
        if name is not None:
            suffix = f"{name}'s "
        self.header = f"| {suffix}{hstring} >"
        # The cached "no input" line embeds the header, so it must follow it.
        self.no_input_str = f"{self.header} {self.no_input_msg}"

    def collect(self, user_interactive=True, username_header=False):
        """
        Collects the metadata, optionally in a user-interactive Q & A.

        The current date and time are always recorded. When `user_interactive`
        is set, the user is asked for a name, for every question registered
        with `add_question()`, and for a list of free-form key words.

        user_interactive (optional) : Interact with user to collect information (blocking).
        username_header (optional)  : Change the header to include the user's name.
        """
        # Collect date and time
        import datetime

        now = datetime.datetime.now()
        self.datetime["year"] = now.year
        self.datetime["month"] = now.month
        self.datetime["day"] = now.day
        self.datetime["hour"] = now.hour
        self.datetime["min"] = now.minute
        self.datetime["weekday"] = now.strftime('%A')
        self.datetime["rep"] = str(now)
        self.metadata["date"] = f"{self.datetime['day']}-{self.datetime['month']}-{self.datetime['year']}"
        if self.include_weekday:
            self.metadata["weekday"] = self.datetime["weekday"][:3]

        if user_interactive:
            # Collect user name
            user = input(f"{self.header} What is your name? → ")
            if user != "":
                if username_header:
                    self.set_header(name=user)
                print(f"{self.header} Hello! {user}.")
                self.metadata['user'] = user
            else:
                print(self.no_input_str)

            # Ask pre-decided Q & A
            for que in self.questions:
                ans = input(f"{self.header} {que} → ")
                if ans != "":
                    print(f"{self.header} recorded: {ans}")
                    self.metadata[que] = ans
                else:
                    print(self.no_input_str)

            # Collect additional experiment metadata
            meta_list_str = input(f"{self.header} [optional] Describe your setup with a few key words (comma-seperated): ")
            if meta_list_str != "":
                # Spaces are removed; empty items (e.g. from "a,,b") are dropped
                # so that no blank metadatum is stored.
                meta_list = [word for word in meta_list_str.replace(" ", "").split(",") if word]
                print(f"{self.header} Info collected: {meta_list}")
                for word in meta_list:
                    self.metadata[next(self.descriptor)] = word
            else:
                print(self.no_input_str)

        # Collection is complete
        self.collected = True

    def file_descriptor(self):
        """
        Returns a string constructed from the metadata values that can be used
        as a directory or file name.

        The values are joined with '_' in insertion order. If `fd_restrict` is
        set, only the first `fd_restrict` values are used. Periods are removed,
        spaces become '_', and the characters <>/{}[]~`\\, become '-'.
        """
        if not self.collected:
            print(f"{self.header} Warning: No meta-data collection has occured. Generating file descriptor anyway.")

        md_list = list(self.metadata.values())
        # Finite elements: slicing already clamps to the number of available values
        if self.fd_restrict is not None:
            md_list = md_list[:self.fd_restrict]

        # Joining (instead of appending '_' and trimming afterwards) guarantees
        # that no character of the last value is cut off.
        fd_str = "_".join(str(md) for md in md_list)

        # String sanitization
        fd_str = fd_str.replace(".", "")   # Period
        fd_str = fd_str.replace(" ", "_")  # Space
        for char in "<>/{}[]~`\\,":        # Generic sanitization
            fd_str = fd_str.replace(char, '-')
        return fd_str

    def metadata_file(self, filename, json=False, yaml=False):
        """
        Writes the collected metadata and datetime information to `filename`.

        The output format is chosen as follows: a `.json` or `.yaml` file
        extension decides first; otherwise the `json` / `yaml` flag decides
        (and the matching extension is appended to `filename`); otherwise the
        string form of the python dictionary is written ("ascii" mode).

        json (optional) : Request a json file.
        yaml (optional) : Request a yaml file (requires the `yaml` package).

        Raises `ValueError` if both `json` and `yaml` are `True`.
        """
        if not self.collected:
            print(f"{self.header} Warning: No meta-data collection has occured. Generating meta-data file anyway.")
        if json and yaml:
            raise ValueError("For meta-data file generation - conflicting options passed.")

        if filename.endswith(".json"):
            mode = "json"
        elif filename.endswith(".yaml"):
            mode = "yaml"
        elif json:
            mode = "json"
        elif yaml:
            mode = "yaml"
        else:
            mode = "ascii"
        if mode != "ascii" and not filename.endswith(f".{mode}"):
            filename += f".{mode}"

        metadata = self.metadata.copy()
        metadata.pop('-', None)  # An entry stored under the key '-' is never exported
        data = {"metadata": metadata, "time": self.datetime}

        # The modules are imported under aliases because the parameters
        # `json` and `yaml` occupy the plain names inside this method.
        if mode == "json":
            import json as json_lib
            with open(filename, 'w', encoding="utf-8") as file:
                json_lib.dump(data, file, indent=4)
        elif mode == "yaml":
            import yaml as yaml_lib
            with open(filename, 'w', encoding="utf-8") as file:
                yaml_lib.dump(data, file, indent=4)
        else:
            with open(filename, 'w', encoding="utf-8") as file:
                file.write(str(data))

    def add_question(self, q_str):
        """
        Add a question that will be asked to the user
        during a `collect(user_interactive=True)` call.
        Empty questions are ignored.
        """
        if q_str != "":
            self.questions.append(q_str)

    def add(self, key, value):
        """
        Add a key and value pair to the metadata.
        An empty `value` is ignored; an empty `key` is replaced by a generated one.
        """
        if value != "":
            if key == "":
                key = next(self.descriptor)
            self.metadata[key] = value

    def reset(self):
        """
        Resets the metadata structure and all settings to their initial state.
        """
        self.header = self.default_header
        self.no_input_msg = self.default_no_input_msg
        # Rebuilt as well, otherwise it keeps showing a previously set header.
        self.no_input_str = f"{self.header} {self.no_input_msg}"
        self.metadata = {}
        self.datetime = {}
        self.questions = []
        self.collected = False
        self.include_weekday = True
        self.fd_restrict = None
        self.descriptor = DescriptorGenerator()
#!/usr/bin/env python3
#!/usr/bin/env python
"""
Metadata aggregator
-------------------
Object that is specialized to collect, store, and export metadata for any generic application.
Author : *Yatharth Bhasin* (Github → yatharthb97)
License: *MIT open-source license* (https://opensource.org/licenses/mit-license.php)
Older version [deprecated] – metadata.py:
This piece of software was released on GistHub : TODO
Summary:
--------
Metadata (`metadata2.py`) is the second version of a simple MetaData aggregator object that can be used to collect, store,
and archive metadata information for any generic application. The primary use cases for these objects are instruments that require
some degree of user interaction, and simulations. The current version also includes a feature to differentiate and unify separate MetaData objects.
The MetaData can be output to yaml, json, or ascii (a formatted python dictionary dump to a text file). The MetaData can also be used to ...TODO
Documentation:
--------------
Featured methods:
└ collect() : Starts a user-interactive meta-data collection with proper input type deduction.
└ metadata_file(...) : Dumps the meta-data collected into a file with the following options: ascii, yaml, json
└ file_descriptor(...) : Generates a file descriptor (a valid directory/filename) based on the collected metadata and the input arguments
└ add_node(...) : Add another key-value (dictionary) structure within the scope of the current MetaData object.
└ add_que(...) : Add a question for the user, that is prompted when the `collect()` method is called.
Rules of the game:
------------------
+ The software does not check for duplicate field names and just overrides the old value.
+ When using the yaml format for outputting metadata file, aliases and anchors are disabled by default.
Structure of file:
------------------
metadata-file
––-----------------------------------------------
|metadata-object-name (if insert_name=True):
| └── datetime
| └── ...
| └── ...
| └── Metadata structure
| └── ...
| └── ...
| └── ...
|Node1:
| └── datetime
| └── ...
| └── ...
| └── Metadata structure
| └── ...
| └── ...
| └── ...
|Node2:
| └── datetime
| └── ...
| └── ...
| └── Metadata structure
| └── ...
| └── ...
| └── ...
––-----------------------------------------------
TODO:
----
+ Query metaData generation rules for `MetaData.file_descriptor()`
+ add_event_series framework
+ Arbitrary insertion order of `dwt`, while generating file_descriptors in non-exclusive mode. Requires abstraction of the DWT insertion.
+ ISO datetime generation function
+ Event-tags (from comms) ->
class Tag(IntEnum):
time = 0
event_cntr = 1
range = 2
custom = 3
none = 4
+ Deprecate and remove the reset feature.
+ In df_formatter -> Conversion of "yy" to "yyyy" is ill-formed. 19 -> 0019 instead of 2019.
+ For inspiration (Source: https://wiki00.igc.gulbenkian.pt/igc-wiki/doku.php?id=dmp:datamanagement) :
+ Include Package requirements inclusion: https://juliadynamics.github.io/DrWatson.jl/dev/name/#Naming-Simulations-1
The importance of metadata lies in the potential for data interoperability, providing the user with an enhanced version of published data. If you provide information that describes or contextualizes data it can be easily retrieved. 3 main categories can be distinguished:
Descriptive - common fields such as title, author, abstract, keywords which help users to discover online sources through searching and browsing.
Administrative - preservation, rights management, and technical metadata formats (check https://www.dcc.ac.uk/resources/subject-areas/biology)
Structural - how different components relate to one another, such as a schema describing relations between tables in a database.
"""
class MetaData:
    """
    Named metadata aggregator (version 2).

    Collects metadata through `collect()` (optionally as an interactive Q & A
    with type deduction of the answers) and `add()`, groups other metadata
    structures under it with `add_node()`, and exports everything with
    `metadata_file()` (ascii, json, or yaml) or condenses it into a
    file-system-friendly name with `file_descriptor()`.
    """

    default_header = "|| meta-data collector >>"  # Default header
    default_no_input_msg = "Okay! Keep your secrets!"  # Default no input message

    def __init__(self, name):
        """
        Constructor.

        name : Name of this metadata object. It is shown in the header and is
               used as the top-level key in the metadata file.
        """
        self.name = name
        self.header = f" {self.name} -- {self.default_header}"    # Prefix of every printed line / prompt
        self.no_input_msg = self.default_no_input_msg             # Message when no input is given
        self.no_input_str = f"{self.header} {self.no_input_msg}"  # Header + no input message

        # Data
        self.metadata = {}   # Stores metadata
        self.datetime = {}   # Stores datetime metadata
        self.questions = {}  # Questions to be asked to the user: {question: {label, suffix, prefix}}
        self.nodes = {}      # Other metadata structures appended in the scope of the current object

        # Option flags
        self.collected = False            # Indicates whether `collect` was called
        self.key_gen = MetaData.KeyGen()  # Generator for entries with missing key

    def set_header(self, hstring="", name=None):
        """
        Changes the header string to the passed `hstring`.

        hstring (optional) : New header text. If empty, the text of the current
                             header (without its "||" / ">>" decoration) is reused.
        name (optional)    : If given, "<name>'s " is put in front of the header text.
        """
        if hstring == "":
            # Remove the pipes and the whole trailing run of '>' (not only " >",
            # which would leave one '>' of the ">>" behind), then tidy spacing.
            core = self.header.replace("|", "").rstrip().rstrip(">")
            hstring = " ".join(core.split())
        suffix = ""
        if name is not None:
            suffix = f"{name}'s "
        self.header = f"|| {suffix}{hstring} >>"
        # The cached "no input" line embeds the header, so it must follow it.
        self.no_input_str = f"{self.header} {self.no_input_msg}"

    def collect(self, user_interactive=True, username_header=False):
        """
        Collects the metadata, optionally in a user-interactive Q & A.

        The current date and time are always recorded. When `user_interactive`
        is set, the user is asked for a name, for every question registered
        with `add_que()`, and for a list of free-form key words.

        Answers to questions that carry a prefix or suffix are stored as
        strings (prefix + answer + suffix); all other answers and the key
        words are converted with `type_deduce()`.

        user_interactive (optional) : Interact with user to collect information (blocking).
        username_header (optional)  : Change the header to include the user's name.
        """
        # Collect date and time
        import datetime
        import time

        now = datetime.datetime.now()
        self.datetime["year"] = now.year
        self.datetime["month"] = now.month
        self.datetime["day"] = now.day
        self.datetime["hour"] = now.hour
        self.datetime["min"] = now.minute
        self.datetime["sec"] = now.second
        self.datetime["weekday"] = now.strftime('%A')
        self.datetime["timezone"] = list(time.tzname)
        self.datetime["rep"] = str(now)

        # User interactive loop
        if user_interactive:
            self.metadata["collect_dt"] = self.datetime["rep"]

            # Collect user name
            user = input(f"{self.header} What is your name? → ")
            if user != "":
                if username_header:
                    self.set_header(name=user)
                print(f"{self.header} Hello! {user}.")
                self.metadata['user'] = user
            else:
                print(self.no_input_str)

            # Ask pre-decided Q & A
            for key_que, que in self.questions.items():
                ans = input(f"{self.header} {key_que} → ")
                if ans != "":
                    print(f"{self.header} recorded: {ans}")
                    if que['prefix'] or que['suffix']:
                        self.metadata[que["label"]] = f"{que['prefix']}{ans}{que['suffix']}"
                    else:
                        self.metadata[que["label"]] = MetaData.type_deduce(ans)
                else:
                    print(self.no_input_str)

            # Collect additional experiment metadata
            meta_list_str = input(f"{self.header} [optional] Describe your setup with a few more key words (comma-seperated): ")
            if meta_list_str != "":
                # Spaces are removed; empty items (e.g. from "a,,b") are dropped.
                words = [word for word in meta_list_str.replace(" ", "").split(",") if word]
                meta_list = [MetaData.type_deduce(word) for word in words]  # Type deduce
                print(f"{self.header} Info collected: {meta_list}")
                self.metadata['descriptors'] = meta_list
            else:
                print(self.no_input_str)

        # Collection is complete
        self.collected = True

    @staticmethod
    def _format_groups(format_str):
        """
        Splits a continuous format string into runs of equal letters:
        "hhmmss" -> ["hh", "mm", "ss"]. Non-letters are ignored and the
        letters are lower-cased.
        """
        groups = []
        for char in format_str.lower():
            if not char.isalpha():
                continue
            if groups and groups[-1][0] == char:
                groups[-1] += char
            else:
                groups.append(char)
        return groups

    def _datetime_component(self, key, format_str):
        """
        Formats the collected "date", "time", or "weekday" for a file descriptor.

        Every letter group of `format_str` is replaced by the matching datetime
        field, zero-padded to the group's length; a year group shorter than the
        year keeps only the last digits ("yy" -> "23"). Groups with an unknown
        letter are kept literally. The groups are joined with '-' and prefixed
        with "d-" (date) or "t-" (time). The weekday is its first three letters.

        Returns `None` if no datetime was collected, `key` is unknown, or the
        format contains no letters.
        """
        if not self.datetime:
            return None
        if key == "weekday":
            return self.datetime['weekday'][:3]
        if key == "time":
            prefix, fields = "t-", {'h': 'hour', 'm': 'min', 's': 'sec'}
        elif key == "date":
            prefix, fields = "d-", {'d': 'day', 'm': 'month', 'y': 'year'}
        else:
            return None

        parts = []
        for group in self._format_groups(format_str or ""):
            field = fields.get(group[0])
            if field is None:
                parts.append(group)
                continue
            text = str(self.datetime[field]).rjust(len(group), '0')
            if field == 'year' and len(text) > len(group):
                text = text[-len(group):]
            parts.append(text)
        if not parts:
            return None
        return prefix + "-".join(parts)

    def file_descriptor(self, exclusive=None, num_fields=None, include=None, exclude=None,
                        include_date=True, include_time=True, include_weekday=True,
                        date_format="ddmmyyyy", time_format="hhmmss"):
        """
        Returns a string constructed from the metadata that can be used as a
        directory/file name.

        Rules:
        ------
        1. If `exclusive` is given, [num_fields, include, exclude] are ignored and only the
           listed fields are used, in the order given by `exclusive`.
        2. Otherwise the metadata fields are used in insertion order, without the fields in
           `exclude` and without "collect_dt", truncated to `num_fields` fields. The fields
           in `include` that are not yet present are appended afterwards, so `include`
           takes precedence over `exclude` and is not counted by `num_fields`.
        3. `exclusive` and `include` accept "date", "time", and "weekday" as fields, which
           fixes their position. The datetime used is the one recorded by `collect()`.
        4. If `include_date`, `include_weekday`, or `include_time` is `True` and the
           respective field was not positioned explicitly, it is put at the start
           (order: date, weekday, time). This applies in both modes.
        5. If no datetime was collected, all datetime parts are left out.
        6. Values are joined with '_'. Periods and quotes are removed, spaces become '-',
           and the characters <>/{}[]~`\\,: become '_'.
        """
        if not self.collected:
            print(f"{self.header} Warning: No meta-data collection has occured. Generating file descriptor anyway.")

        dt_keys = ("time", "date", "weekday")

        # ----- Ordered list of keys that make up the descriptor -----
        if exclusive:
            # dict.fromkeys removes duplicates while keeping the caller's order
            fd_keys = [key for key in dict.fromkeys(exclusive)
                       if key in self.metadata or key in dt_keys]
        else:
            excluded = set(exclude or ())
            fd_keys = [key for key in self.metadata
                       if key not in excluded and key != "collect_dt"]
            fd_keys = fd_keys[:num_fields]  # Truncate total number of fields
            for key in dict.fromkeys(include or ()):
                if key not in fd_keys and (key in self.metadata or key in dt_keys):
                    fd_keys.append(key)

        # ----- Key to value conversion (the key list itself stays intact) -----
        format_map = {"time": time_format, "weekday": None, "date": date_format}
        components = []
        for key in fd_keys:
            if key in self.metadata:
                components.append(str(self.metadata[key]))
            else:
                part = self._datetime_component(key, format_map[key])
                if part is not None:
                    components.append(part)

        # ----- Implicit date, weekday, and time at the front -----
        leading = []
        for wanted, key in ((include_date, "date"), (include_weekday, "weekday"), (include_time, "time")):
            if wanted and key not in fd_keys:
                part = self._datetime_component(key, format_map[key])
                if part is not None:
                    leading.append(part)

        fd_str = "_".join(leading + components)

        # String sanitization
        fd_str = fd_str.replace(".", "")   # Remove periods
        fd_str = fd_str.replace(" ", "-")  # Replace spaces
        fd_str = fd_str.replace("\'", "")  # Remove any single quotes
        fd_str = fd_str.replace("\"", "")  # Remove any double quotes
        for char in "<>/{}[]~`\\,':":      # Other generic sanitization
            fd_str = fd_str.replace(char, '_')
        return fd_str

    def metadata_file(self, filename, json=False, yaml=False, insert_name=True):
        """
        Writes the metadata, the datetime information, and all nodes to `filename`.

        The output format is chosen as follows: a `.json` or `.yaml` file
        extension decides first; otherwise the `json` / `yaml` flag decides
        (and the matching extension is appended to `filename`); otherwise a
        pretty-printed python dictionary is written ("ascii" mode).

        json (optional)        : Request a json file.
        yaml (optional)        : Request a yaml file (requires the `yaml` package).
                                 Aliases and anchors are disabled.
        insert_name (optional) : Store this object's metadata under its name.
                                 If `False`, the top-level key is `None`.

        Raises `ValueError` if both `json` and `yaml` are `True`.
        """
        if not self.collected:
            print(f"{self.header} Warning: No meta-data collection has occured. Generating meta-data file anyway.")
        if json and yaml:
            raise ValueError("For meta-data file generation - conflicting options passed.")

        if filename.endswith(".json"):
            mode = "json"
        elif filename.endswith(".yaml"):
            mode = "yaml"
        elif json:
            mode = "json"
        elif yaml:
            mode = "yaml"
        else:
            mode = "ascii"
        if mode != "ascii" and not filename.endswith(f".{mode}"):
            filename += f".{mode}"

        metadata = self.metadata.copy()  # Copy, so "datetime" is not added to self.metadata
        metadata.pop('-', None)          # An entry stored under the key '-' is never exported
        metadata["datetime"] = self.datetime

        name__ = self.name if insert_name else None
        data = {name__: metadata}
        for node, content in self.nodes.items():
            data[node] = content

        # The modules are imported under aliases because the parameters
        # `json` and `yaml` occupy the plain names inside this method.
        if mode == "json":
            import json as json_lib
            with open(filename, 'w', encoding="utf-8") as file:
                json_lib.dump(data, file, indent=4)
        elif mode == "yaml":
            import yaml as yaml_lib

            class NoAliasDumper(yaml_lib.SafeDumper):
                # The same datetime dictionary can appear under several nodes;
                # write it out in full each time instead of as an alias.
                def ignore_aliases(self, data):
                    return True

            with open(filename, 'w', encoding="utf-8") as file:
                yaml_lib.dump(data, file, indent=4, sort_keys=False, Dumper=NoAliasDumper)
        else:
            from pprint import pformat
            with open(filename, 'w', encoding="utf-8") as file:
                file.write(pformat(data))

    def add_que(self, question, label=None, suffix="", prefix=""):
        """
        Add a question that will be asked to the user
        during a `collect(user_interactive=True)` call.

        question          : Text shown to the user. Empty questions are ignored.
        label (optional)  : Metadata key for the answer. Defaults to the question itself.
        suffix (optional) : Text appended to the answer.
        prefix (optional) : Text put in front of the answer.
        """
        if question:
            if label is None:
                label = question
            self.questions[question] = {
                "label": label,
                "suffix": suffix,
                "prefix": prefix,
            }

    def add(self, key, value, prefix="", suffix=""):
        """
        Add a key and value pair to the metadata.

        A `value` of `None` is ignored; an empty `key` is replaced by a
        generated one. If a prefix or suffix is given, the value is stored as
        the string prefix + value + suffix; otherwise its type is kept.
        """
        if value is not None:
            if prefix != "" or suffix != "":
                value = str(prefix) + str(value) + str(suffix)
            if key == "":
                key = next(self.key_gen)
            self.metadata[key] = value

    def add_node(self, metadata_obj, name=None):
        """
        Adds another MetaData object or a generic dictionary to the current structure.

        metadata_obj    : MetaData object, or anything `dict()` accepts.
        name (optional) : Name of the node. Defaults to the name of the MetaData
                          object and is required for any other type.

        Raises `ValueError` if `name` is missing for a non-MetaData object.
        """
        if isinstance(metadata_obj, MetaData):
            name_ = name if name is not None else metadata_obj.name
            self.nodes[name_] = dict(metadata_obj.metadata)
            if metadata_obj.collected:
                self.nodes[name_]["datetime"] = metadata_obj.datetime
        else:
            if name is None:
                raise ValueError("`name` arguement required for object type that is not MetaData.")
            # Deliberately unguarded: a failing conversion raises to the caller.
            self.nodes[name] = dict(metadata_obj)

    def __add__(self, metadata_obj, name=None):
        """
        Operator form of `add_node()`: `a + b` adds `b` as a node of `a` and
        evaluates to `a`, so that additions can be chained.
        This should only be used with MetaData objects, and not dictionaries,
        since a name cannot be passed through the operator.
        """
        self.add_node(metadata_obj, name=name)
        return self

    def reset(self):
        """
        Resets the metadata structure to the state right after construction.
        The name of the object is kept.
        """
        self.header = f" {self.name} -- {self.default_header}"
        self.no_input_msg = self.default_no_input_msg
        self.no_input_str = f"{self.header} {self.no_input_msg}"
        self.metadata = {}
        self.datetime = {}
        self.questions = {}
        self.nodes = {}
        self.collected = False
        self.key_gen = MetaData.KeyGen()

    @staticmethod
    def KeyGen():
        """
        Generator function that sequentially returns
        keys ('key-1', 'key-2', ...) for metadata if none is given by the user.
        """
        i = 0
        while True:
            i = i + 1
            yield 'key-' + str(i)

    @staticmethod
    def type_deduce(string):
        """
        Converts an input string to its appropriate type.
        Type escalation order: Integer -> Float -> Bool -> String
        ("true" / "false" are matched case-insensitively, ignoring outer spaces).
        """
        # Try integer conversion
        try:
            return int(string)
        except ValueError:
            pass

        # Try float conversion
        try:
            return float(string)
        except ValueError:
            pass

        # Try bool
        bool_str = string.strip().lower()
        if bool_str == "true":
            return True
        if bool_str == "false":
            return False

        # The remaining type is a string so just return it
        return str(string)
if __name__ == "__main__":
    # Interactive demo: build a metadata object, run the Q & A on the terminal,
    # then export the result in every supported format.
    demo = MetaData("test")
    demo.set_header(hstring="olympus")
    demo.add_que("whatwhat")
    demo.add("type", "video")
    demo.collect(username_header=True)

    # The object is attached to itself as a node under a different name.
    demo.add_node(demo, name="test2")

    demo.metadata_file("test.yaml", yaml=True, insert_name=False)
    demo.metadata_file("test.json", insert_name=False)
    demo.metadata_file("test.txt")

    descriptor = demo.file_descriptor()
    print(descriptor)
def add_event_series(self, name, tag_type="timeseries", force=False):
    """
    Placeholder (not implemented yet): adds a new event series to the metadata.

    name     : Name of the event series. Must be unique in the `events` structure,
               unless `force` is `True`, in which case the series is reset.
    tag_type : The type of event series: sequential, timeseries
               [different types of event tags?]. If the value is a `Callable`,
               it is used as a generator function.
    force    : Force-add the event series "name" to the `events` structure.
    """
    return None
def add_event(self, name, value):
    """
    Placeholder (not implemented yet): adds a new event to the metadata structure.

    name  : The name of the event series. It must already be present in the
            `events` structure, otherwise a `KeyError` is to be raised.
    value : The value of the event that is logged.
    """
    return None
"""
Type Deductions:
Everything is parsed as a string. Then we deduce for int, float.
Then we try for bool.
"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment