Skip to content

Instantly share code, notes, and snippets.

Last active Apr 2, 2020
What would you like to do?
ingest xpd chi files

This code ingests processed data from XPD back into the document model.

driver: bluesky-mongo-normalized-catalog
metadatastore_db: mongodb://localhost/xpd_auto_202003_mds
asset_registry_db: mongodb://localhost/xpd_auto_202003_ar
driver: bluesky-msgpack-catalog
- "/SOMEPATH/export/*.msgpack"
from pathlib import Path
import zipfile
import pandas
import datetime
import numpy as np
from event_model import compose_run
from collections import defaultdict
from databroker import catalog
# Base directory of the beamtime; the two inputs below live directly inside it.
root_path = Path("/SOMEPATH/2020-03_xpd_autonomous/")
# Archive that is opened with zipfile.ZipFile to read the reduced results.
chi_zip = root_path.joinpath("Beamtime Reduction in Q")
# Spreadsheet whose "sample" column lists the known sample names.
excel_path = root_path.joinpath("20191125_XPD.xlsx")
def parse_name(inp):
    """Parse a sample name into a dictionary of sample metadata.

    The name is split on "_" and must have one of two layouts:

    * ``<comp>_<temp>_<time>_<batch>_<coord>`` (5 fields)
    * ``<comp>_Pristine_<batch>_<coord>`` (4 fields), which is treated as
      room temperature ("25C") and no annealing ("0min")

    ``<comp>`` is read as two 2-character symbols, each followed by an
    integer (characters 0-1 / 2-3 and 4-5 / 6-end).  ``<temp>`` carries a
    one-character suffix and ``<time>`` a three-character suffix, with "p"
    standing in for the decimal point.  ``<batch>`` and ``<coord>`` are
    "-"-separated integers.

    Parameters
    ----------
    inp : str
        The sample name.

    Returns
    -------
    dict or None
        ``None`` if the name contains "empty", has neither 4 nor 5 fields,
        or has 4 fields whose second field is not "Pristine".  Otherwise a
        dict with one key per composition symbol plus "temp",
        "anneal_time", "batch" and "position".

    Raises
    ------
    ValueError
        If a numeric field cannot be converted with ``int`` / ``float``.
    """
    # special case the empty sample position
    if "empty" in inp:
        return None

    # the sample name can have 3 or 4 _
    fields = inp.split("_")
    if len(fields) == 5:
        comp, temp, anneal_time, batch, coord_number = fields
    elif len(fields) == 4:
        comp, pristine, batch, coord_number = fields
        # only "Pristine" samples may omit the temperature and time;
        # default those to room temperature and 0 cooking
        if pristine != "Pristine":
            return None
        temp, anneal_time = "25C", "0min"
    else:
        return None

    # TODO check the post fixes are correct
    anneal_time = float(anneal_time.replace("p", ".")[:-3])
    temp = float(temp[:-1])
    composition = {comp[:2]: int(comp[2:4]), comp[4:6]: int(comp[6:])}

    return {
        # merge the parsed composition so it is not silently discarded
        **composition,
        "temp": temp,
        "anneal_time": anneal_time,
        "batch": tuple(map(int, batch.split("-"))),
        "position": tuple(map(int, coord_number.split("-"))),
    }
# Load the sample spreadsheet and collect the distinct entries of its
# "sample" column; they are used to match the prefix of each file name.
df = pandas.read_excel(excel_path)
sample_names = {name for name in df["sample"]}
# Accumulator for the documents of each run, keyed by uid.  Looking up a
# new key creates an empty record: no start / stop document yet, and
# per-stream lists (keyed by stream name) for descriptors and event pages.
data = defaultdict(
    lambda: {
        "start": None,
        "descriptors": defaultdict(list),
        "event_pages": defaultdict(list),
        "stop": None,
    }
)
# Walk every member of the chi zip archive, pull metadata out of the file
# name, read the Q / I columns from the file body, and compose the
# start / descriptor / event_page / stop documents with event_model.
#
# NOTE(review): this section has lost its indentation and several
# lines/expressions (see the notes below); it cannot run as shown and must
# be restored from the original before use.
with zipfile.ZipFile(chi_zip) as zipin:
for n in zipin.namelist():
n = Path(n)
# NOTE(review): the right-hand side is missing; presumably it splits the
# member path into a directory and a file name -- confirm.
_, fname =
# find the sample name that prefixes the file name
# NOTE(review): as shown the raise directly follows a successful match; a
# `break` plus a for/else `else:` look to have been dropped -- confirm.
for sn in sample_names:
if fname.startswith(sn):
raise ValueError
# drop the sample name and the separator character that follows it
fname = fname[len(sn) + 1 :]
# the remaining "_"-separated fields are consumed left to right:
# date-time stamp, uid, result number, then "<result type>.<extension>"
date_str, _, fname = fname.partition("_")
# strptime without tzinfo gives a naive datetime, so timestamp() below
# interprets it in the local timezone of the machine running the script
dt = datetime.datetime.strptime(date_str, "%Y%m%d-%H%M%S")
ts = dt.timestamp()
uid, _, fname = fname.partition("_")
num, _, fname = fname.partition("_")
res, _, ext = fname.partition(".")
# metadata that ends up in the run start document
# NOTE(review): the closing brace of this dict is missing
md = {
"source_uid": uid,
"iso_time": dt.isoformat(),
"result_type": res,
"result_number": int(num),
"original_path": str(n),
"sample_name": sn,
# NOTE(review): this expression is truncated / unbalanced; the code below
# treats `fin` as an iterator over bytes lines of the archive member.
fin = iter("\n"))
# grab first line of the header which includes an interesting looking path
md["chi_header"] = next(fin).decode("ascii")
# skip 5 lines of comments for humans
# NOTE(review): no statement that discards a line is present for this
# loop -- presumably a bare `next(fin)` was lost; confirm.
for j in range(5):
# the next header line is "<label>:<number of bins>"
_, _, num = next(fin).partition(b":")
num = int(num)
md["num_bins"] = num
# pre-allocate one slot per bin for each column
Q = np.zeros(num)
I = np.zeros(num)
# skip line of comments
# NOTE(review): no statement actually skipping a line follows this comment
# each non-blank remaining line holds "Q I" separated by a single space
for k, ln in enumerate(fin):
if ln.strip():
Q[k], I[k] = map(float, ln.split(b" "))
# NOTE(review): k is the 0-based index of the last line seen, so this
# only holds if one extra (blank) line follows the data rows -- confirm.
assert k == num
# first, compose the run start and get the rest of the factories
run_bundle = compose_run(time=ts, metadata=md)
# stash the start document for later, could also just serialize
data[uid]["start"] = run_bundle.start_doc
# create the descriptor and factories
# NOTE(review): the keyword arguments and closing brackets of this call are
# missing; the dict below looks like the data-keys description of Q and I.
desc_bundle = run_bundle.compose_descriptor(
"Q": {
"dtype": "number",
"shape": [],
"source": "compute",
"units": "angstrom",
"I": {
"dtype": "number",
"shape": [],
"source": "compute",
"units": "arbitrarily",
# stash the descriptor for later use
# NOTE(review): nothing shown here appends `desc` to data[uid]["descriptors"]
desc = desc_bundle.descriptor_doc
# construct an event page
# NOTE(review): the end of this call is missing, and nothing shown here
# appends `evnt_page` to data[uid]["event_pages"]
evnt_page = desc_bundle.compose_event_page(
data={"Q": Q, "I": I},
# we might make timestamps optional in the future
timestamps={"Q": np.ones_like(Q) * ts, "I": np.ones_like(I) * ts},
# every event in the page gets the same time: the one parsed from the name
time=np.ones_like(Q) * ts,
# there is a PR to fix this in event-model the problem is
# that it is that jsonschema is not identifying that numpy
# arrays are "array" from a validation point of view 🤦
# close out the run and stash the stop document
data[uid]["stop"] = run_bundle.compose_stop()
# Write every accumulated run into the catalog, one document at a time,
# in the order start -> descriptors -> event pages -> stop.

# grab a reference to the catalog (backed by msgpack files)
db = catalog['xpd_auto_202003_msgpack']
# short-cut to get the correct suitcase configured to write where
# the catalog reads from
serialize = db.v1.insert

# loop through the dictionary we built above
for v in data.values():
    # insert the start
    serialize('start', v['start'])
    # for each stream insert the descriptors
    # there can be more than one stream; only the per-stream lists are
    # needed here, not the stream names
    for descs in v['descriptors'].values():
        # and each stream can have more than one descriptor
        for d in descs:
            serialize('descriptor', d)
    # loop through and insert all of the event pages
    for event_pages in v['event_pages'].values():
        for page in event_pages:
            serialize('event_page', page)
    # insert the stop document
    serialize('stop', v['stop'])
# # get all of the uids
# uids = list({}))
# # pull _all_ of the RunCatalogs
# data = [db[uid] for uid in uids]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment