Skip to content

Instantly share code, notes, and snippets.

@moble
Last active September 28, 2022 15:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save moble/2ef958f37c9a05ebecfc4032b3f4a362 to your computer and use it in GitHub Desktop.
"""Convert RPXMB files to RPDMB
This file can be run either as a script, or by calling its functions. To run as a script, do
something like this:
python X2D.py Strain_N2.h5 Strain_N3.h5 Strain_N4.h5 ExtraWaveforms.h5
To call the functions, and assuming that this file is in the same directory as the script using it,
do something like this:
import X2D
X2D.x2d("Strain_N2.h5")
Note that the input filenames should be H5 files, but the corresponding JSON files are also used and
updated.
In every case, the output file uses the original file stem appended with "_RPDMB". For example, the
function call above will create "Strain_N2_RPDMB.h5" and "Strain_N2_RPDMB.json". You are responsible
for moving those files into their final locations.
For each input file, this script does the following:
1. Checks if there is any RPXMB data in the input file; if not it does nothing
2. For each group and dataset that is not in RPXMB format, simply copies it (and its attributes)
to the new file, ensuring that shuffle and gzip (level 9) compression are enabled.
3. For each RPXMB-formatted dataset in the input file:
a. Decompress the original RPXMB data
b. Removes any modes with ell<2
c. Copies over the "modifications" entry in the JSON
d. Copies over scri- and spec-related version info
e. Saves the data to RPDMB format
f. Re-loads the RPDMB-formatted file
g. Checks to make sure various pieces of data have carried over correctly:
- `t`, `data`, and `frame` are bitwise identical
- H5 group has same datasets
- H5 group's critical attributes are set correctly:
* ell_max
* ell_min
* n_times
* sxs_format
- JSON file has identical data in "data_info", "modifications", and "validation/n_times"
h. If any of the checks in the last step failed, print a warning with details
"""
import sys
import warnings
import pathlib
import json
import h5py
import numpy as np
import sxs
# "version_info" keys copied verbatim from the input JSON into the output,
# because the save routine may otherwise regenerate them incorrectly.
copied_version_info_items = ["quaternion", "spherical_functions", "spinsfast", "scri", "spec_version_hist"]
# Keyword arguments applied to every non-RPXMB dataset copied into the output H5 file.
compression_options = {
    "compression": "gzip",
    "compression_opts": 9,
    "shuffle": True,
}
# Prefix used to indent progress messages printed by the functions below.
indent = " "
def sxs_format(obj):
    """Return the "sxs_format" attribute of `obj`'s parent, or "" if unavailable.

    Missing `parent` or `attrs` attributes fall back to empty dicts, so this
    never raises for plain objects; it simply returns the empty string.
    """
    parent = getattr(obj, "parent", {})
    attributes = getattr(parent, "attrs", {})
    return attributes.get("sxs_format", "")
def found_rpxmb(path):
    """Return True if the H5 file at `path` contains any RPXMB-formatted data.

    A dataset counts as RPXMB when its name ends with "data" and its parent
    group's "sxs_format" attribute is "rotating_paired_xor_multishuffle_bzip2".
    """
    # Use a mutable container so the visitor closure can record its result.
    found = {"rpxmb": False}

    def visitor(name, obj):
        if name.endswith("data") and sxs_format(obj) == "rotating_paired_xor_multishuffle_bzip2":
            found["rpxmb"] = True
            # Returning any non-None value makes h5py's visititems stop early.
            return True

    with h5py.File(path, "r") as h5_file:
        h5_file.visititems(visitor)
    return found["rpxmb"]
def rpxmb2rpdmb(input_h5_path, input_json_path, group, output_h5_path, output_json_path, remove_ell_0_and_1=True):
    """Convert one RPXMB-formatted H5 group to RPDMB format and verify the result.

    Parameters
    ----------
    input_h5_path, input_json_path : pathlib.Path
        Existing RPXMB-formatted H5 file and its accompanying JSON file.
    group : str
        Name of the H5 group holding the RPXMB data ("/" for root-level data).
    output_h5_path, output_json_path : pathlib.Path
        Destination H5/JSON files; the H5 file is opened in append mode so
        multiple groups can be written into the same output file.
    remove_ell_0_and_1 : bool, optional
        If True (the default), drop all modes with ell < 2 before saving.

    Raises
    ------
    RuntimeError
        If any round-trip consistency check between the input and output
        fails; the accumulated differences are printed before raising.
    """
    print(f"{indent}Converting {input_h5_path}{group}", end =" ")
    sys.stdout.flush()
    # Load the raw RPXMB data without the usual frame transformation or
    # conjugate-pair conversion, so the stored representation round-trips.
    w_in = sxs.rpxmb.load(
        f"{input_h5_path}{group}",
        transform_to_inertial=False,
        convert_from_conjugate_pairs=False
    )
    with input_json_path.open("r") as f:
        input_json = json.load(f)
    # Preserve this, as it will get lost when we remove ell modes
    log_frame = w_in.log_frame
    # Copy over any metadata that need to go into the JSON file correctly
    w_in._metadata["modifications"] = w_in.json_data["modifications"]
    # Copy over several "version_info" keys that may be changed incorrectly
    version_info_update = dict()
    if group == "/":
        input_version_info = input_json["version_info"]
    else:
        input_version_info = input_json[group]["version_info"]
    for copied_version_info_item in copied_version_info_items:
        if copied_version_info_item in input_version_info:
            version_info_update[copied_version_info_item] = input_version_info[copied_version_info_item]
    # NOTE(review): the lookups below assume "spec_version_hist" was present in
    # the input version_info; a missing key would raise KeyError here — confirm
    # all inputs carry that key.
    if group != "/" and version_info_update["spec_version_hist"] == []:
        # First, try with the corresponding `h` group's data
        alternate_group = group.replace("Psi4", "h")
        version_info_update["spec_version_hist"] = input_json.get(
            alternate_group, {}).get("version_info", {}).get("spec_version_hist", [])
    if group != "/" and version_info_update["spec_version_hist"] == []:
        # If the above failed, just use the first `h` finite-radius data
        alternate_group = sorted([
            k for k in input_json
            if k.startswith("/rh_FiniteRadii")
            and input_json[k].get("version_info", {}).get("spec_version_hist", []) != []
        ])[0]
        version_info_update["spec_version_hist"] = input_json.get(
            alternate_group, {}).get("version_info", {}).get("spec_version_hist", [])
    if remove_ell_0_and_1:
        # Slice off every mode before (ell=2, m=-2), then record the new ell_min
        w_in = w_in[:, w_in.index(2, -2):]
        w_in._metadata["ell_min"] = 2
    # Write the RPDMB output; append mode lets several groups share one file,
    # and zero tolerance disables any lossy truncation of the data.
    sxs.rpdmb.save(
        w_in,
        f"{output_h5_path}{group}",
        file_write_mode="a",
        allow_existing_group=True,
        log_frame=log_frame,
        verbose=False,
        convert_to_conjugate_pairs=False,
        L2norm_fractional_tolerance=0.0,
        version_info_update=version_info_update,
    )
    # Undo the conjugate-pair storage layout so w_in compares directly with
    # the re-loaded output below.
    w_in.convert_from_conjugate_pairs()
    w_out = sxs.rpdmb.load(f"{output_h5_path}{group}", transform_to_inertial=False)
    differences = [] # Will add to this if any are found
    # Check data as loaded by `sxs`
    if not np.array_equal(w_in.t, w_out.t):
        differences.append("non-identical time values")
    if not np.array_equal(w_in.data, w_out.data):
        differences.append("non-identical data values")
    if not np.array_equal(w_in.frame, w_out.frame):
        differences.append("non-identical frame values")
    # Check structure of h5 group
    with h5py.File(input_h5_path, "r") as f_in, h5py.File(output_h5_path, "r") as f_out:
        g_in = f_in[group]
        g_out = f_out[group]
        if list(g_in) != list(g_out):
            differences.append(f"list(g_in) != list(g_out)")
        if list(g_in.attrs) != list(g_out.attrs):
            differences.append(f"list(g_in.attrs) != list(g_out.attrs)")
        if g_in.attrs["ell_max"] != g_out.attrs["ell_max"]:
            differences.append(f"""g_in.attrs["ell_max"] != g_out.attrs["ell_max"]""")
        if remove_ell_0_and_1:
            # Output must advertise the truncated mode range, not the input's
            if g_out.attrs["ell_min"] != 2:
                differences.append(f"""g_out.attrs["ell_min"] != 2""")
        else:
            if g_in.attrs["ell_min"] != g_out.attrs["ell_min"]:
                differences.append(f"""g_in.attrs["ell_min"] != g_out.attrs["ell_min"]""")
        if g_in.attrs["n_times"] != g_out.attrs["n_times"]:
            differences.append(f"""g_in.attrs["n_times"] != g_out.attrs["n_times"]""")
        if g_out.attrs["sxs_format"] != "rotating_paired_diff_multishuffle_bzip2":
            differences.append(f"""g_out.attrs["sxs_format"] != "rotating_paired_diff_multishuffle_bzip2" """)
    # Check JSON data
    with output_json_path.open("r") as f_out:
        output_json = json.load(f_out)
    if group == "/":
        g_in = input_json
        g_out = output_json
    else:
        g_in = input_json[group]
        g_out = output_json[group]
    if remove_ell_0_and_1:
        g_in["data_info"]["ell_min"] = 2 # Fake this number for easier comparisons
    if g_out["sxs_format"] != "rotating_paired_diff_multishuffle_bzip2":
        differences.append(f"""JSON file's "sxs_format" != "rotating_paired_diff_multishuffle_bzip2" """)
    # if g_in["version_info"]["spec_version_hist"] != g_out["version_info"]["spec_version_hist"]:
    #     differences.append(f"""JSON files' "version_info/spec_version_hist" entries don't match""")
    if g_in["validation"]["n_times"] != g_out["validation"]["n_times"]:
        differences.append(f"""JSON files' "validation/n_times" entries don't match""")
    for key in ["data_info", "modifications"]:
        if g_in[key] != g_out[key]:
            differences.append(f"""JSON files' "{key}" entries don't match""")
    if differences:
        # Report every accumulated discrepancy prominently, then abort so the
        # caller does not silently publish a bad conversion.
        differences = "\n - ".join(differences)
        print("\n" + "!"*120)
        print("WARNING: non-identical data values in")
        print(f" {input_h5_path}{group}")
        print(f" {output_h5_path}{group}")
        print(f"and/or corresponding JSON files")
        print("-"*120)
        print(f" - {differences}")
        print("!"*120)
        raise RuntimeError()
    else:
        print("✓")
def x2d(filename):
    """Convert all RPXMB data in `filename` to a new "_RPDMB"-suffixed file.

    Non-RPXMB groups and datasets are copied across with shuffle + gzip(9)
    compression; RPXMB groups are deferred and converted via `rpxmb2rpdmb`.
    Files without any RPXMB data are skipped with a warning.

    Parameters
    ----------
    filename : str or os.PathLike
        Path to the input H5 file; the corresponding ".json" file must also
        exist alongside it.
    """
    input_h5_path = pathlib.Path(filename).expanduser().resolve()
    # Path.with_stem requires Python >= 3.9
    output_h5_path = input_h5_path.with_stem(input_h5_path.stem + "_RPDMB")
    input_json_path = input_h5_path.with_suffix(".json")
    output_json_path = output_h5_path.with_suffix(".json")
    if not found_rpxmb(input_h5_path):
        warning = f"\nNo RPXMB data found in {input_h5_path}; skipping this file."
        warnings.warn(warning)
    else:
        data_to_extrapolate = []
        print(f"""Converting "{input_h5_path}" """)
        with h5py.File(input_h5_path, "r") as f1, h5py.File(output_h5_path, "w") as f2:
            def func(name, obj):
                # Visitor for h5py visititems: defer RPXMB groups for later
                # conversion, copy everything else immediately.
                if name.endswith("data") and sxs_format(obj) == "rotating_paired_xor_multishuffle_bzip2":
                    parent_name = obj.parent.name
                    # NOTE(review): RPXMB data whose parent group is "/" is
                    # neither deferred nor copied here — confirm root-level
                    # RPXMB files are handled as intended.
                    if parent_name != "/":
                        print(f"{indent}Deferring {name}")
                        data_to_extrapolate.append(parent_name)
                elif name != "Time.dat" and name not in f2:
                    if isinstance(obj, h5py.Dataset) and obj.size > 1:
                        # Re-create datasets so compression_options take effect
                        print(f"{indent}Compressing {name}")
                        f2.create_dataset(name, data=obj, **compression_options)
                    else:
                        # Groups and tiny datasets are copied shallowly as-is
                        print(f"{indent}Copying {name}")
                        sys.stdout.flush()
                        sys.stderr.flush()
                        f1.copy(name, f2, name=name, shallow=True)
            f1.visititems(func)
        # Convert the deferred RPXMB groups only after both files are closed,
        # since rpxmb2rpdmb reopens them itself.
        for group in data_to_extrapolate:
            rpxmb2rpdmb(input_h5_path, input_json_path, group, output_h5_path, output_json_path)
        input_size = input_h5_path.stat().st_size
        output_size = output_h5_path.stat().st_size
        print(f"{indent}Input H5 file size: {input_size:_} B")
        print(f"{indent}Output H5 file size: {output_size:_} B")
        print(f"{indent}X2D compression ratio: {input_size/output_size:.4f}")
if __name__ == "__main__":
    # Treat every command-line argument as an input H5 file to convert.
    for input_filename in sys.argv[1:]:
        x2d(input_filename)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment