jthorton/env.yml

## env.yml
name: plot_opt_out
channels:
  - simonboothroyd
  - conda-forge
dependencies:
  - black=22.3.0=pyhd8ed1ab_0
  - bokeh=2.4.2=py39h6e9494a_0
  - boost=1.74.0=py39ha1f3e3e_4
  - boost-cpp=1.74.0=hf3dc895_5
  - brotli=1.0.9=h0d85af4_6
  - brotli-bin=1.0.9=h0d85af4_6
  - bzip2=1.0.8=h0d85af4_4
  - ca-certificates=2022.5.18.1=h033912b_0
  - cairo=1.16.0=he01c77b_1009
  - certifi=2022.5.18.1=py39h6e9494a_0
  - click=8.0.3=py39h6e9494a_1
  - cycler=0.11.0=pyhd8ed1ab_0
  - dataclasses=0.8=pyhc8e2a94_3
  - font-ttf-dejavu-sans-mono=2.37=hab24e00_0
  - font-ttf-inconsolata=3.000=h77eed37_0
  - font-ttf-source-code-pro=2.038=h77eed37_0
  - font-ttf-ubuntu=0.83=hab24e00_0
  - fontconfig=2.13.1=h10f422b_1005
  - fonts-conda-ecosystem=1=0
  - fonts-conda-forge=1=0
  - fonttools=4.28.5=py39h89e85a6_0
  - freetype=2.10.4=h4cff582_1
  - gettext=0.19.8.1=hd1a6beb_1008
  - greenlet=1.1.2=py39h9fcab8e_1
  - icu=69.1=he49afe7_0
  - isort=5.10.1=pyhd8ed1ab_0
  - jbig=2.1=h0d85af4_2003
  - jinja2=3.0.3=pyhd8ed1ab_0
  - jpeg=9d=hbcb3906_0
  - kiwisolver=1.3.2=py39hf018cea_1
  - lcms2=2.12=h577c468_0
  - lerc=3.0=he49afe7_0
  - libblas=3.9.0=12_osx64_openblas
  - libbrotlicommon=1.0.9=h0d85af4_6
  - libbrotlidec=1.0.9=h0d85af4_6
  - libbrotlienc=1.0.9=h0d85af4_6
  - libcblas=3.9.0=12_osx64_openblas
  - libcxx=12.0.1=habf9029_1
  - libdeflate=1.8=h0d85af4_0
  - libffi=3.4.2=h0d85af4_5
  - libgfortran=5.0.0=9_3_0_h6c81a4c_23
  - libgfortran5=9.3.0=h6c81a4c_23
  - libglib=2.70.2=hf1fb8c0_1
  - libiconv=1.16=haf1e3a3_0
  - liblapack=3.9.0=12_osx64_openblas
  - libopenblas=0.3.18=openmp_h3351f45_0
  - libpng=1.6.37=h7cec526_2
  - libtiff=4.3.0=hd146c10_2
  - libwebp-base=1.2.1=h0d85af4_0
  - libxml2=2.9.12=h7e28ab6_1
  - libzlib=1.2.11=h9173be1_1013
  - llvm-openmp=12.0.1=hda6cdc1_1
  - lz4-c=1.9.3=he49afe7_1
  - markupsafe=2.0.1=py39h89e85a6_1
  - matplotlib=3.5.1=py39h6e9494a_0
  - matplotlib-base=3.5.1=py39hb07454d_0
  - munkres=1.1.4=pyh9f0ad1d_0
  - mypy_extensions=0.4.3=py39h6e9494a_5
  - ncurses=6.2=h2e338ed_4
  - numpy=1.22.0=py39h9d9ce41_0
  - olefile=0.46=pyh9f0ad1d_1
  - openjpeg=2.4.0=h6e7aa92_1
  - openssl=3.0.3=hfe4f2af_0
  - packaging=21.3=pyhd8ed1ab_0
  - pandas=1.3.5=py39h4d6be9b_0
  - pathspec=0.9.0=pyhd8ed1ab_0
  - pcre=8.45=he49afe7_0
  - pillow=8.4.0=py39he9bb72f_0
  - pip=21.3.1=pyhd8ed1ab_0
  - pixman=0.40.0=hbcb3906_0
  - platformdirs=2.5.1=pyhd8ed1ab_0
  - plotmol=0.0.3=pyhcce984b_0
  - pycairo=1.20.1=py39hbe14034_1
  - pydantic=1.9.0=py39h89e85a6_0
  - pyparsing=3.0.6=pyhd8ed1ab_0
  - python=3.9.9=h38b4d05_0_cpython
  - python-dateutil=2.8.2=pyhd8ed1ab_0
  - python_abi=3.9=2_cp39
  - pytz=2021.3=pyhd8ed1ab_0
  - pyyaml=6.0=py39h89e85a6_3
  - rdkit=2021.09.4=py39h88273a1_0
  - readline=8.1=h05e3726_0
  - reportlab=3.5.68=py39hf37cc50_1
  - setuptools=60.5.0=py39h6e9494a_0
  - six=1.16.0=pyh6c4a22f_0
  - sqlalchemy=1.4.29=py39h89e85a6_0
  - sqlite=3.37.0=h23a322b_0
  - tk=8.6.11=h5dbffcc_1
  - tomli=2.0.1=pyhd8ed1ab_0
  - tornado=6.1=py39h89e85a6_2
  - typed-ast=1.5.4=py39h701faf5_0
  - typing-extensions=4.0.1=hd8ed1ab_0
  - typing_extensions=4.0.1=pyha770c72_0
  - tzdata=2021e=he74cb21_0
  - wheel=0.37.1=pyhd8ed1ab_0
  - xz=5.2.5=haf1e3a3_1
  - yaml=0.2.5=h0d85af4_2
  - zlib=1.2.11=h9173be1_1013
  - zstd=1.5.1=h582d3a0_0
prefix: /Users/joshua/mambaforge/envs/plot_opt_out

## plot_optimise_out.py
# script to plot the results of a density and hvap qubekit fitting in force balance
import os
import re
from collections import defaultdict
from typing import Dict, List, Tuple

import click
import matplotlib.pyplot as plt
import numpy as np
from bokeh.palettes import Spectral4
from bokeh.plotting import Figure, save
from plotmol.plotmol import scatter
from pydantic import BaseModel, Field

custom_tooltip_template = """
<div>
    <div>
        <span>@title</span>
        <span><br>reference:@ref<br>calculated:@prediction</span>
        <img src="@image" ></img>
    </div>
</div>
"""

ansi_escape = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])")


class Prediction(BaseModel):
    """
    A class to hold the individual predictions of a property.
    """

    prediction: float = Field(..., description="The value for this prediction.")
    std: float = Field(..., description="The std for this prediction.")


class Property(BaseModel):
    """
    A general class to hold predictions.
    """

    id: str = Field(..., description="The id for the property prediction.")
    smiles: str = Field(
        ..., description="The smiles for the target molecule used in ploting."
    )
    property_name: str = Field(..., description="The property to be predicted.")
    reference: float = Field(
        ..., description="The reference value we are trying to describe."
    )
    predictions: List[Prediction] = Field(
        [], description="The list of predictions for this property."
    )


def parse_block(lines: List[str]) -> Tuple[str, Dict[str, List[float]]]:
    """
    Parse the block of lines into a dictionary of results.

    Args:
        lines: The list of lines which should be parsed.

    Returns:
        A dictionary of the properties extracted from the results.
    """
    data = dict()
    prop_name = lines[0].strip()
    # ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')
    prop_name = ansi_escape.sub("", prop_name)
    prop_name = prop_name.split()[1:-1]
    prop_name = " ".join(prop_name)
    for line in lines[3:]:
        ls = line.split()
        # data["Temperature K"].append(float(ls[0]))
        # data["Pressure"].append(float(ls[1]))
        # data["Pressure Unit"].append(ls[2])
        data["reference"] = float(ls[3])
        data["prediction"] = float(ls[4])
        data["std"] = float(ls[6])

    return prop_name, data


def parse_targets(optimise_lines) -> List[str]:
    """
    Parse the names of the targets for the FB fitting, we can use this to keep track of how many results we should have per iteration.
    """
    targets = []
    for line in optimise_lines:
        if "Setup for target " in line:
            target = line.split()[5]
            target = target.split("_")[0]
            targets.append(target)

    return targets


def parse_smiles(smiles_file: str) -> Dict[str, str]:
    """
    Parse a smiles file for molecules and labels these must match the labels in the optimise out file.
    """
    smiles_and_labels = dict()
    with open(smiles_file) as smiles:
        for line in smiles.readlines():
            data = line.split()
            if data:
                smiles_and_labels[data[1]] = data[0]
    return smiles_and_labels


def plot_property(
    property_name: str, iteration: int, plot_name: str, data: Dict[str, Property]
):
    """
    Plot the requested property.
    """

    figure = Figure(
        tooltips=custom_tooltip_template,
        title=property_name,
        x_axis_label="Calculated",
        y_axis_label="Reference",
        plot_width=500,
        plot_height=500,
    )

    calculated = []
    reference = []
    smiles = []
    titles = []
    for target_name, predictions in data.items():
        titles.append(target_name)
        reference.append(predictions[property_name].reference)
        smiles.append(predictions[property_name].smiles)
        calculated.append(predictions[property_name].predictions[iteration].prediction)

    scatter(
        figure=figure,
        x=calculated,
        y=reference,
        smiles=smiles,
        legend_label="x",
        marker_size=15,
        custom_column_data={
            "title": titles,
            "prediction": [str(c) for c in calculated],
        },
    )
    figure.legend.location = "top_left"
    figure.legend.click_policy = "hide"
    figure.line([min(calculated), max(calculated)], [min(calculated), max(calculated)])
    save(figure, title=plot_name, filename=plot_name + ".html")


def plot_initial_vs_final_property(
    property_name: str, plot_name: str, data: Dict[str, Property]
):
    """
    Plot the intial and final values on the same plot.
    """
    palette = Spectral4
    figure = Figure(
        tooltips=custom_tooltip_template,
        title=property_name,
        x_axis_label="Calculated",
        y_axis_label="Reference",
        plot_width=500,
        plot_height=500,
    )
    plot_labels = [(0, "Initial force field"), (-1, "Optimised force field")]
    for i, (iteration, label) in enumerate(plot_labels):
        calculated = []
        reference = []
        smiles = []
        titles = []
        for target_name, predictions in data.items():
            titles.append(target_name)
            reference.append(predictions[property_name].reference)
            smiles.append(predictions[property_name].smiles)
            calculated.append(
                predictions[property_name].predictions[iteration].prediction
            )

        scatter(
            figure=figure,
            x=calculated,
            y=reference,
            smiles=smiles,
            legend_label=label,
            marker_size=15,
            marker_color=palette[i],
            custom_column_data={
                "title": titles,
                "prediction": calculated,
                "ref": reference,
            },
        )
    figure.legend.location = "top_left"
    figure.legend.click_policy = "hide"
    figure.line(
        [min(calculated), max(calculated)],
        [min(calculated), max(calculated)],
        line_dash="dashed",
        line_color="black",
        legend_label="x=y",
    )
    save(figure, title=plot_name, filename=plot_name + ".html")


def plot_objective(objective_func: List[float], plot_name: str):
    """
    simple plot of the objective function.
    """
    iterations = [i for i in range(len(objective_func))]
    plt.plot(iterations, objective_func, marker="o")
    plt.xlabel("Iteration")
    plt.ylabel("Objective function")
    plt.savefig(plot_name)
    plt.close()


def plot_rfree(rfree_dict: Dict[str, List[float]], plot_name: str):
    """
    Make a simple plot to show the progression of the Rfree values during the optimisation.
    """
    element_to_colour = {
        "CElement/cfree": "black",
        "NElement/nfree": "royalblue",
        "OElement/ofree": "tomato",
        "HElement/hfree": "silver",
        "XElement/xfree": "darkorange",
        "xalpha/alpha": "brown",
        "xbeta/beta": "olivedrab",
        "FElement/ffree": "deepskyblue",
        "SElement/sfree": "gold",
        "BrElement/brfree": "firebrick",
        "ClElement/clfree": "lime",
        "IElement/ifree": "darkviolet",
        "BElement/bfree": "hotpink",
        "SiElement/sifree": "lightgray",
    }
    iterations = [i for i in range(len(list(rfree_dict.values())[0]))]
    for param_name, values in rfree_dict.items():
        plt.plot(
            iterations,
            values,
            marker="o",
            color=element_to_colour[param_name],
            label=param_name,
        )
    plt.xlabel("Iteration")
    plt.ylabel("Rfree A")
    plt.legend()
    plt.savefig(plot_name)
    plt.close()


def parse_rfree(rfree_lines: List[str]) -> Dict[str, float]:
    """
    Parse the rfree parameters from a forcebalance parameter block and return the values.
    """
    parameters = {}

    for line in rfree_lines:
        if "----------------------------------------------------------" in line:
            return parameters
        else:
            parameter = float(line.split()[2])
            p_name = line.split()[-1]
            parameters[p_name] = parameter


def MUE(experiment_values: List[float], prediction_values: List[float]) -> float:
    """Calculate the MUE for a set of experimental values and corresponding predictions."""
    return np.mean(np.absolute(experiment_values - prediction_values))


def compute_bootstraped_statistics(
    properties: List[Property],
    optimisation_iteration: int,
    bootstrap_iterations: int = 1000,
    interval: float = 0.95,
) -> Dict[str, float]:
    """
    Compute bootstraped statistics for the optimisation.
    """
    # make a list of experimental and predicted values
    experimental_values = np.array([property.reference for property in properties])
    prediction_values = np.array(
        [
            property.predictions[optimisation_iteration].prediction
            for property in properties
        ]
    )
    mue = MUE(
        experiment_values=experimental_values, prediction_values=prediction_values
    )
    samples = len(experimental_values)
    sample_statistics = []
    for _ in range(bootstrap_iterations):
        sample_indicies = np.random.randint(low=0, high=samples, size=samples)
        sample_experiment_values = experimental_values[sample_indicies]
        sample_prediction_values = prediction_values[sample_indicies]

        sample_statistics.append(
            MUE(
                experiment_values=sample_experiment_values,
                prediction_values=sample_prediction_values,
            )
        )

    # Compute the confidence intervals.
    lower_percentile_index = int(bootstrap_iterations * (1 - interval) / 2)
    upper_percentile_index = int(bootstrap_iterations * (1 + interval) / 2)

    sorted_samples = np.sort(sample_statistics)

    return {
        "lower": sorted_samples[lower_percentile_index],
        "higher": sorted_samples[upper_percentile_index],
        "mue": mue,
    }


@click.command()
@click.argument("optimise_file", type=click.Path(exists=True))
@click.argument("model_name", type=click.STRING)
@click.argument(
    "smiles_file", type=click.Path(exists=True, file_okay=True, dir_okay=False)
)
def parse_optimise_out(
    optimise_file: str, model_name: str, smiles_file: str
) -> Dict[str, Dict[str, List[float]]]:
    """
    Parse the optimise outfile for all relavent data to the optimisation.

    Returns:
        A dict of results by molecule target name. Also includes the values of the optimised parameters per iteration under parameters.
    """

    available_properties = ["Density (kg m^-3)", "Enthalpy of Vaporization (kJ mol^-1)"]
    all_results = {}
    molecules_and_labels = parse_smiles(smiles_file=smiles_file)
    # the place to write the data to
    os.makedirs(model_name, exist_ok=True)

    with open(optimise_file) as f:
        lines = f.readlines()

        targets = parse_targets(optimise_lines=lines)
        all_results = dict((target, dict()) for target in targets)
        objective_func = []
        rfree_opt_params = defaultdict(list)
        for name in molecules_and_labels.keys():
            assert name in all_results

        start_line = 0
        reading = False
        iteration = -1
        for i, line in enumerate(lines):
            # sometimes the object rises and so it is re-evaluated at the last point.
            if "Iteration" in line or "Objective function rises!" in line:
                iteration += 1
                # reset
                print("reading data for iteration", line.split()[3])
                # os.makedirs(f"iteration_{iteration}", exist_ok=True)

            for prop in available_properties:
                if prop in line:
                    start_line = i
                    reading = True
            if (
                "--------------------------------------------------------------------------------------------"
                in line
                and reading
            ):
                property_name, data = parse_block(lines[start_line:i])
                # handle the output and save
                target_string = property_name.split()
                target_name = target_string[0]
                target_name = target_name.split("_")[0]
                property_string = " ".join(target_string[1:])
                if property_string not in all_results[target_name]:
                    property_data = Property(
                        id=target_name,
                        smiles=molecules_and_labels[target_name],
                        property_name=property_string,
                        reference=data["reference"],
                        predictions=[
                            Prediction(prediction=data["prediction"], std=data["std"])
                        ],
                    )
                    all_results[target_name][property_string] = property_data
                else:
                    all_results[target_name][property_string].predictions.append(
                        Prediction(prediction=data["prediction"], std=data["std"])
                    )
            elif (
                "Step       |k|        |dk|       |grad|       -=X2=-     Delta(X2)    StepQual"
                in line
            ):
                objective = ansi_escape.sub("", lines[i + 1])
                objective = float(objective.split()[4])
                objective_func.append(objective)
            elif "Physical Parameters (Current + Step = Next)" in line:
                rfree_data = parse_rfree(lines[i + 2 :])
                for key, value in rfree_data.items():
                    rfree_opt_params[key].append(value)
        # now we have all the data plot the initial values
        plot_initial_vs_final_property(
            property_name="Density (kg m^-3)",
            plot_name=os.path.join(model_name, "density"),
            data=all_results,
        )
        plot_initial_vs_final_property(
            property_name="Enthalpy of Vaporization (kJ mol^-1)",
            plot_name=os.path.join(model_name, "hvap"),
            data=all_results,
        )
        plot_objective(
            objective_func, os.path.join(model_name, "objective_function.pdf")
        )
        plot_rfree(rfree_opt_params, os.path.join(model_name, "rfree.pdf"))
        density_predictions = [
            value["Density (kg m^-3)"] for value in all_results.values()
        ]
        hvap_predictions = [
            value["Enthalpy of Vaporization (kJ mol^-1)"]
            for value in all_results.values()
        ]
        density_data_initial = compute_bootstraped_statistics(
            properties=density_predictions, optimisation_iteration=0
        )
        density_data_final = compute_bootstraped_statistics(
            properties=density_predictions, optimisation_iteration=-1
        )
        hvap_data_initial = compute_bootstraped_statistics(
            properties=hvap_predictions, optimisation_iteration=0
        )
        hvap_data_final = compute_bootstraped_statistics(
            properties=hvap_predictions, optimisation_iteration=-1
        )
        with open(os.path.join(model_name, "results.txt"), "w") as output:
            output.write("Results\n")
            output.write(
                f"Initial Density MUE: {density_data_initial['mue']}   (kg m^-3) upper: {density_data_initial['higher']} lower: {density_data_initial['lower']}\n"
            )
            output.write(
                f"Final   Density MUE: {density_data_final['mue']}  (kg m^-3) upper: {density_data_final['higher']} lower: {density_data_final['lower']}\n"
            )
            output.write(
                f"Initial    HVAP MUE: {hvap_data_initial['mue']} (kJ mol^-1) upper: {hvap_data_initial['higher']} lower: {hvap_data_initial['lower']}\n"
            )
            output.write(
                f"Final      HVAP MUE: {hvap_data_final['mue']} (kJ mol^-1) upper: {hvap_data_final['higher']} lower: {hvap_data_final['lower']}\n"
            )


if __name__ == "__main__":
    parse_optimise_out()
	name: plot_opt_out
	channels:
	- simonboothroyd
	- conda-forge
	dependencies:
	- black=22.3.0=pyhd8ed1ab_0
	- bokeh=2.4.2=py39h6e9494a_0
	- boost=1.74.0=py39ha1f3e3e_4
	- boost-cpp=1.74.0=hf3dc895_5
	- brotli=1.0.9=h0d85af4_6
	- brotli-bin=1.0.9=h0d85af4_6
	- bzip2=1.0.8=h0d85af4_4
	- ca-certificates=2022.5.18.1=h033912b_0
	- cairo=1.16.0=he01c77b_1009
	- certifi=2022.5.18.1=py39h6e9494a_0
	- click=8.0.3=py39h6e9494a_1
	- cycler=0.11.0=pyhd8ed1ab_0
	- dataclasses=0.8=pyhc8e2a94_3
	- font-ttf-dejavu-sans-mono=2.37=hab24e00_0
	- font-ttf-inconsolata=3.000=h77eed37_0
	- font-ttf-source-code-pro=2.038=h77eed37_0
	- font-ttf-ubuntu=0.83=hab24e00_0
	- fontconfig=2.13.1=h10f422b_1005
	- fonts-conda-ecosystem=1=0
	- fonts-conda-forge=1=0
	- fonttools=4.28.5=py39h89e85a6_0
	- freetype=2.10.4=h4cff582_1
	- gettext=0.19.8.1=hd1a6beb_1008
	- greenlet=1.1.2=py39h9fcab8e_1
	- icu=69.1=he49afe7_0
	- isort=5.10.1=pyhd8ed1ab_0
	- jbig=2.1=h0d85af4_2003
	- jinja2=3.0.3=pyhd8ed1ab_0
	- jpeg=9d=hbcb3906_0
	- kiwisolver=1.3.2=py39hf018cea_1
	- lcms2=2.12=h577c468_0
	- lerc=3.0=he49afe7_0
	- libblas=3.9.0=12_osx64_openblas
	- libbrotlicommon=1.0.9=h0d85af4_6
	- libbrotlidec=1.0.9=h0d85af4_6
	- libbrotlienc=1.0.9=h0d85af4_6
	- libcblas=3.9.0=12_osx64_openblas
	- libcxx=12.0.1=habf9029_1
	- libdeflate=1.8=h0d85af4_0
	- libffi=3.4.2=h0d85af4_5
	- libgfortran=5.0.0=9_3_0_h6c81a4c_23
	- libgfortran5=9.3.0=h6c81a4c_23
	- libglib=2.70.2=hf1fb8c0_1
	- libiconv=1.16=haf1e3a3_0
	- liblapack=3.9.0=12_osx64_openblas
	- libopenblas=0.3.18=openmp_h3351f45_0
	- libpng=1.6.37=h7cec526_2
	- libtiff=4.3.0=hd146c10_2
	- libwebp-base=1.2.1=h0d85af4_0
	- libxml2=2.9.12=h7e28ab6_1
	- libzlib=1.2.11=h9173be1_1013
	- llvm-openmp=12.0.1=hda6cdc1_1
	- lz4-c=1.9.3=he49afe7_1
	- markupsafe=2.0.1=py39h89e85a6_1
	- matplotlib=3.5.1=py39h6e9494a_0
	- matplotlib-base=3.5.1=py39hb07454d_0
	- munkres=1.1.4=pyh9f0ad1d_0
	- mypy_extensions=0.4.3=py39h6e9494a_5
	- ncurses=6.2=h2e338ed_4
	- numpy=1.22.0=py39h9d9ce41_0
	- olefile=0.46=pyh9f0ad1d_1
	- openjpeg=2.4.0=h6e7aa92_1
	- openssl=3.0.3=hfe4f2af_0
	- packaging=21.3=pyhd8ed1ab_0
	- pandas=1.3.5=py39h4d6be9b_0
	- pathspec=0.9.0=pyhd8ed1ab_0
	- pcre=8.45=he49afe7_0
	- pillow=8.4.0=py39he9bb72f_0
	- pip=21.3.1=pyhd8ed1ab_0
	- pixman=0.40.0=hbcb3906_0
	- platformdirs=2.5.1=pyhd8ed1ab_0
	- plotmol=0.0.3=pyhcce984b_0
	- pycairo=1.20.1=py39hbe14034_1
	- pydantic=1.9.0=py39h89e85a6_0
	- pyparsing=3.0.6=pyhd8ed1ab_0
	- python=3.9.9=h38b4d05_0_cpython
	- python-dateutil=2.8.2=pyhd8ed1ab_0
	- python_abi=3.9=2_cp39
	- pytz=2021.3=pyhd8ed1ab_0
	- pyyaml=6.0=py39h89e85a6_3
	- rdkit=2021.09.4=py39h88273a1_0
	- readline=8.1=h05e3726_0
	- reportlab=3.5.68=py39hf37cc50_1
	- setuptools=60.5.0=py39h6e9494a_0
	- six=1.16.0=pyh6c4a22f_0
	- sqlalchemy=1.4.29=py39h89e85a6_0
	- sqlite=3.37.0=h23a322b_0
	- tk=8.6.11=h5dbffcc_1
	- tomli=2.0.1=pyhd8ed1ab_0
	- tornado=6.1=py39h89e85a6_2
	- typed-ast=1.5.4=py39h701faf5_0
	- typing-extensions=4.0.1=hd8ed1ab_0
	- typing_extensions=4.0.1=pyha770c72_0
	- tzdata=2021e=he74cb21_0
	- wheel=0.37.1=pyhd8ed1ab_0
	- xz=5.2.5=haf1e3a3_1
	- yaml=0.2.5=h0d85af4_2
	- zlib=1.2.11=h9173be1_1013
	- zstd=1.5.1=h582d3a0_0
	prefix: /Users/joshua/mambaforge/envs/plot_opt_out
	# script to plot the results of a density and hvap qubekit fitting in force balance
	import os
	import re
	from collections import defaultdict
	from typing import Dict, List, Tuple

	import click
	import matplotlib.pyplot as plt
	import numpy as np
	from bokeh.palettes import Spectral4
	from bokeh.plotting import Figure, save
	from plotmol.plotmol import scatter
	from pydantic import BaseModel, Field

	custom_tooltip_template = """
	<div>
	<div>
	<span>@title</span>
	<span><br>reference:@ref<br>calculated:@prediction</span>
	<img src="@image" ></img>
	</div>
	</div>
	"""

	ansi_escape = re.compile(r"\x1B(?:[@-Z\\-_]\|\[[0-?][ -/][@-~])")


	class Prediction(BaseModel):
	"""
	A class to hold the individual predictions of a property.
	"""

	prediction: float = Field(..., description="The value for this prediction.")
	std: float = Field(..., description="The std for this prediction.")


	class Property(BaseModel):
	"""
	A general class to hold predictions.
	"""

	id: str = Field(..., description="The id for the property prediction.")
	smiles: str = Field(
	..., description="The smiles for the target molecule used in ploting."
	)
	property_name: str = Field(..., description="The property to be predicted.")
	reference: float = Field(
	..., description="The reference value we are trying to describe."
	)
	predictions: List[Prediction] = Field(
	[], description="The list of predictions for this property."
	)


	def parse_block(lines: List[str]) -> Tuple[str, Dict[str, List[float]]]:
	"""
	Parse the block of lines into a dictionary of results.

	Args:
	lines: The list of lines which should be parsed.

	Returns:
	A dictionary of the properties extracted from the results.
	"""
	data = dict()
	prop_name = lines[0].strip()
	# ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]\|\[[0-?][ -/][@-~])')
	prop_name = ansi_escape.sub("", prop_name)
	prop_name = prop_name.split()[1:-1]
	prop_name = " ".join(prop_name)
	for line in lines[3:]:
	ls = line.split()
	# data["Temperature K"].append(float(ls[0]))
	# data["Pressure"].append(float(ls[1]))
	# data["Pressure Unit"].append(ls[2])
	data["reference"] = float(ls[3])
	data["prediction"] = float(ls[4])
	data["std"] = float(ls[6])

	return prop_name, data


	def parse_targets(optimise_lines) -> List[str]:
	"""
	Parse the names of the targets for the FB fitting, we can use this to keep track of how many results we should have per iteration.
	"""
	targets = []
	for line in optimise_lines:
	if "Setup for target " in line:
	target = line.split()[5]
	target = target.split("_")[0]
	targets.append(target)

	return targets


	def parse_smiles(smiles_file: str) -> Dict[str, str]:
	"""
	Parse a smiles file for molecules and labels these must match the labels in the optimise out file.
	"""
	smiles_and_labels = dict()
	with open(smiles_file) as smiles:
	for line in smiles.readlines():
	data = line.split()
	if data:
	smiles_and_labels[data[1]] = data[0]
	return smiles_and_labels


	def plot_property(
	property_name: str, iteration: int, plot_name: str, data: Dict[str, Property]
	):
	"""
	Plot the requested property.
	"""

	figure = Figure(
	tooltips=custom_tooltip_template,
	title=property_name,
	x_axis_label="Calculated",
	y_axis_label="Reference",
	plot_width=500,
	plot_height=500,
	)

	calculated = []
	reference = []
	smiles = []
	titles = []
	for target_name, predictions in data.items():
	titles.append(target_name)
	reference.append(predictions[property_name].reference)
	smiles.append(predictions[property_name].smiles)
	calculated.append(predictions[property_name].predictions[iteration].prediction)

	scatter(
	figure=figure,
	x=calculated,
	y=reference,
	smiles=smiles,
	legend_label="x",
	marker_size=15,
	custom_column_data={
	"title": titles,
	"prediction": [str(c) for c in calculated],
	},
	)
	figure.legend.location = "top_left"
	figure.legend.click_policy = "hide"
	figure.line([min(calculated), max(calculated)], [min(calculated), max(calculated)])
	save(figure, title=plot_name, filename=plot_name + ".html")


	def plot_initial_vs_final_property(
	property_name: str, plot_name: str, data: Dict[str, Property]
	):
	"""
	Plot the intial and final values on the same plot.
	"""
	palette = Spectral4
	figure = Figure(
	tooltips=custom_tooltip_template,
	title=property_name,
	x_axis_label="Calculated",
	y_axis_label="Reference",
	plot_width=500,
	plot_height=500,
	)
	plot_labels = [(0, "Initial force field"), (-1, "Optimised force field")]
	for i, (iteration, label) in enumerate(plot_labels):
	calculated = []
	reference = []
	smiles = []
	titles = []
	for target_name, predictions in data.items():
	titles.append(target_name)
	reference.append(predictions[property_name].reference)
	smiles.append(predictions[property_name].smiles)
	calculated.append(
	predictions[property_name].predictions[iteration].prediction
	)

	scatter(
	figure=figure,
	x=calculated,
	y=reference,
	smiles=smiles,
	legend_label=label,
	marker_size=15,
	marker_color=palette[i],
	custom_column_data={
	"title": titles,
	"prediction": calculated,
	"ref": reference,
	},
	)
	figure.legend.location = "top_left"
	figure.legend.click_policy = "hide"
	figure.line(
	[min(calculated), max(calculated)],
	[min(calculated), max(calculated)],
	line_dash="dashed",
	line_color="black",
	legend_label="x=y",
	)
	save(figure, title=plot_name, filename=plot_name + ".html")


	def plot_objective(objective_func: List[float], plot_name: str):
	"""
	simple plot of the objective function.
	"""
	iterations = [i for i in range(len(objective_func))]
	plt.plot(iterations, objective_func, marker="o")
	plt.xlabel("Iteration")
	plt.ylabel("Objective function")
	plt.savefig(plot_name)
	plt.close()


	def plot_rfree(rfree_dict: Dict[str, List[float]], plot_name: str):
	"""
	Make a simple plot to show the progression of the Rfree values during the optimisation.
	"""
	element_to_colour = {
	"CElement/cfree": "black",
	"NElement/nfree": "royalblue",
	"OElement/ofree": "tomato",
	"HElement/hfree": "silver",
	"XElement/xfree": "darkorange",
	"xalpha/alpha": "brown",
	"xbeta/beta": "olivedrab",
	"FElement/ffree": "deepskyblue",
	"SElement/sfree": "gold",
	"BrElement/brfree": "firebrick",
	"ClElement/clfree": "lime",
	"IElement/ifree": "darkviolet",
	"BElement/bfree": "hotpink",
	"SiElement/sifree": "lightgray",
	}
	iterations = [i for i in range(len(list(rfree_dict.values())[0]))]
	for param_name, values in rfree_dict.items():
	plt.plot(
	iterations,
	values,
	marker="o",
	color=element_to_colour[param_name],
	label=param_name,
	)
	plt.xlabel("Iteration")
	plt.ylabel("Rfree A")
	plt.legend()
	plt.savefig(plot_name)
	plt.close()


	def parse_rfree(rfree_lines: List[str]) -> Dict[str, float]:
	"""
	Parse the rfree parameters from a forcebalance parameter block and return the values.
	"""
	parameters = {}

	for line in rfree_lines:
	if "----------------------------------------------------------" in line:
	return parameters
	else:
	parameter = float(line.split()[2])
	p_name = line.split()[-1]
	parameters[p_name] = parameter


	def MUE(experiment_values: List[float], prediction_values: List[float]) -> float:
	"""Calculate the MUE for a set of experimental values and corresponding predictions."""
	return np.mean(np.absolute(experiment_values - prediction_values))


	def compute_bootstraped_statistics(
	properties: List[Property],
	optimisation_iteration: int,
	bootstrap_iterations: int = 1000,
	interval: float = 0.95,
	) -> Dict[str, float]:
	"""
	Compute bootstraped statistics for the optimisation.
	"""
	# make a list of experimental and predicted values
	experimental_values = np.array([property.reference for property in properties])
	prediction_values = np.array(
	[
	property.predictions[optimisation_iteration].prediction
	for property in properties
	]
	)
	mue = MUE(
	experiment_values=experimental_values, prediction_values=prediction_values
	)
	samples = len(experimental_values)
	sample_statistics = []
	for _ in range(bootstrap_iterations):
	sample_indicies = np.random.randint(low=0, high=samples, size=samples)
	sample_experiment_values = experimental_values[sample_indicies]
	sample_prediction_values = prediction_values[sample_indicies]

	sample_statistics.append(
	MUE(
	experiment_values=sample_experiment_values,
	prediction_values=sample_prediction_values,
	)
	)

	# Compute the confidence intervals.
	lower_percentile_index = int(bootstrap_iterations * (1 - interval) / 2)
	upper_percentile_index = int(bootstrap_iterations * (1 + interval) / 2)

	sorted_samples = np.sort(sample_statistics)

	return {
	"lower": sorted_samples[lower_percentile_index],
	"higher": sorted_samples[upper_percentile_index],
	"mue": mue,
	}


	@click.command()
	@click.argument("optimise_file", type=click.Path(exists=True))
	@click.argument("model_name", type=click.STRING)
	@click.argument(
	"smiles_file", type=click.Path(exists=True, file_okay=True, dir_okay=False)
	)
	def parse_optimise_out(
	optimise_file: str, model_name: str, smiles_file: str
	) -> Dict[str, Dict[str, List[float]]]:
	"""
	Parse the optimise outfile for all relavent data to the optimisation.

	Returns:
	A dict of results by molecule target name. Also includes the values of the optimised parameters per iteration under parameters.
	"""

	available_properties = ["Density (kg m^-3)", "Enthalpy of Vaporization (kJ mol^-1)"]
	all_results = {}
	molecules_and_labels = parse_smiles(smiles_file=smiles_file)
	# the place to write the data to
	os.makedirs(model_name, exist_ok=True)

	with open(optimise_file) as f:
	lines = f.readlines()

	targets = parse_targets(optimise_lines=lines)
	all_results = dict((target, dict()) for target in targets)
	objective_func = []
	rfree_opt_params = defaultdict(list)
	for name in molecules_and_labels.keys():
	assert name in all_results

	start_line = 0
	reading = False
	iteration = -1
	for i, line in enumerate(lines):
	# sometimes the object rises and so it is re-evaluated at the last point.
	if "Iteration" in line or "Objective function rises!" in line:
	iteration += 1
	# reset
	print("reading data for iteration", line.split()[3])
	# os.makedirs(f"iteration_{iteration}", exist_ok=True)

	for prop in available_properties:
	if prop in line:
	start_line = i
	reading = True
	if (
	"--------------------------------------------------------------------------------------------"
	in line
	and reading
	):
	property_name, data = parse_block(lines[start_line:i])
	# handle the output and save
	target_string = property_name.split()
	target_name = target_string[0]
	target_name = target_name.split("_")[0]
	property_string = " ".join(target_string[1:])
	if property_string not in all_results[target_name]:
	property_data = Property(
	id=target_name,
	smiles=molecules_and_labels[target_name],
	property_name=property_string,
	reference=data["reference"],
	predictions=[
	Prediction(prediction=data["prediction"], std=data["std"])
	],
	)
	all_results[target_name][property_string] = property_data
	else:
	all_results[target_name][property_string].predictions.append(
	Prediction(prediction=data["prediction"], std=data["std"])
	)
	elif (
	"Step \|k\| \|dk\| \|grad\| -=X2=- Delta(X2) StepQual"
	in line
	):
	objective = ansi_escape.sub("", lines[i + 1])
	objective = float(objective.split()[4])
	objective_func.append(objective)
	elif "Physical Parameters (Current + Step = Next)" in line:
	rfree_data = parse_rfree(lines[i + 2 :])
	for key, value in rfree_data.items():
	rfree_opt_params[key].append(value)
	# now we have all the data plot the initial values
	plot_initial_vs_final_property(
	property_name="Density (kg m^-3)",
	plot_name=os.path.join(model_name, "density"),
	data=all_results,
	)
	plot_initial_vs_final_property(
	property_name="Enthalpy of Vaporization (kJ mol^-1)",
	plot_name=os.path.join(model_name, "hvap"),
	data=all_results,
	)
	plot_objective(
	objective_func, os.path.join(model_name, "objective_function.pdf")
	)
	plot_rfree(rfree_opt_params, os.path.join(model_name, "rfree.pdf"))
	density_predictions = [
	value["Density (kg m^-3)"] for value in all_results.values()
	]
	hvap_predictions = [
	value["Enthalpy of Vaporization (kJ mol^-1)"]
	for value in all_results.values()
	]
	density_data_initial = compute_bootstraped_statistics(
	properties=density_predictions, optimisation_iteration=0
	)
	density_data_final = compute_bootstraped_statistics(
	properties=density_predictions, optimisation_iteration=-1
	)
	hvap_data_initial = compute_bootstraped_statistics(
	properties=hvap_predictions, optimisation_iteration=0
	)
	hvap_data_final = compute_bootstraped_statistics(
	properties=hvap_predictions, optimisation_iteration=-1
	)
	with open(os.path.join(model_name, "results.txt"), "w") as output:
	output.write("Results\n")
	output.write(
	f"Initial Density MUE: {density_data_initial['mue']} (kg m^-3) upper: {density_data_initial['higher']} lower: {density_data_initial['lower']}\n"
	)
	output.write(
	f"Final Density MUE: {density_data_final['mue']} (kg m^-3) upper: {density_data_final['higher']} lower: {density_data_final['lower']}\n"
	)
	output.write(
	f"Initial HVAP MUE: {hvap_data_initial['mue']} (kJ mol^-1) upper: {hvap_data_initial['higher']} lower: {hvap_data_initial['lower']}\n"
	)
	output.write(
	f"Final HVAP MUE: {hvap_data_final['mue']} (kJ mol^-1) upper: {hvap_data_final['higher']} lower: {hvap_data_final['lower']}\n"
	)


	if __name__ == "__main__":
	parse_optimise_out()