# iver56/visqol_docker_wrapper.py

## visqol_docker_wrapper.py
import os
import re
import subprocess
import tempfile
import uuid
from pathlib import Path

import numpy as np
from scipy.io.wavfile import write


def convert_float_samples_to_int16(y, clamp_values=True, dither=True):
    """
    Convert floating-point audio samples to 16-bit signed integers.

    The samples are multiplied by the largest int16 value (32767), optionally dithered,
    optionally clamped to the int16 range and finally cast to int16. The input itself is
    never modified.

    :param y: Floating-point numpy array (or array-like) of audio samples.
    :param clamp_values: Clamp the scaled samples to the int16 range [-32768, 32767], which
        corresponds to clipping input values outside of (roughly) [-1.0, 1.0]. This can be
        done to avoid integer overflow or underflow, which results in wrap distortion, which
        sounds worse than clipping distortion.
    :param dither: Whether or not to apply dithering. Dithering alleviates quantization noise.
        See https://www.youtube.com/watch?v=zWpWIQw7HWU for an explanation on dither.
        The dithering noise is triangular (between -1 and 1 in int16 units) and is drawn
        from numpy's global random state.
    :return: The converted samples as int16, with the same shape as the input.
    :raises ValueError: If the input samples are not floating-point.
    """
    samples = np.asarray(y)
    if not issubclass(samples.dtype.type, np.floating):
        raise ValueError("input samples not floating-point")

    int16_info = np.iinfo(np.int16)

    # float16 cannot represent 32767 (it rounds to 32768), so a full-scale float16 sample
    # would overflow int16 even after clamping. Do the arithmetic in float32 or wider.
    work_dtype = np.result_type(samples.dtype, np.float32)
    scaled = np.multiply(samples, int16_info.max, dtype=work_dtype)

    if dither:
        noise = np.random.triangular(-1, 0, 1, size=np.shape(scaled))
        scaled = (scaled + noise).astype(work_dtype)

    if clamp_values:
        # np.clip (unlike boolean-mask assignment) also works for 0-d input, where the
        # multiplication above yields a numpy scalar instead of an array.
        scaled = np.clip(scaled, int16_info.min, int16_info.max)

    # NOTE: astype() truncates toward zero; it does not round to the nearest integer.
    return scaled.astype(np.int16)


def calculate_visqol_in_audio_mode(
    degraded_audio: np.ndarray, reference_audio: np.ndarray, sample_rate: int
) -> float:
    """
    Given an audio pair (a degraded audio and a reference/target audio),
    return a MOS-LQO (Mean Opinion Score - Listening Quality Objective) score.
    MOS-LQO scores range from 1 (the worst) to 5 (the best).

    This uses VISQOL's "audio mode" (48 kHz), not "speech mode" (16 kHz).

    Both signals are written as 16-bit WAV files to the system temp directory, which is
    mounted into a "jonashaag/visqol:v3" docker container as /data. The temporary files
    are removed again afterwards, also when writing them or running docker fails.

    :param degraded_audio: Floating-point samples with shape (1, num_samples).
    :param reference_audio: Floating-point samples with shape (1, num_samples).
    :param sample_rate: Sample rate of both signals. Must be 48000.
    :return: The MOS-LQO score parsed from the VISQOL output.
    :raises subprocess.CalledProcessError: If the docker command exits with a non-zero code.
    :raises subprocess.TimeoutExpired: If the docker command takes longer than 60 seconds.
    :raises Exception: If the VISQOL output does not contain a MOS-LQO score.
    """
    assert sample_rate == 48000, "sample_rate must be 48000"
    assert degraded_audio.ndim == 2, "degraded_audio must be 2-dimensional"
    assert reference_audio.ndim == 2, "reference_audio must be 2-dimensional"
    assert degraded_audio.shape[0] == 1, "degraded_audio must have shape (1, num_samples)"
    assert reference_audio.shape[0] == 1, "reference_audio must have shape (1, num_samples)"

    tmp_dir = Path(tempfile.gettempdir())
    degraded_audio_file_path = tmp_dir / (str(uuid.uuid4()) + ".wav")
    reference_audio_file_path = tmp_dir / (str(uuid.uuid4()) + ".wav")

    try:
        # The arrays are transposed because the wav writer gets (num_samples, 1) data here.
        write(
            degraded_audio_file_path,
            sample_rate,
            convert_float_samples_to_int16(degraded_audio).T,
        )
        write(
            reference_audio_file_path,
            sample_rate,
            convert_float_samples_to_int16(reference_audio).T,
        )

        command_args = [
            "docker",
            "run",
            "--rm",
            "-t",
            "-v",
            "{}:/data".format(tmp_dir.as_posix()),
            "jonashaag/visqol:v3",
            "--degraded_file",
            "/data/{}".format(degraded_audio_file_path.name),
            "--reference_file",
            "/data/{}".format(reference_audio_file_path.name),
        ]
        visqol_output = subprocess.check_output(command_args, timeout=60.0).decode(
            "utf-8"
        )
    finally:
        # Always clean up, so that a failing write or docker call does not leave WAV
        # files behind in the temp directory.
        for wav_file_path in (degraded_audio_file_path, reference_audio_file_path):
            try:
                os.remove(wav_file_path)
            except FileNotFoundError:
                # The file was never created because an earlier step failed.
                pass

    regex = re.compile(r"MOS-LQO:\s*(?P<mos_lqo>[0-9.]+)")
    match = regex.search(visqol_output)
    if match:
        return float(match.group("mos_lqo"))
    else:
        raise Exception(
            "Failed to calculate VISQOL - response does not contain MOS-LQO."
            " Actual response: {}".format(visqol_output)
        )
	import os
	import re
	import subprocess
	import tempfile
	import uuid
	from pathlib import Path

	import numpy as np
	from scipy.io.wavfile import write


	def convert_float_samples_to_int16(y, clamp_values=True, dither=True):
	    """
	    Convert floating-point audio samples to 16-bit signed integers.

	    The samples are multiplied by the largest int16 value (32767), optionally dithered,
	    optionally clamped to the int16 range and finally cast to int16. The input itself is
	    never modified.

	    :param y: Floating-point numpy array (or array-like) of audio samples.
	    :param clamp_values: Clamp the scaled samples to the int16 range [-32768, 32767], which
	        corresponds to clipping input values outside of (roughly) [-1.0, 1.0]. This can be
	        done to avoid integer overflow or underflow, which results in wrap distortion, which
	        sounds worse than clipping distortion.
	    :param dither: Whether or not to apply dithering. Dithering alleviates quantization noise.
	        See https://www.youtube.com/watch?v=zWpWIQw7HWU for an explanation on dither.
	        The dithering noise is triangular (between -1 and 1 in int16 units) and is drawn
	        from numpy's global random state.
	    :return: The converted samples as int16, with the same shape as the input.
	    :raises ValueError: If the input samples are not floating-point.
	    """
	    samples = np.asarray(y)
	    if not issubclass(samples.dtype.type, np.floating):
	        raise ValueError("input samples not floating-point")

	    int16_info = np.iinfo(np.int16)

	    # float16 cannot represent 32767 (it rounds to 32768), so a full-scale float16 sample
	    # would overflow int16 even after clamping. Do the arithmetic in float32 or wider.
	    work_dtype = np.result_type(samples.dtype, np.float32)
	    scaled = np.multiply(samples, int16_info.max, dtype=work_dtype)

	    if dither:
	        noise = np.random.triangular(-1, 0, 1, size=np.shape(scaled))
	        scaled = (scaled + noise).astype(work_dtype)

	    if clamp_values:
	        # np.clip (unlike boolean-mask assignment) also works for 0-d input, where the
	        # multiplication above yields a numpy scalar instead of an array.
	        scaled = np.clip(scaled, int16_info.min, int16_info.max)

	    # NOTE: astype() truncates toward zero; it does not round to the nearest integer.
	    return scaled.astype(np.int16)


	def calculate_visqol_in_audio_mode(
	    degraded_audio: np.ndarray, reference_audio: np.ndarray, sample_rate: int
	) -> float:
	    """
	    Given an audio pair (a degraded audio and a reference/target audio),
	    return a MOS-LQO (Mean Opinion Score - Listening Quality Objective) score.
	    MOS-LQO scores range from 1 (the worst) to 5 (the best).

	    This uses VISQOL's "audio mode" (48 kHz), not "speech mode" (16 kHz).

	    Both signals are written as 16-bit WAV files to the system temp directory, which is
	    mounted into a "jonashaag/visqol:v3" docker container as /data. The temporary files
	    are removed again afterwards, also when writing them or running docker fails.

	    :param degraded_audio: Floating-point samples with shape (1, num_samples).
	    :param reference_audio: Floating-point samples with shape (1, num_samples).
	    :param sample_rate: Sample rate of both signals. Must be 48000.
	    :return: The MOS-LQO score parsed from the VISQOL output.
	    :raises subprocess.CalledProcessError: If the docker command exits with a non-zero code.
	    :raises subprocess.TimeoutExpired: If the docker command takes longer than 60 seconds.
	    :raises Exception: If the VISQOL output does not contain a MOS-LQO score.
	    """
	    assert sample_rate == 48000, "sample_rate must be 48000"
	    assert degraded_audio.ndim == 2, "degraded_audio must be 2-dimensional"
	    assert reference_audio.ndim == 2, "reference_audio must be 2-dimensional"
	    assert degraded_audio.shape[0] == 1, "degraded_audio must have shape (1, num_samples)"
	    assert reference_audio.shape[0] == 1, "reference_audio must have shape (1, num_samples)"

	    tmp_dir = Path(tempfile.gettempdir())
	    degraded_audio_file_path = tmp_dir / (str(uuid.uuid4()) + ".wav")
	    reference_audio_file_path = tmp_dir / (str(uuid.uuid4()) + ".wav")

	    try:
	        # The arrays are transposed because the wav writer gets (num_samples, 1) data here.
	        write(
	            degraded_audio_file_path,
	            sample_rate,
	            convert_float_samples_to_int16(degraded_audio).T,
	        )
	        write(
	            reference_audio_file_path,
	            sample_rate,
	            convert_float_samples_to_int16(reference_audio).T,
	        )

	        command_args = [
	            "docker",
	            "run",
	            "--rm",
	            "-t",
	            "-v",
	            "{}:/data".format(tmp_dir.as_posix()),
	            "jonashaag/visqol:v3",
	            "--degraded_file",
	            "/data/{}".format(degraded_audio_file_path.name),
	            "--reference_file",
	            "/data/{}".format(reference_audio_file_path.name),
	        ]
	        visqol_output = subprocess.check_output(command_args, timeout=60.0).decode(
	            "utf-8"
	        )
	    finally:
	        # Always clean up, so that a failing write or docker call does not leave WAV
	        # files behind in the temp directory.
	        for wav_file_path in (degraded_audio_file_path, reference_audio_file_path):
	            try:
	                os.remove(wav_file_path)
	            except FileNotFoundError:
	                # The file was never created because an earlier step failed.
	                pass

	    regex = re.compile(r"MOS-LQO:\s*(?P<mos_lqo>[0-9.]+)")
	    match = regex.search(visqol_output)
	    if match:
	        return float(match.group("mos_lqo"))
	    else:
	        raise Exception(
	            "Failed to calculate VISQOL - response does not contain MOS-LQO."
	            " Actual response: {}".format(visqol_output)
	        )