# jesseengel/mel_scaling.py

## mel_scaling.py
# Mel spectrum constants shared by mel_to_hertz() and hertz_to_mel():
#   mel = _MEL_HIGH_FREQUENCY_Q * ln(1 + hertz / _MEL_BREAK_FREQUENCY_HERTZ)
_MEL_BREAK_FREQUENCY_HERTZ = 700.0
_MEL_HIGH_FREQUENCY_Q = 1127.0


def mel_to_hertz(mel_values):
  """Maps mel-scale values back to frequencies in Hertz.

  Inverse of `hertz_to_mel`.

  Args:
    mel_values: Scalar or array-like of mel-scale values.

  Returns:
    Numpy array (0-d for scalar input) of the corresponding frequencies in
    Hertz.
  """
  mels = np.array(mel_values)
  exponent = mels / _MEL_HIGH_FREQUENCY_Q
  return _MEL_BREAK_FREQUENCY_HERTZ * (np.exp(exponent) - 1.0)


def hertz_to_mel(frequencies_hertz):
  """Maps frequencies in Hertz onto the mel scale.

  Inverse of `mel_to_hertz`.

  Args:
    frequencies_hertz: Scalar or array-like of frequencies in Hertz.

  Returns:
    Numpy array (0-d for scalar input) of the corresponding mel values.
  """
  hertz = np.array(frequencies_hertz)
  ratio = hertz / _MEL_BREAK_FREQUENCY_HERTZ
  return _MEL_HIGH_FREQUENCY_Q * np.log(1.0 + ratio)


def linear_to_mel_weight_matrix(num_mel_bins=20,
                                num_spectrogram_bins=129,
                                sample_rate=16000,
                                lower_edge_hertz=125.0,
                                upper_edge_hertz=3800.0):
  """Returns a matrix to warp linear scale spectrograms to the mel scale.

  Adapted from tf.signal.linear_to_mel_weight_matrix, with one addition: a
  band whose width in Hertz is below 1.5 * freq_res is widened symmetrically
  (in mel) around its center until it spans that width. The computation uses
  NumPy only, so this function can be constant folded by graph optimization
  since there are no Tensor inputs.

  NOTE(review): no dtype cast is performed anywhere in this function. The
  result keeps the precision NumPy chooses for the intermediate arrays
  (float64 by default), not float32; callers needing float32 must cast.

  Args:
    num_mel_bins: Int, number of output frequency dimensions.
    num_spectrogram_bins: Int, number of input frequency dimensions.
    sample_rate: Int, sample rate of the audio.
    lower_edge_hertz: Float, lowest frequency to consider.
    upper_edge_hertz: Float, highest frequency to consider.

  Returns:
    Numpy matrix of shape [num_spectrogram_bins, num_mel_bins]. The first row
    (the DC bin) is all zeros.

  Raises:
    ValueError: Input argument in the wrong range.
  """
  # Validate input arguments
  if num_mel_bins <= 0:
    raise ValueError('num_mel_bins must be positive. Got: %s' % num_mel_bins)
  if num_spectrogram_bins <= 0:
    raise ValueError(
        'num_spectrogram_bins must be positive. Got: %s' % num_spectrogram_bins)
  if sample_rate <= 0.0:
    raise ValueError('sample_rate must be positive. Got: %s' % sample_rate)
  if lower_edge_hertz < 0.0:
    raise ValueError(
        'lower_edge_hertz must be non-negative. Got: %s' % lower_edge_hertz)
  if lower_edge_hertz >= upper_edge_hertz:
    raise ValueError('lower_edge_hertz %.1f >= upper_edge_hertz %.1f' %
                     (lower_edge_hertz, upper_edge_hertz))
  if upper_edge_hertz > sample_rate / 2:
    raise ValueError('upper_edge_hertz must not be larger than the Nyquist '
                     'frequency (sample_rate / 2). Got: %s for sample_rate: %s'
                     % (upper_edge_hertz, sample_rate))

  # HTK excludes the spectrogram DC bin.
  bands_to_zero = 1
  nyquist_hertz = sample_rate / 2.0
  # Column vector [num_spectrogram_bins - 1, 1] of bin frequencies in Hertz,
  # with the DC bin dropped; its zero row is padded back in at the end.
  linear_frequencies = np.linspace(
      0.0, nyquist_hertz, num_spectrogram_bins)[bands_to_zero:, np.newaxis]

  # Compute num_mel_bins triples of (lower_edge, center, upper_edge). The
  # center of each band is the lower and upper edge of the adjacent bands.
  # Accordingly, we place num_mel_bins + 2 evenly spaced points (in mel)
  # between lower_edge_hertz and upper_edge_hertz.
  band_edges_mel = np.linspace(
      hertz_to_mel(lower_edge_hertz), hertz_to_mel(upper_edge_hertz),
      num_mel_bins + 2)

  # NOTE(review): these three slices are NumPy views onto band_edges_mel, so
  # the in-place writes in the loop below also change neighboring bands
  # (lower_edge_mel[i] is center_mel[i - 1]; upper_edge_mel[i] is
  # center_mel[i + 1]) and later iterations read the already-modified values.
  # Confirm whether this coupling is intended before changing it: the
  # returned matrix currently depends on it.
  lower_edge_mel = band_edges_mel[0:-2]
  center_mel = band_edges_mel[1:-1]
  upper_edge_mel = band_edges_mel[2:]

  # Minimum allowed band width in Hertz.
  # NOTE(review): freq_res divides by num_spectrogram_bins, while the bin
  # spacing produced by np.linspace above is
  # nyquist_hertz / (num_spectrogram_bins - 1).
  freq_res = nyquist_hertz / float(num_spectrogram_bins)
  freq_th = 1.5 * freq_res
  for i in range(0, num_mel_bins):
    center_hz = mel_to_hertz(center_mel[i])
    lower_hz = mel_to_hertz(lower_edge_mel[i])
    upper_hz = mel_to_hertz(upper_edge_mel[i])
    if upper_hz - lower_hz < freq_th:
      # Solve mel_to_hertz(c + dm) - mel_to_hertz(c - dm) == freq_th for dm:
      #   2 * (center_hz + break_freq) * sinh(dm / Q) == freq_th
      # so dm = Q * asinh(rhs), written out as log(rhs + sqrt(1 + rhs^2)).
      rhs = 0.5 * freq_th / (center_hz + _MEL_BREAK_FREQUENCY_HERTZ)
      dm = _MEL_HIGH_FREQUENCY_Q * np.log(rhs + np.sqrt(1.0 + rhs**2))
      lower_edge_mel[i] = center_mel[i] - dm
      upper_edge_mel[i] = center_mel[i] + dm

  # Row vectors [1, num_mel_bins] of band edges/centers in Hertz; they
  # broadcast against the column vector of bin frequencies below.
  lower_edge_hz = mel_to_hertz(lower_edge_mel)[np.newaxis, :]
  center_hz = mel_to_hertz(center_mel)[np.newaxis, :]
  upper_edge_hz = mel_to_hertz(upper_edge_mel)[np.newaxis, :]

  # Calculate lower and upper slopes for every spectrogram bin.
  # Both slopes are evaluated on the Hertz values computed above (bin
  # frequencies and band edges converted back from mel).
  lower_slopes = (linear_frequencies - lower_edge_hz) / (
      center_hz - lower_edge_hz)
  upper_slopes = (upper_edge_hz - linear_frequencies) / (
      upper_edge_hz - center_hz)

  # Intersect the line segments with each other and zero.
  mel_weights_matrix = np.maximum(0.0, np.minimum(lower_slopes, upper_slopes))

  # Re-add the zeroed lower bins we sliced out above.
  # [freq, mel]
  mel_weights_matrix = np.pad(mel_weights_matrix, [[bands_to_zero, 0], [0, 0]],
                              'constant')
  return mel_weights_matrix

def _linear_to_mel_matrix(self):
  """Builds the linear-to-mel weight matrix for this object's settings.

  The mel bands span 0 Hz up to the Nyquist frequency. The number of linear
  bins is `self._nfft // 2` and the number of mel bins is that count
  floor-divided by `self._mel_downscale`.

  Returns:
    Whatever `spectral_ops.linear_to_mel_weight_matrix` returns for those
    settings.
  """
  linear_bins = self._nfft // 2
  nyquist_hertz = self._sample_rate / 2.0
  mel_bins = linear_bins // self._mel_downscale
  return spectral_ops.linear_to_mel_weight_matrix(
      mel_bins, linear_bins, self._sample_rate, 0.0, nyquist_hertz)

def _mel_to_linear_matrix(self):
  """Builds an approximate inverse of the linear-to-mel matrix.

  The forward matrix is transposed and each of its columns is divided by the
  matching column sum of `forward @ forward^T`. Column sums whose magnitude
  is at most 1e-8 are used as the scale unchanged instead of being inverted,
  which avoids dividing by (near) zero.

  Returns:
    Numpy matrix with the transposed shape of `self._linear_to_mel_matrix()`.
  """
  forward = self._linear_to_mel_matrix()
  forward_t = np.transpose(forward)
  column_sums = np.sum(np.matmul(forward, forward_t), axis=0)

  scales = []
  for total in column_sums:
    if np.abs(total) > 1.0e-8:
      scales.append(1.0 / total)
    else:
      scales.append(total)
  return np.matmul(forward_t, np.diag(scales))


def specgrams_to_melspecgrams(self, specgrams):
  """Converts specgrams to melspecgrams.

  The body reads `self._mel_downscale`, `self._linear_to_mel_matrix()` and
  `self._safe_log()`, so the owning object must be passed as `self`.

  Args:
    self: Object providing `_mel_downscale`, `_linear_to_mel_matrix()` and
      `_safe_log()`.
    specgrams: Tensor of log magnitudes and instantaneous frequencies,
      shape [batch, time, freq, 2].

  Returns:
    melspecgrams: Tensor with the frequency axis projected through the
      linear-to-mel matrix, last axis of size 2. Channel 0 is the safe log of
      the mel-weighted squared magnitudes; channel 1 is the instantaneous
      frequency of the mel-weighted phase. `specgrams` is returned unchanged
      when `self._mel_downscale` is None, mirroring
      `melspecgrams_to_specgrams`.
  """
  if self._mel_downscale is None:
    return specgrams

  logmag = specgrams[:, :, :, 0]
  p = specgrams[:, :, :, 1]

  # Work on squared magnitudes and on the phase rebuilt by integrating the
  # instantaneous frequency along the time axis.
  mag2 = tf.exp(2.0 * logmag)
  phase_angle = tf.cumsum(p * np.pi, axis=-2)

  # tf.cast replaces tf.to_float, which was removed in TensorFlow 2.
  l2mel = tf.cast(self._linear_to_mel_matrix(), tf.float32)
  # tensordot with axes=1 contracts the trailing frequency axis with the
  # leading axis of the matrix.
  logmelmag2 = self._safe_log(tf.tensordot(mag2, l2mel, 1))
  mel_phase_angle = tf.tensordot(phase_angle, l2mel, 1)
  mel_p = spectral_ops.instantaneous_frequency(mel_phase_angle)

  return tf.concat(
      [logmelmag2[:, :, :, tf.newaxis], mel_p[:, :, :, tf.newaxis]], axis=-1)

def melspecgrams_to_specgrams(self, melspecgrams):
  """Converts melspecgrams to specgrams.

  The body reads `self._mel_downscale`, `self._mel_to_linear_matrix()` and
  `self._safe_log()`, so the owning object must be passed as `self`.

  Args:
    self: Object providing `_mel_downscale`, `_mel_to_linear_matrix()` and
      `_safe_log()`.
    melspecgrams: Tensor of log magnitudes and instantaneous frequencies,
      shape [batch, time, freq, 2], mel scaling of frequencies.

  Returns:
    specgrams: Tensor of log magnitudes and instantaneous frequencies,
      shape [batch, time, freq, 2]. `melspecgrams` is returned unchanged when
      `self._mel_downscale` is None.
  """
  if self._mel_downscale is None:
    return melspecgrams

  logmelmag2 = melspecgrams[:, :, :, 0]
  mel_p = melspecgrams[:, :, :, 1]

  # tf.cast replaces tf.to_float, which was removed in TensorFlow 2.
  mel2l = tf.cast(self._mel_to_linear_matrix(), tf.float32)
  # Channel 0 holds a log of squared magnitudes, hence the factor 0.5 after
  # projecting back to the linear frequency axis.
  mag2 = tf.tensordot(tf.exp(logmelmag2), mel2l, 1)
  logmag = 0.5 * self._safe_log(mag2)
  # Integrate instantaneous frequency along time to get phase, project it,
  # then differentiate again.
  mel_phase_angle = tf.cumsum(mel_p * np.pi, axis=-2)
  phase_angle = tf.tensordot(mel_phase_angle, mel2l, 1)
  p = spectral_ops.instantaneous_frequency(phase_angle)

  return tf.concat(
      [logmag[:, :, :, tf.newaxis], p[:, :, :, tf.newaxis]], axis=-1)


def instantaneous_frequency(phase_angle, time_axis=-2):
  """Transform a fft tensor from phase angle to instantaneous frequency.

  Unwrap and take the finite difference of the phase. Pad with initial phase to
  keep the tensor the same size. The result is divided by pi.

  Args:
    phase_angle: Tensor of angles in radians. [Batch, Time, Freqs]
    time_axis: Axis over which to unwrap and take finite difference.

  Returns:
    dphase: Instantaneous frequency (derivative of phase). Same size as input.

  Raises:
    ValueError: If the rank of the unwrapped phase is not statically known.
  """
  # `unwrap` and `diff` are helpers defined outside this block.
  phase_unwrapped = unwrap(phase_angle, axis=time_axis)
  dphase = diff(phase_unwrapped, axis=time_axis)

  # Add an initial phase to dphase.
  # Only the rank has to be static: a size of -1 makes tf.slice take the whole
  # axis, so dimensions that are unknown at graph-construction time (such as
  # a None batch size) no longer end up as None entries in `size`.
  rank = len(phase_unwrapped.get_shape().as_list())
  size = [-1] * rank
  size[time_axis] = 1
  begin = [0] * rank
  phase_slice = tf.slice(phase_unwrapped, begin, size)
  dphase = tf.concat([phase_slice, dphase], axis=time_axis) / np.pi
  return dphase
	# Mel spectrum constants shared by mel_to_hertz() and hertz_to_mel():
	#   mel = _MEL_HIGH_FREQUENCY_Q * ln(1 + hertz / _MEL_BREAK_FREQUENCY_HERTZ)
	_MEL_BREAK_FREQUENCY_HERTZ = 700.0
	_MEL_HIGH_FREQUENCY_Q = 1127.0


	def mel_to_hertz(mel_values):
		"""Converts frequencies in `mel_values` from the mel scale to linear scale.

		Inverse of `hertz_to_mel`.

		Args:
			mel_values: Scalar or array-like of mel-scale values.

		Returns:
			Numpy array (0-d for scalar input) of frequencies in Hertz.
		"""
		return _MEL_BREAK_FREQUENCY_HERTZ * (
				np.exp(np.array(mel_values) / _MEL_HIGH_FREQUENCY_Q) - 1.0)


	def hertz_to_mel(frequencies_hertz):
		"""Converts frequencies in `frequencies_hertz` in Hertz to the mel scale.

		Inverse of `mel_to_hertz`.

		Args:
			frequencies_hertz: Scalar or array-like of frequencies in Hertz.

		Returns:
			Numpy array (0-d for scalar input) of mel-scale values.
		"""
		return _MEL_HIGH_FREQUENCY_Q * np.log(
				1.0 + (np.array(frequencies_hertz) / _MEL_BREAK_FREQUENCY_HERTZ))


	def linear_to_mel_weight_matrix(num_mel_bins=20,
			num_spectrogram_bins=129,
			sample_rate=16000,
			lower_edge_hertz=125.0,
			upper_edge_hertz=3800.0):
		"""Returns a matrix to warp linear scale spectrograms to the mel scale.

		Adapted from tf.signal.linear_to_mel_weight_matrix, with one addition: a
		band whose width in Hertz is below 1.5 * freq_res is widened symmetrically
		(in mel) around its center until it spans that width. The computation uses
		NumPy only, so this function can be constant folded by graph optimization
		since there are no Tensor inputs.

		NOTE(review): no dtype cast is performed anywhere in this function. The
		result keeps the precision NumPy chooses for the intermediate arrays
		(float64 by default), not float32; callers needing float32 must cast.

		Args:
			num_mel_bins: Int, number of output frequency dimensions.
			num_spectrogram_bins: Int, number of input frequency dimensions.
			sample_rate: Int, sample rate of the audio.
			lower_edge_hertz: Float, lowest frequency to consider.
			upper_edge_hertz: Float, highest frequency to consider.

		Returns:
			Numpy matrix of shape [num_spectrogram_bins, num_mel_bins]. The first
			row (the DC bin) is all zeros.

		Raises:
			ValueError: Input argument in the wrong range.
		"""
		# Validate input arguments
		if num_mel_bins <= 0:
			raise ValueError('num_mel_bins must be positive. Got: %s' % num_mel_bins)
		if num_spectrogram_bins <= 0:
			raise ValueError(
					'num_spectrogram_bins must be positive. Got: %s' % num_spectrogram_bins)
		if sample_rate <= 0.0:
			raise ValueError('sample_rate must be positive. Got: %s' % sample_rate)
		if lower_edge_hertz < 0.0:
			raise ValueError(
					'lower_edge_hertz must be non-negative. Got: %s' % lower_edge_hertz)
		if lower_edge_hertz >= upper_edge_hertz:
			raise ValueError('lower_edge_hertz %.1f >= upper_edge_hertz %.1f' %
					(lower_edge_hertz, upper_edge_hertz))
		if upper_edge_hertz > sample_rate / 2:
			raise ValueError('upper_edge_hertz must not be larger than the Nyquist '
					'frequency (sample_rate / 2). Got: %s for sample_rate: %s'
					% (upper_edge_hertz, sample_rate))

		# HTK excludes the spectrogram DC bin.
		bands_to_zero = 1
		nyquist_hertz = sample_rate / 2.0
		# Column vector [num_spectrogram_bins - 1, 1] of bin frequencies in Hertz,
		# with the DC bin dropped; its zero row is padded back in at the end.
		linear_frequencies = np.linspace(
				0.0, nyquist_hertz, num_spectrogram_bins)[bands_to_zero:, np.newaxis]

		# Compute num_mel_bins triples of (lower_edge, center, upper_edge). The
		# center of each band is the lower and upper edge of the adjacent bands.
		# Accordingly, we place num_mel_bins + 2 evenly spaced points (in mel)
		# between lower_edge_hertz and upper_edge_hertz.
		band_edges_mel = np.linspace(
				hertz_to_mel(lower_edge_hertz), hertz_to_mel(upper_edge_hertz),
				num_mel_bins + 2)

		# NOTE(review): these three slices are NumPy views onto band_edges_mel, so
		# the in-place writes in the loop below also change neighboring bands
		# (lower_edge_mel[i] is center_mel[i - 1]; upper_edge_mel[i] is
		# center_mel[i + 1]) and later iterations read the already-modified
		# values. Confirm whether this coupling is intended before changing it:
		# the returned matrix currently depends on it.
		lower_edge_mel = band_edges_mel[0:-2]
		center_mel = band_edges_mel[1:-1]
		upper_edge_mel = band_edges_mel[2:]

		# Minimum allowed band width in Hertz.
		# NOTE(review): freq_res divides by num_spectrogram_bins, while the bin
		# spacing produced by np.linspace above is
		# nyquist_hertz / (num_spectrogram_bins - 1).
		freq_res = nyquist_hertz / float(num_spectrogram_bins)
		freq_th = 1.5 * freq_res
		for i in range(0, num_mel_bins):
			center_hz = mel_to_hertz(center_mel[i])
			lower_hz = mel_to_hertz(lower_edge_mel[i])
			upper_hz = mel_to_hertz(upper_edge_mel[i])
			if upper_hz - lower_hz < freq_th:
				# Solve mel_to_hertz(c + dm) - mel_to_hertz(c - dm) == freq_th for dm:
				#   2 * (center_hz + break_freq) * sinh(dm / Q) == freq_th
				# so dm = Q * asinh(rhs), written out as log(rhs + sqrt(1 + rhs^2)).
				rhs = 0.5 * freq_th / (center_hz + _MEL_BREAK_FREQUENCY_HERTZ)
				dm = _MEL_HIGH_FREQUENCY_Q * np.log(rhs + np.sqrt(1.0 + rhs**2))
				lower_edge_mel[i] = center_mel[i] - dm
				upper_edge_mel[i] = center_mel[i] + dm

		# Row vectors [1, num_mel_bins] of band edges/centers in Hertz; they
		# broadcast against the column vector of bin frequencies below.
		lower_edge_hz = mel_to_hertz(lower_edge_mel)[np.newaxis, :]
		center_hz = mel_to_hertz(center_mel)[np.newaxis, :]
		upper_edge_hz = mel_to_hertz(upper_edge_mel)[np.newaxis, :]

		# Calculate lower and upper slopes for every spectrogram bin.
		# Both slopes are evaluated on the Hertz values computed above (bin
		# frequencies and band edges converted back from mel).
		lower_slopes = (linear_frequencies - lower_edge_hz) / (
				center_hz - lower_edge_hz)
		upper_slopes = (upper_edge_hz - linear_frequencies) / (
				upper_edge_hz - center_hz)

		# Intersect the line segments with each other and zero.
		mel_weights_matrix = np.maximum(0.0, np.minimum(lower_slopes, upper_slopes))

		# Re-add the zeroed lower bins we sliced out above.
		# [freq, mel]
		mel_weights_matrix = np.pad(mel_weights_matrix, [[bands_to_zero, 0], [0, 0]],
				'constant')
		return mel_weights_matrix

	def _linear_to_mel_matrix(self):
		"""Get the mel transformation matrix.

		The mel bands span 0 Hz up to the Nyquist frequency. The number of linear
		bins is `self._nfft // 2` and the number of mel bins is that count
		floor-divided by `self._mel_downscale`.

		Returns:
			Whatever `spectral_ops.linear_to_mel_weight_matrix` returns for those
			settings.
		"""
		num_freq_bins = self._nfft // 2
		lower_edge_hertz = 0.0
		upper_edge_hertz = self._sample_rate / 2.0
		num_mel_bins = num_freq_bins // self._mel_downscale
		return spectral_ops.linear_to_mel_weight_matrix(
				num_mel_bins, num_freq_bins, self._sample_rate, lower_edge_hertz,
				upper_edge_hertz)

	def _mel_to_linear_matrix(self):
		"""Get the inverse mel transformation matrix.

		The forward matrix is transposed and each of its columns is divided by the
		matching column sum of `m @ m^T`. Column sums whose magnitude is at most
		1e-8 are used as the scale unchanged instead of being inverted, which
		avoids dividing by (near) zero.

		Returns:
			Numpy matrix with the transposed shape of
			`self._linear_to_mel_matrix()`.
		"""
		m = self._linear_to_mel_matrix()
		m_t = np.transpose(m)
		p = np.matmul(m, m_t)
		d = [1.0 / x if np.abs(x) > 1.0e-8 else x for x in np.sum(p, axis=0)]
		return np.matmul(m_t, np.diag(d))


	def specgrams_to_melspecgrams(self, specgrams):
		"""Converts specgrams to melspecgrams.

		The body reads `self._mel_downscale`, `self._linear_to_mel_matrix()` and
		`self._safe_log()`, so the owning object must be passed as `self`.

		Args:
			self: Object providing `_mel_downscale`, `_linear_to_mel_matrix()` and
				`_safe_log()`.
			specgrams: Tensor of log magnitudes and instantaneous frequencies,
				shape [batch, time, freq, 2].

		Returns:
			melspecgrams: Tensor with the frequency axis projected through the
				linear-to-mel matrix, last axis of size 2. Channel 0 is the safe log
				of the mel-weighted squared magnitudes; channel 1 is the
				instantaneous frequency of the mel-weighted phase. `specgrams` is
				returned unchanged when `self._mel_downscale` is None, mirroring
				`melspecgrams_to_specgrams`.
		"""
		if self._mel_downscale is None:
			return specgrams

		logmag = specgrams[:, :, :, 0]
		p = specgrams[:, :, :, 1]

		# Work on squared magnitudes and on the phase rebuilt by integrating the
		# instantaneous frequency along the time axis.
		mag2 = tf.exp(2.0 * logmag)
		phase_angle = tf.cumsum(p * np.pi, axis=-2)

		# tf.cast replaces tf.to_float, which was removed in TensorFlow 2.
		l2mel = tf.cast(self._linear_to_mel_matrix(), tf.float32)
		# tensordot with axes=1 contracts the trailing frequency axis with the
		# leading axis of the matrix.
		logmelmag2 = self._safe_log(tf.tensordot(mag2, l2mel, 1))
		mel_phase_angle = tf.tensordot(phase_angle, l2mel, 1)
		mel_p = spectral_ops.instantaneous_frequency(mel_phase_angle)

		return tf.concat(
				[logmelmag2[:, :, :, tf.newaxis], mel_p[:, :, :, tf.newaxis]], axis=-1)

	def melspecgrams_to_specgrams(self, melspecgrams):
		"""Converts melspecgrams to specgrams.

		The body reads `self._mel_downscale`, `self._mel_to_linear_matrix()` and
		`self._safe_log()`, so the owning object must be passed as `self`.

		Args:
			self: Object providing `_mel_downscale`, `_mel_to_linear_matrix()` and
				`_safe_log()`.
			melspecgrams: Tensor of log magnitudes and instantaneous frequencies,
				shape [batch, time, freq, 2], mel scaling of frequencies.

		Returns:
			specgrams: Tensor of log magnitudes and instantaneous frequencies,
				shape [batch, time, freq, 2]. `melspecgrams` is returned unchanged
				when `self._mel_downscale` is None.
		"""
		if self._mel_downscale is None:
			return melspecgrams

		logmelmag2 = melspecgrams[:, :, :, 0]
		mel_p = melspecgrams[:, :, :, 1]

		# tf.cast replaces tf.to_float, which was removed in TensorFlow 2.
		mel2l = tf.cast(self._mel_to_linear_matrix(), tf.float32)
		# Channel 0 holds a log of squared magnitudes, hence the factor 0.5 after
		# projecting back to the linear frequency axis.
		mag2 = tf.tensordot(tf.exp(logmelmag2), mel2l, 1)
		logmag = 0.5 * self._safe_log(mag2)
		# Integrate instantaneous frequency along time to get phase, project it,
		# then differentiate again.
		mel_phase_angle = tf.cumsum(mel_p * np.pi, axis=-2)
		phase_angle = tf.tensordot(mel_phase_angle, mel2l, 1)
		p = spectral_ops.instantaneous_frequency(phase_angle)

		return tf.concat(
				[logmag[:, :, :, tf.newaxis], p[:, :, :, tf.newaxis]], axis=-1)


	def instantaneous_frequency(phase_angle, time_axis=-2):
		"""Transform a fft tensor from phase angle to instantaneous frequency.

		Unwrap and take the finite difference of the phase. Pad with initial phase
		to keep the tensor the same size. The result is divided by pi.

		Args:
			phase_angle: Tensor of angles in radians. [Batch, Time, Freqs]
			time_axis: Axis over which to unwrap and take finite difference.

		Returns:
			dphase: Instantaneous frequency (derivative of phase). Same size as
				input.

		Raises:
			ValueError: If the rank of the unwrapped phase is not statically known.
		"""
		# `unwrap` and `diff` are helpers defined outside this block.
		phase_unwrapped = unwrap(phase_angle, axis=time_axis)
		dphase = diff(phase_unwrapped, axis=time_axis)

		# Add an initial phase to dphase.
		# Only the rank has to be static: a size of -1 makes tf.slice take the
		# whole axis, so dimensions that are unknown at graph-construction time
		# (such as a None batch size) no longer end up as None entries in `size`.
		rank = len(phase_unwrapped.get_shape().as_list())
		size = [-1] * rank
		size[time_axis] = 1
		begin = [0] * rank
		phase_slice = tf.slice(phase_unwrapped, begin, size)
		dphase = tf.concat([phase_slice, dphase], axis=time_axis) / np.pi
		return dphase