# A demonstration of melody estimation with the SPICE pitch model
# brew install timidity
# brew install libsndfile
# pip install tensorflow
# pip install tensorflow_hub
# pip install music21
# pip install pydub
import math
import os
import statistics

import music21
import tensorflow as tf
import tensorflow_hub as hub
from pydub import AudioSegment
from scipy.io import wavfile
# Silence TensorFlow's logger.
tf.get_logger().setLevel("ERROR")
IN_M4A_FILE = "doremi.m4a"
OUT_MIDI_FILE = "output.mid"
OUT_WAVE_FILE = "output.wav"
MAX_ABS_INT16 = 32768.0
EXPECTED_SAMPLE_RATE = 16000
A4 = 440
C0 = A4 * pow(2, -4.75)
CONFIDENCE_THRESH = 0.9
# Note names: the 12-tone chromatic scale (the diatonic notes plus semitones).
NOTE_NAMES = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]
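# A quick check of the note arithmetic used below (standard equal
# temperament, nothing specific to this script): A4 = 440 Hz lies
# 12 * log2(440 / C0) = 57 semitones above C0, so
#   h = 57  ->  octave = 57 // 12 = 4,  NOTE_NAMES[57 % 12] = "A"  ->  "A4".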
def convert_audio_for_model(user_file, output_file="converted_audio_file.wav"):
    """Read an audio file and write it out as 16 kHz mono wav."""
    audio = AudioSegment.from_file(user_file)
    audio = audio.set_frame_rate(EXPECTED_SAMPLE_RATE).set_channels(1)
    audio.export(output_file, format="wav")
    return output_file
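# Note: for non-wav inputs such as .m4a, pydub decodes by shelling out to
# ffmpeg, so ffmpeg must also be installed (e.g. `brew install ffmpeg`).
# A minimal usage sketch:
#
#   wav_path = convert_audio_for_model("doremi.m4a")
#   # -> "converted_audio_file.wav": 16 kHz, mono, ready for SPICE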
def output2hz(pitch_output):
    """Convert the model's pitch-estimation output to Hz."""
    # Constants taken from https://tfhub.dev/google/spice/2
    PT_OFFSET = 25.58
    PT_SLOPE = 63.07
    FMIN = 10.0
    BINS_PER_OCTAVE = 12.0
    cqt_bin = pitch_output * PT_SLOPE + PT_OFFSET
    return FMIN * 2.0 ** (1.0 * cqt_bin / BINS_PER_OCTAVE)
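# Rough sanity check of the mapping (derived from the constants above, not
# quoted from the SPICE docs): the model's pitch output lies in [0, 1], so
#   output2hz(0.0) = 10 * 2 ** (25.58 / 12)            ~   43.8 Hz
#   output2hz(1.0) = 10 * 2 ** ((63.07 + 25.58) / 12)  ~ 1674 Hz
# which comfortably spans the singing-voice range.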
def hz2offset(freq):
    """Compute the quantization error of snapping a melody frequency (Hz) to a note."""
    # This measures the quantization error for a single note.
    if freq == 0:  # Rests always have zero error.
        return None
    # Quantized note.
    h = round(12 * math.log2(freq / C0))
    return 12 * math.log2(freq / C0) - h
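# Worked example (standard pitch arithmetic): a tone sung at 450 Hz sits
# 12 * log2(450 / C0) ~ 57.39 semitones above C0; the nearest note is
# h = 57 (A4), so hz2offset(450) ~ +0.39, i.e. about 0.39 semitone sharp.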
def quantize_predictions(group, ideal_offset):
    """Quantize one group of pitch predictions to a single note or rest."""
    # Group values are either 0, or a pitch in Hz.
    non_zero_values = [v for v in group if v != 0]
    zero_values_count = len(group) - len(non_zero_values)
    # Create a rest if 80% is silent, otherwise create a note.
    if zero_values_count > 0.8 * len(group):
        # Interpret as a rest. Count each dropped note as an error, weighted a bit
        # worse than a badly sung note (which would 'cost' 0.5).
        return 0.51 * len(non_zero_values), "Rest"
    else:
        # Interpret as note, estimating as mean of non-rest predictions.
        h = round(
            statistics.mean(
                [12 * math.log2(freq / C0) - ideal_offset for freq in non_zero_values]
            )
        )
        octave = h // 12
        n = h % 12
        note = NOTE_NAMES[n] + str(octave)
        # Quantization error is the total difference from the quantized note.
        error = sum(
            [
                abs(12 * math.log2(freq / C0) - ideal_offset - h)
                for freq in non_zero_values
            ]
        )
        return error, note
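# Example behaviour (hypothetical frame values):
#   quantize_predictions([440.0, 442.0, 439.0], 0.0)  ->  (~0.12, "A4")
#   quantize_predictions([0] * 9 + [440.0], 0.0)      ->  (0.51, "Rest")
# In the second case 9 of 10 frames are silent (more than 80%), so the
# group becomes a rest and the single dropped pitch is charged 0.51 error.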
def get_quantization_and_error(
    pitch_outputs_and_rests,
    predictions_per_eighth,
    prediction_start_offset,
    ideal_offset,
):
    """Quantize the whole pitch sequence into notes and sum the quantization error."""
    # Apply the start offset - we can just add the offset as rests.
    pitch_outputs_and_rests = [0] * prediction_start_offset + pitch_outputs_and_rests
    # Collect the predictions for each note (or rest).
    groups = [
        pitch_outputs_and_rests[i : i + predictions_per_eighth]
        for i in range(0, len(pitch_outputs_and_rests), predictions_per_eighth)
    ]
    quantization_error = 0
    notes_and_rests = []
    for group in groups:
        error, note_or_rest = quantize_predictions(group, ideal_offset)
        quantization_error += error
        notes_and_rests.append(note_or_rest)
    return quantization_error, notes_and_rests
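# For instance, with predictions_per_eighth = 4 and prediction_start_offset = 2,
# a 10-frame input is padded to [0, 0, f1, ..., f10] and chopped into three
# 4-frame groups, each of which quantize_predictions turns into one note or rest.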
def load_audiofile(file_name):
    """Load an audio file and normalize the samples."""
    converted_audio_file = convert_audio_for_model(file_name)
    sample_rate, audio_samples = wavfile.read(converted_audio_file)
    audio_samples = audio_samples / float(MAX_ABS_INT16)
    return sample_rate, audio_samples
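# wavfile.read returns int16 samples for 16-bit PCM, so dividing by 32768
# rescales them to floats in [-1, 1), the input range the SPICE model expects.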
def get_pitch(audio_samples):
    """Load the model and estimate the pitch track."""
    model = hub.load("https://tfhub.dev/google/spice/2")
    model_output = model.signatures["serving_default"](
        tf.constant(audio_samples, tf.float32)
    )
    pitch_outputs = model_output["pitch"]  # estimated pitch per frame
    uncertainty_outputs = model_output["uncertainty"]  # estimation uncertainty in [0, 1]
    # Invert the uncertainty to obtain a confidence score.
    confidence_outputs = [1.0 - float(c) for c in uncertainty_outputs]
    pitch_outputs = [float(x) for x in pitch_outputs]
    # Keep estimates whose confidence reaches the threshold (0.9) as the final
    # pitch track; insert zeros elsewhere to mark frames without singing voice.
    pitch_outputs_and_rests = [
        output2hz(p) if c >= CONFIDENCE_THRESH else 0
        for p, c in zip(pitch_outputs, confidence_outputs)
    ]
    return pitch_outputs_and_rests
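# SPICE emits one (pitch, uncertainty) pair per input frame (about every
# 32 ms of 16 kHz audio, i.e. a 512-sample hop, per the model description),
# so pitch_outputs_and_rests is a time series of Hz values with zeros
# marking the low-confidence, unvoiced frames.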
def get_ideal_offset(pitch_outputs_and_rests):
    """Estimate the ideal tuning offset as the mean offset of the voiced frames."""
    offsets = [hz2offset(p) for p in pitch_outputs_and_rests if p != 0]
    ideal_offset = statistics.mean(offsets)
    return ideal_offset
def get_melody(pitch_outputs_and_rests, ideal_offset):
    """Get the estimated melody, including rest information."""
    best_error = float("inf")
    best_notes_and_rests = None
    best_predictions_per_note = None
    for predictions_per_note in range(20, 65):
        for prediction_start_offset in range(predictions_per_note):
            error, notes_and_rests = get_quantization_and_error(
                pitch_outputs_and_rests,
                predictions_per_note,
                prediction_start_offset,
                ideal_offset,
            )
            if error < best_error:
                best_error = error
                best_notes_and_rests = notes_and_rests
                best_predictions_per_note = predictions_per_note
    # At this point, best_notes_and_rests contains the best quantization.
    # Since we don't need to have rests at the beginning, let's remove these:
    while best_notes_and_rests[0] == "Rest":
        best_notes_and_rests = best_notes_and_rests[1:]
    # Also remove silence at the end.
    while best_notes_and_rests[-1] == "Rest":
        best_notes_and_rests = best_notes_and_rests[:-1]
    return best_notes_and_rests, best_predictions_per_note
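# The search above is a brute-force grid: every note length from 20 to 64
# predictions and every possible start phase is tried, and the segmentation
# with the smallest total quantization error wins. A usage sketch:
#
#   notes, ppn = get_melody(pitch_outputs_and_rests, ideal_offset)
#   # notes e.g. ["C3", "D3", "Rest", ...]; ppn is the winning note length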
def melody2midi(best_notes_and_rests, best_predictions_per_note, out_midi_file):
    """Write the estimated notes out as a score (MIDI)."""
    # Creating the sheet music score.
    sc = music21.stream.Score()
    # Adjust the speed to match the actual singing.
    bpm = 60 * 60 / best_predictions_per_note
    print("bpm: ", bpm)
    a = music21.tempo.MetronomeMark(number=bpm)
    sc.insert(0, a)
    # The estimated melody (note information).
    print(best_notes_and_rests)
    # Convert the note information to MIDI.
    for snote in best_notes_and_rests:
        d = "half"
        if snote == "Rest":
            sc.append(music21.note.Rest(type=d))
        else:
            sc.append(music21.note.Note(snote, type=d))
    # Save as MIDI.
    sc.write("midi", fp=out_midi_file)
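# On the tempo heuristic above (assuming each prediction covers roughly
# 32 ms): a note of best_predictions_per_note (ppn) predictions lasts about
# 0.032 * ppn seconds, while a half note at bpm = 3600 / ppn lasts
# 120 / bpm = ppn / 30 ~ 0.033 * ppn seconds, so the metronome mark
# approximately matches the sung tempo of the half notes written here.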
def midi2wav(out_midi_file, out_wav_file):
    """Convert the MIDI to wav and save it."""
    os.system(f"timidity {out_midi_file} -Ow -o {out_wav_file}")
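# os.system is fine for a demo; an equivalent call that avoids shell string
# interpolation (same timidity flags: -Ow for wav output, -o for the
# destination file) would be:
#
#   import subprocess
#   subprocess.run(["timidity", out_midi_file, "-Ow", "-o", out_wav_file],
#                  check=True)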
def main():
    """Run the melody estimation end to end."""
    # Read the audio file and write it out as wav.
    _, audio_samples = load_audiofile(IN_M4A_FILE)
    # Load the TensorFlow model and run pitch estimation.
    pitch_outputs_and_rests = get_pitch(audio_samples)
    # Estimate the tuning offset.
    ideal_offset = get_ideal_offset(pitch_outputs_and_rests)
    # Estimate the melody from the pitch track.
    best_notes_and_rests, best_predictions_per_note = get_melody(
        pitch_outputs_and_rests, ideal_offset
    )
    # Convert the estimated melody to MIDI.
    melody2midi(best_notes_and_rests, best_predictions_per_note, OUT_MIDI_FILE)
    # Convert the MIDI to WAVE.
    midi2wav(OUT_MIDI_FILE, OUT_WAVE_FILE)
if __name__ == "__main__":
    main()
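# To run the demo end to end, place doremi.m4a (a short recording of a sung
# scale) next to this script and run it with Python, e.g.:
#
#   python spice_melody_demo.py   # the script name here is just an example
#
# It writes output.mid and output.wav into the working directory.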