# A demonstration of melody estimation
# brew install timidity
# brew install libsndfile
# pip install tensorflow
# pip install tensorflow_hub
# pip install music21
# pip install pydub
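# brew install ffmpeg  # likely also needed: pydub typically relies on ffmpeg to decode .m4a input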
import math
import os
import statistics
import music21
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from pydub import AudioSegment
from scipy.io import wavfile
# Boilerplate to suppress TensorFlow log noise
tf.get_logger().setLevel("ERROR")
IN_M4A_FILE = "doremi.m4a"
OUT_MIDI_FILE = "output.mid"
OUT_WAVE_FILE = "output.wav"
MAX_ABS_INT16 = 32768.0
EXPECTED_SAMPLE_RATE = 16000
A4 = 440
C0 = A4 * pow(2, -4.75)
CONFIDENCE_THRESHOLD = 0.9
# Note names: the 12 semitones of the chromatic scale (C through B, including sharps)
NOTE_NAMES = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]


def convert_audio_for_model(user_file, output_file="converted_audio_file.wav"):
    """Load an audio file and write it out as a 16 kHz mono WAV file."""
    audio = AudioSegment.from_file(user_file)
    audio = audio.set_frame_rate(EXPECTED_SAMPLE_RATE).set_channels(1)
    audio.export(output_file, format="wav")
    return output_file


def output2hz(pitch_output):
    """Convert a raw pitch estimate into Hz."""
    # Constants taken from https://tfhub.dev/google/spice/2
    PT_OFFSET = 25.58
    PT_SLOPE = 63.07
    FMIN = 10.0
    BINS_PER_OCTAVE = 12.0
    cqt_bin = pitch_output * PT_SLOPE + PT_OFFSET
    return FMIN * 2.0 ** (1.0 * cqt_bin / BINS_PER_OCTAVE)
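
# Worked example (for reference): a raw SPICE pitch value of 0.5 maps to a CQT bin of
# 0.5 * 63.07 + 25.58 ≈ 57.1, so output2hz(0.5) ≈ 10 * 2 ** (57.1 / 12) ≈ 271 Hz.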


def hz2offset(freq):
    """Compute the quantization error when a melody frequency (Hz) is mapped to a note."""
    # This measures the quantization error for a single note.
    if freq == 0:  # Rests always have zero error.
        return None
    # Quantized note.
    h = round(12 * math.log2(freq / C0))
    return 12 * math.log2(freq / C0) - h
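
# Worked example: for freq = 440 Hz (A4), 12 * math.log2(440 / C0) is exactly 57
# (since C0 = 440 * 2 ** -4.75), so the quantization offset is 0.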


def quantize_predictions(group, ideal_offset):
    """Quantize a group of pitch predictions into a single note or rest."""
    # Group values are either 0, or a pitch in Hz.
    non_zero_values = [v for v in group if v != 0]
    zero_values_count = len(group) - len(non_zero_values)
    # Create a rest if 80% is silent, otherwise create a note.
    if zero_values_count > 0.8 * len(group):
        # Interpret as a rest. Count each dropped note as an error, weighted a bit
        # worse than a badly sung note (which would 'cost' 0.5).
        return 0.51 * len(non_zero_values), "Rest"
    else:
        # Interpret as a note, estimated as the mean of the non-rest predictions.
        h = round(
            statistics.mean(
                [12 * math.log2(freq / C0) - ideal_offset for freq in non_zero_values]
            )
        )
        octave = h // 12
        n = h % 12
        note = NOTE_NAMES[n] + str(octave)
        # Quantization error is the total difference from the quantized note.
        error = sum(
            [
                abs(12 * math.log2(freq / C0) - ideal_offset - h)
                for freq in non_zero_values
            ]
        )
        return error, note
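
# Worked example: a group of confident predictions all near 261.6 Hz (middle C), with
# ideal_offset = 0, quantizes to h = round(12 * math.log2(261.6 / C0)) = 48, i.e.
# octave 4 and pitch class 0, giving the note "C4" with a small residual error.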


def get_quantization_and_error(
    pitch_outputs_and_rests,
    predictions_per_eighth,
    prediction_start_offset,
    ideal_offset,
):
    """Quantize the whole pitch sequence and return the total error and note list."""
    # Apply the start offset - we can just add the offset as rests.
    pitch_outputs_and_rests = [0] * prediction_start_offset + pitch_outputs_and_rests
    # Collect the predictions for each note (or rest).
    groups = [
        pitch_outputs_and_rests[i : i + predictions_per_eighth]
        for i in range(0, len(pitch_outputs_and_rests), predictions_per_eighth)
    ]
    quantization_error = 0
    notes_and_rests = []
    for group in groups:
        error, note_or_rest = quantize_predictions(group, ideal_offset)
        quantization_error += error
        notes_and_rests.append(note_or_rest)
    return quantization_error, notes_and_rests
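
# Note: this function assumes a fixed segment length of `predictions_per_eighth` frames
# per note and a fixed start offset; get_melody() below searches over both values and
# keeps the combination with the smallest total quantization error.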


def load_audiofile(file_name):
    """Load an audio file and return its sample rate and normalized samples."""
    converted_audio_file = convert_audio_for_model(file_name)
    sample_rate, audio_samples = wavfile.read(converted_audio_file)
    audio_samples = audio_samples / float(MAX_ABS_INT16)
    return sample_rate, audio_samples


def get_pitch(audio_samples):
    """Load the SPICE model and run pitch estimation."""
    model = hub.load("https://tfhub.dev/google/spice/2")
    model_output = model.signatures["serving_default"](
        tf.constant(audio_samples, tf.float32)
    )
    pitch_outputs = model_output["pitch"]  # estimated pitch per frame
    uncertainty_outputs = model_output["uncertainty"]  # estimation uncertainty [0-1]
    # Invert the uncertainty to obtain a confidence value.
    confidence_outputs = 1.0 - uncertainty_outputs
    confidence_outputs = list(confidence_outputs)
    pitch_outputs = [float(x) for x in pitch_outputs]
    indices = range(len(pitch_outputs))
    # Estimates whose confidence reaches the threshold (0.9) are the final pitch
    # estimates (collected here for inspection; not used further below).
    confident_pitch_outputs = [
        (i, p)
        for i, p, c in zip(indices, pitch_outputs, confidence_outputs)
        if c >= CONFIDENCE_THRESHOLD
    ]
    confident_pitch_indices, confident_pitch_values = zip(*confident_pitch_outputs)
    # Mark low-confidence frames with zeros so they can later be treated as rests.
    pitch_outputs_and_rests = [
        output2hz(p) if c >= CONFIDENCE_THRESHOLD else 0
        for _, p, c in zip(indices, pitch_outputs, confidence_outputs)
    ]
    return pitch_outputs_and_rests
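
# Each entry of pitch_outputs_and_rests corresponds to one SPICE analysis frame,
# reportedly about 32 ms of audio (512 samples at 16 kHz) per prediction. This is an
# assumption based on the model page (https://tfhub.dev/google/spice/2) and is not
# verified by this script.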


def get_ideal_offset(pitch_outputs_and_rests):
    """Estimate the ideal offset."""
    offsets = [hz2offset(p) for p in pitch_outputs_and_rests if p != 0]
    ideal_offset = statistics.mean(offsets)
    return ideal_offset
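
# The ideal offset is the singer's average deviation (in semitones) from the nearest
# equal-tempered pitch; subtracting it during quantization compensates for singing that
# is consistently a little sharp or flat.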


def get_melody(pitch_outputs_and_rests, ideal_offset):
    """Get the estimated melody, including rest information."""
    best_error = float("inf")
    best_notes_and_rests = None
    best_predictions_per_note = None
    for predictions_per_note in range(20, 65, 1):
        for prediction_start_offset in range(predictions_per_note):
            error, notes_and_rests = get_quantization_and_error(
                pitch_outputs_and_rests,
                predictions_per_note,
                prediction_start_offset,
                ideal_offset,
            )
            if error < best_error:
                best_error = error
                best_notes_and_rests = notes_and_rests
                best_predictions_per_note = predictions_per_note
    # At this point, best_notes_and_rests contains the best quantization.
    # Since we don't need rests at the beginning, remove them:
    while best_notes_and_rests[0] == "Rest":
        best_notes_and_rests = best_notes_and_rests[1:]
    # Also remove silence at the end.
    while best_notes_and_rests[-1] == "Rest":
        best_notes_and_rests = best_notes_and_rests[:-1]
    return best_notes_and_rests, best_predictions_per_note


def melody2midi(best_notes_and_rests, best_predictions_per_note, out_midi_file):
    """Write the estimated notes out as a score (MIDI file)."""
    # Create the sheet music score.
    sc = music21.stream.Score()
    # Adjust the tempo to match the actual singing.
    bpm = 60 * 60 / best_predictions_per_note
    print("bpm: ", bpm)
    a = music21.tempo.MetronomeMark(number=bpm)
    sc.insert(0, a)
    # Estimated melody (note names and rests).
    print(best_notes_and_rests)
    # Convert the note information into MIDI events.
    for snote in best_notes_and_rests:
        d = "half"
        if snote == "Rest":
            sc.append(music21.note.Rest(type=d))
        else:
            sc.append(music21.note.Note(snote, type=d))
    # Save as MIDI.
    sc.write("midi", fp=out_midi_file)


def midi2wav(out_midi_file, out_wav_file):
    """Convert the MIDI file to WAV and save it."""
    os.system(f"timidity {out_midi_file} -Ow -o {out_wav_file}")


def main():
    """Run melody estimation end to end."""
    # Load the audio file and convert it to a 16 kHz mono WAV file.
    _, audio_samples = load_audiofile(IN_M4A_FILE)
    # Load the TensorFlow model and run pitch estimation.
    pitch_outputs_and_rests = get_pitch(audio_samples)
    # Estimate the ideal offset.
    ideal_offset = get_ideal_offset(pitch_outputs_and_rests)
    # Estimate the melody from the pitch estimates.
    best_notes_and_rests, best_predictions_per_note = get_melody(
        pitch_outputs_and_rests, ideal_offset
    )
    # Convert the estimated melody to MIDI.
    melody2midi(best_notes_and_rests, best_predictions_per_note, OUT_MIDI_FILE)
    # Convert the MIDI file to WAV.
    midi2wav(OUT_MIDI_FILE, OUT_WAVE_FILE)


if __name__ == "__main__":
    main()