# A demonstration of melody estimation with the SPICE pitch model
# brew install timidity
# brew install libsndfile
# pip install tensorflow
# pip install tensorflow_hub
# pip install music21
# pip install pydub
import math
import os
import statistics

import music21
import tensorflow as tf
import tensorflow_hub as hub
from pydub import AudioSegment
from scipy.io import wavfile
# Silence TensorFlow's logger.
tf.get_logger().setLevel("ERROR")
IN_M4A_FILE = "doremi.m4a"
OUT_MIDI_FILE = "output.mid"
OUT_WAVE_FILE = "output.wav"
MAX_ABS_INT16 = 32768.0
EXPECTED_SAMPLE_RATE = 16000
A4 = 440
C0 = A4 * pow(2, -4.75)
CONFIDENCE_THRESH = 0.9
# Note names: the 12-tone chromatic scale (the diatonic notes plus semitones).
NOTE_NAMES = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]
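# A quick check of the note arithmetic used below (standard equal
# temperament, nothing specific to this script): A4 = 440 Hz lies
# 12 * log2(440 / C0) = 57 semitones above C0, so
#   h = 57  ->  octave = 57 // 12 = 4,  NOTE_NAMES[57 % 12] = "A"  ->  "A4".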
def convert_audio_for_model(user_file, output_file="converted_audio_file.wav"):
    """Read an audio file and write it out as 16 kHz mono wav."""
    audio = AudioSegment.from_file(user_file)
    audio = audio.set_frame_rate(EXPECTED_SAMPLE_RATE).set_channels(1)
    audio.export(output_file, format="wav")
    return output_file
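# Note: for non-wav inputs such as .m4a, pydub decodes by shelling out to
# ffmpeg, so ffmpeg must also be installed (e.g. `brew install ffmpeg`).
# A minimal usage sketch:
#
#   wav_path = convert_audio_for_model("doremi.m4a")
#   # -> "converted_audio_file.wav": 16 kHz, mono, ready for SPICE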
def output2hz(pitch_output):
    """Convert the model's pitch-estimation output to Hz."""
    # Constants taken from https://tfhub.dev/google/spice/2
    PT_OFFSET = 25.58
    PT_SLOPE = 63.07
    FMIN = 10.0
    BINS_PER_OCTAVE = 12.0
    cqt_bin = pitch_output * PT_SLOPE + PT_OFFSET
    return FMIN * 2.0 ** (1.0 * cqt_bin / BINS_PER_OCTAVE)
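# Rough sanity check of the mapping (derived from the constants above, not
# quoted from the SPICE docs): the model's pitch output lies in [0, 1], so
#   output2hz(0.0) = 10 * 2 ** (25.58 / 12)            ~   43.8 Hz
#   output2hz(1.0) = 10 * 2 ** ((63.07 + 25.58) / 12)  ~ 1674 Hz
# which comfortably spans the singing-voice range.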
def hz2offset(freq):
    """Compute the quantization error of snapping a melody frequency (Hz) to a note."""
    # This measures the quantization error for a single note.
    if freq == 0:  # Rests always have zero error.
        return None
    # Quantized note.
    h = round(12 * math.log2(freq / C0))
    return 12 * math.log2(freq / C0) - h
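# Worked example (standard pitch arithmetic): a tone sung at 450 Hz sits
# 12 * log2(450 / C0) ~ 57.39 semitones above C0; the nearest note is
# h = 57 (A4), so hz2offset(450) ~ +0.39, i.e. about 0.39 semitone sharp.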
def quantize_predictions(group, ideal_offset):
    """Quantize one group of pitch predictions to a single note or rest."""
    # Group values are either 0, or a pitch in Hz.
    non_zero_values = [v for v in group if v != 0]
    zero_values_count = len(group) - len(non_zero_values)
    # Create a rest if 80% is silent, otherwise create a note.
    if zero_values_count > 0.8 * len(group):
        # Interpret as a rest. Count each dropped note as an error, weighted a bit
        # worse than a badly sung note (which would 'cost' 0.5).
        return 0.51 * len(non_zero_values), "Rest"
    else:
        # Interpret as note, estimating as mean of non-rest predictions.
        h = round(
            statistics.mean(
                [12 * math.log2(freq / C0) - ideal_offset for freq in non_zero_values]
            )
        )
        octave = h // 12
        n = h % 12
        note = NOTE_NAMES[n] + str(octave)
        # Quantization error is the total difference from the quantized note.
        error = sum(
            [
                abs(12 * math.log2(freq / C0) - ideal_offset - h)
                for freq in non_zero_values
            ]
        )
        return error, note
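# Example behaviour (hypothetical frame values):
#   quantize_predictions([440.0, 442.0, 439.0], 0.0)  ->  (~0.12, "A4")
#   quantize_predictions([0] * 9 + [440.0], 0.0)      ->  (0.51, "Rest")
# In the second case 9 of 10 frames are silent (more than 80%), so the
# group becomes a rest and the single dropped pitch is charged 0.51 error.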
def get_quantization_and_error(
    pitch_outputs_and_rests,
    predictions_per_eighth,
    prediction_start_offset,
    ideal_offset,
):
    """Quantize the whole pitch sequence into notes and sum the quantization error."""
    # Apply the start offset - we can just add the offset as rests.
    pitch_outputs_and_rests = [0] * prediction_start_offset + pitch_outputs_and_rests
    # Collect the predictions for each note (or rest).
    groups = [
        pitch_outputs_and_rests[i : i + predictions_per_eighth]
        for i in range(0, len(pitch_outputs_and_rests), predictions_per_eighth)
    ]
    quantization_error = 0
    notes_and_rests = []
    for group in groups:
        error, note_or_rest = quantize_predictions(group, ideal_offset)
        quantization_error += error
        notes_and_rests.append(note_or_rest)
    return quantization_error, notes_and_rests
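# For instance, with predictions_per_eighth = 4 and prediction_start_offset = 2,
# a 10-frame input is padded to [0, 0, f1, ..., f10] and chopped into three
# 4-frame groups, each of which quantize_predictions turns into one note or rest.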
def load_audiofile(file_name):
    """Load an audio file and normalize the samples."""
    converted_audio_file = convert_audio_for_model(file_name)
    sample_rate, audio_samples = wavfile.read(converted_audio_file)
    audio_samples = audio_samples / float(MAX_ABS_INT16)
    return sample_rate, audio_samples
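# wavfile.read returns int16 samples for 16-bit PCM, so dividing by 32768
# rescales them to floats in [-1, 1), the input range the SPICE model expects.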
def get_pitch(audio_samples):
    """Load the model and estimate the pitch track."""
    model = hub.load("https://tfhub.dev/google/spice/2")
    model_output = model.signatures["serving_default"](
        tf.constant(audio_samples, tf.float32)
    )
    pitch_outputs = model_output["pitch"]  # estimated pitch per frame
    uncertainty_outputs = model_output["uncertainty"]  # estimation uncertainty in [0, 1]
    # Invert the uncertainty to obtain a confidence score.
    confidence_outputs = [1.0 - float(c) for c in uncertainty_outputs]
    pitch_outputs = [float(x) for x in pitch_outputs]
    # Keep estimates whose confidence reaches the threshold (0.9) as the final
    # pitch track; insert zeros elsewhere to mark frames without singing voice.
    pitch_outputs_and_rests = [
        output2hz(p) if c >= CONFIDENCE_THRESH else 0
        for p, c in zip(pitch_outputs, confidence_outputs)
    ]
    return pitch_outputs_and_rests
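# SPICE emits one (pitch, uncertainty) pair per input frame (about every
# 32 ms of 16 kHz audio, i.e. a 512-sample hop, per the model description),
# so pitch_outputs_and_rests is a time series of Hz values with zeros
# marking the low-confidence, unvoiced frames.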
def get_ideal_offset(pitch_outputs_and_rests):
    """Estimate the ideal tuning offset as the mean offset of the voiced frames."""
    offsets = [hz2offset(p) for p in pitch_outputs_and_rests if p != 0]
    ideal_offset = statistics.mean(offsets)
    return ideal_offset
def get_melody(pitch_outputs_and_rests, ideal_offset):
    """Get the estimated melody, including rest information."""
    best_error = float("inf")
    best_notes_and_rests = None
    best_predictions_per_note = None
    for predictions_per_note in range(20, 65):
        for prediction_start_offset in range(predictions_per_note):
            error, notes_and_rests = get_quantization_and_error(
                pitch_outputs_and_rests,
                predictions_per_note,
                prediction_start_offset,
                ideal_offset,
            )
            if error < best_error:
                best_error = error
                best_notes_and_rests = notes_and_rests
                best_predictions_per_note = predictions_per_note
    # At this point, best_notes_and_rests contains the best quantization.
    # Since we don't need to have rests at the beginning, let's remove these:
    while best_notes_and_rests[0] == "Rest":
        best_notes_and_rests = best_notes_and_rests[1:]
    # Also remove silence at the end.
    while best_notes_and_rests[-1] == "Rest":
        best_notes_and_rests = best_notes_and_rests[:-1]
    return best_notes_and_rests, best_predictions_per_note
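# The search above is a brute-force grid: every note length from 20 to 64
# predictions and every possible start phase is tried, and the segmentation
# with the smallest total quantization error wins. A usage sketch:
#
#   notes, ppn = get_melody(pitch_outputs_and_rests, ideal_offset)
#   # notes e.g. ["C3", "D3", "Rest", ...]; ppn is the winning note length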
def melody2midi(best_notes_and_rests, best_predictions_per_note, out_midi_file):
    """Write the estimated notes out as a score (MIDI)."""
    # Creating the sheet music score.
    sc = music21.stream.Score()
    # Adjust the speed to match the actual singing.
    bpm = 60 * 60 / best_predictions_per_note
    print("bpm: ", bpm)
    a = music21.tempo.MetronomeMark(number=bpm)
    sc.insert(0, a)
    # The estimated melody (note information).
    print(best_notes_and_rests)
    # Convert the note information to MIDI.
    for snote in best_notes_and_rests:
        d = "half"
        if snote == "Rest":
            sc.append(music21.note.Rest(type=d))
        else:
            sc.append(music21.note.Note(snote, type=d))
    # Save as MIDI.
    sc.write("midi", fp=out_midi_file)
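# On the tempo heuristic above (assuming each prediction covers roughly
# 32 ms): a note of best_predictions_per_note (ppn) predictions lasts about
# 0.032 * ppn seconds, while a half note at bpm = 3600 / ppn lasts
# 120 / bpm = ppn / 30 ~ 0.033 * ppn seconds, so the metronome mark
# approximately matches the sung tempo of the half notes written here.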
def midi2wav(out_midi_file, out_wav_file):
    """Convert the MIDI to wav and save it."""
    os.system(f"timidity {out_midi_file} -Ow -o {out_wav_file}")
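# os.system is fine for a demo; an equivalent call that avoids shell string
# interpolation (same timidity flags: -Ow for wav output, -o for the
# destination file) would be:
#
#   import subprocess
#   subprocess.run(["timidity", out_midi_file, "-Ow", "-o", out_wav_file],
#                  check=True)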
def main():
    """Run the melody estimation end to end."""
    # Read the audio file and write it out as wav.
    _, audio_samples = load_audiofile(IN_M4A_FILE)
    # Load the TensorFlow model and run pitch estimation.
    pitch_outputs_and_rests = get_pitch(audio_samples)
    # Estimate the tuning offset.
    ideal_offset = get_ideal_offset(pitch_outputs_and_rests)
    # Estimate the melody from the pitch track.
    best_notes_and_rests, best_predictions_per_note = get_melody(
        pitch_outputs_and_rests, ideal_offset
    )
    # Convert the estimated melody to MIDI.
    melody2midi(best_notes_and_rests, best_predictions_per_note, OUT_MIDI_FILE)
    # Convert the MIDI to WAVE.
    midi2wav(OUT_MIDI_FILE, OUT_WAVE_FILE)
if __name__ == "__main__":
    main()
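# To run the demo end to end, place doremi.m4a (a short recording of a sung
# scale) next to this script and run it with Python, e.g.:
#
#   python spice_melody_demo.py   # the script name here is just an example
#
# It writes output.mid and output.wav into the working directory.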