# Open Voice Experiments
# Gist by @RuntimeRacer — last active March 13, 2025.
# --- Imports (grouped at top; `from melo.api import TTS` moved up from mid-script) ---
import os

import torch

from melo.api import TTS
from openvoice import se_extractor
from openvoice.api import BaseSpeakerTTS, ToneColorConverter
from speechtoolkit.vc import NS3VCModel

# --- Runtime configuration ---------------------------------------------------
ckpt_converter = 'checkpoints_v2/converter'
# Prefer the first CUDA device when available; fall back to CPU otherwise.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# device = "cpu"  # uncomment to force CPU inference
output_dir = 'reference4_variants_2'

# OpenVoice v2 tone-color converter, used for the *_tc output variants below.
tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')

# Voice-conversion model, used for the *_vc output variants below.
voice_converter = NS3VCModel(device=device)

os.makedirs(output_dir, exist_ok=True)

# Reference recording of the voice to clone; its speaker embedding (target_se)
# is the conversion target for every rendered variant.
reference_speaker = 'resources/reference4.wav'
target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, vad=False)
# One sample sentence per language/model key. 'EN_V1' drives the legacy V1
# English base speaker; 'EN_NEWEST' is the newest English base speaker model.
texts = dict(
    EN_V1="Finally, we will be together.",
    EN="Did you ever hear a folk tale about a giant turtle?",
    EN_NEWEST="Did you ever hear a folk tale about a giant turtle?",
    ES="El resplandor del sol acaricia las olas, pintando el cielo con una paleta deslumbrante.",
    FR="La lueur dorée du soleil caresse les vagues, peignant le ciel d'une palette éblouissante.",
    ZH="在这次vacation中,我们计划去Paris欣赏埃菲尔铁塔和卢浮宫的美景。",
    JP="彼は毎朝ジョギングをして体を健康に保っています。",
    KR="안녕하세요! 오늘은 날씨가 정말 좋네요.",
)
# Playback speed multiplier for every TTS render (1.0 = normal pace).
speed = 1.0
# Scratch file: each TTS take is written here before conversion.
src_path = f'{output_dir}/tmp.wav'
# Watermark message embedded by the tone-color converter into every output.
encode_message = "@MyShell"

# Per-style sentence overrides for the V1 emotion styles. Hoisted out of the
# loop (constant), and looked up with .get() so styles without an override use
# the EN_V1 base sentence — the original mutated `text` in place, so every
# style after 'shouting' kept rendering the previous override's sentence.
style_texts = {
    'angry': "I cannot let myself get too close to you. If I did, I might end up wanting to kill you.",
    'whispering': "Your love is both a curse and a blessing, but it's mine to bear.",
    'sad': "Despite the chaos and darkness that surrounds me, my love for you burns like a twisted wildfire.",
    'shouting': "You have to submit. Just do what i tell you to do and know your place.",
}


def _render_variants(source_se, name_prefix):
    """Render the four conversion variants of the current TTS take (src_path).

    Variant 1 (*_tc.wav):    tone-color conversion only
    Variant 2 (*_vc.wav):    voice conversion only
    Variant 3 (*_tc_vc.wav): tone-color conversion, then voice conversion
    Variant 4 (*_vc_tc.wav): voice conversion, then tone-color conversion

    Uses the module-level converters, target_se, reference_speaker, src_path
    and output_dir configured above. `name_prefix` selects the output names,
    e.g. 'output_v1_en_angry' or 'output_v2_en-us'.
    """
    intermediate = f'{output_dir}/{name_prefix}_intermediate.wav'
    save_path_tc = f'{output_dir}/{name_prefix}_tc.wav'
    save_path_vc = f'{output_dir}/{name_prefix}_vc.wav'
    save_path_tc_vc = f'{output_dir}/{name_prefix}_tc_vc.wav'
    save_path_vc_tc = f'{output_dir}/{name_prefix}_vc_tc.wav'

    # Variant 1 -> tone conversion
    tone_color_converter.convert(
        audio_src_path=src_path,
        src_se=source_se,
        tgt_se=target_se,
        output_path=save_path_tc,
        message=encode_message)

    # Variant 2 -> voice conversion
    voice_converter.infer_file(src_path, reference_speaker, save_path_vc)

    # Variant 3 -> tone conversion, then voice conversion
    tone_color_converter.convert(
        audio_src_path=src_path,
        src_se=source_se,
        tgt_se=target_se,
        output_path=intermediate,
        message=encode_message)
    voice_converter.infer_file(intermediate, reference_speaker, save_path_tc_vc)

    # Variant 4 -> voice conversion, then tone conversion
    voice_converter.infer_file(src_path, reference_speaker, intermediate)
    tone_color_converter.convert(
        audio_src_path=intermediate,
        src_se=source_se,
        tgt_se=target_se,
        output_path=save_path_vc_tc,
        message=encode_message)


for language, text in texts.items():
    if language == "EN_V1":
        # --- OpenVoice V1 English base speaker with explicit emotion styles ---
        styles = ['default', 'whispering', 'shouting', 'excited', 'cheerful',
                  'terrified', 'angry', 'sad', 'friendly']
        en_ckpt_base = 'checkpoints/base_speakers/EN'
        en_source_default_se = torch.load(f'{en_ckpt_base}/en_default_se.pth').to(device)
        en_source_style_se = torch.load(f'{en_ckpt_base}/en_style_se.pth').to(device)
        en_base_speaker_tts = BaseSpeakerTTS(f'{en_ckpt_base}/config.json', device=device)
        en_base_speaker_tts.load_ckpt(f'{en_ckpt_base}/checkpoint.pth')
        for style in styles:
            # 'default' has its own source embedding; every other style shares
            # the single "style" embedding.
            source_se = en_source_default_se if style == 'default' else en_source_style_se
            style_text = style_texts.get(style, text)
            # Render with the V1 model, then produce all four variants.
            en_base_speaker_tts.tts(style_text, src_path, speaker=style,
                                    language='English', speed=speed)
            _render_variants(source_se, f'output_v1_en_{style}')
    else:
        # --- OpenVoice V2 (MeloTTS) base speakers, one render per speaker ---
        model = TTS(language=language, device=device)
        speaker_ids = model.hps.data.spk2id
        for speaker_key in speaker_ids.keys():
            speaker_id = speaker_ids[speaker_key]
            # Speaker-embedding checkpoints are named with lowercase,
            # dash-separated keys (e.g. EN_US -> en-us).
            speaker_key = speaker_key.lower().replace('_', '-')
            source_se = torch.load(f'checkpoints_v2/base_speakers/ses/{speaker_key}.pth',
                                   map_location=device)
            model.tts_to_file(text, speaker_id, src_path, speed=speed)
            _render_variants(source_se, f'output_v2_{speaker_key}')
# End of gist.