# Open Voice Experiments
# Gist by @RuntimeRacer — last active March 13, 2025.
# --- Imports (grouped at top; `from melo.api import TTS` moved up from mid-script) ---
import os

import torch

from melo.api import TTS
from openvoice import se_extractor
from openvoice.api import BaseSpeakerTTS, ToneColorConverter
from speechtoolkit.vc import NS3VCModel

# --- Runtime configuration ---------------------------------------------------
ckpt_converter = 'checkpoints_v2/converter'
# Prefer the first CUDA device when available; fall back to CPU otherwise.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# device = "cpu"  # uncomment to force CPU inference
output_dir = 'reference4_variants_2'

# OpenVoice v2 tone-color converter, used for the *_tc output variants below.
tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')

# Voice-conversion model, used for the *_vc output variants below.
voice_converter = NS3VCModel(device=device)

os.makedirs(output_dir, exist_ok=True)

# Reference recording of the voice to clone; its speaker embedding (target_se)
# is the conversion target for every rendered variant.
reference_speaker = 'resources/reference4.wav'
target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, vad=False)
# One sample sentence per language/model key. 'EN_V1' drives the legacy V1
# English base speaker; 'EN_NEWEST' is the newest English base speaker model.
texts = dict(
    EN_V1="Finally, we will be together.",
    EN="Did you ever hear a folk tale about a giant turtle?",
    EN_NEWEST="Did you ever hear a folk tale about a giant turtle?",
    ES="El resplandor del sol acaricia las olas, pintando el cielo con una paleta deslumbrante.",
    FR="La lueur dorée du soleil caresse les vagues, peignant le ciel d'une palette éblouissante.",
    ZH="在这次vacation中,我们计划去Paris欣赏埃菲尔铁塔和卢浮宫的美景。",
    JP="彼は毎朝ジョギングをして体を健康に保っています。",
    KR="안녕하세요! 오늘은 날씨가 정말 좋네요.",
)
# Playback speed multiplier for every TTS render (1.0 = normal pace).
speed = 1.0
# Scratch file: each TTS take is written here before conversion.
src_path = f'{output_dir}/tmp.wav'
# Watermark message embedded by the tone-color converter into every output.
encode_message = "@MyShell"

# Per-style sentence overrides for the V1 emotion styles. Hoisted out of the
# loop (constant), and looked up with .get() so styles without an override use
# the EN_V1 base sentence — the original mutated `text` in place, so every
# style after 'shouting' kept rendering the previous override's sentence.
style_texts = {
    'angry': "I cannot let myself get too close to you. If I did, I might end up wanting to kill you.",
    'whispering': "Your love is both a curse and a blessing, but it's mine to bear.",
    'sad': "Despite the chaos and darkness that surrounds me, my love for you burns like a twisted wildfire.",
    'shouting': "You have to submit. Just do what i tell you to do and know your place.",
}


def _render_variants(source_se, name_prefix):
    """Render the four conversion variants of the current TTS take (src_path).

    Variant 1 (*_tc.wav):    tone-color conversion only
    Variant 2 (*_vc.wav):    voice conversion only
    Variant 3 (*_tc_vc.wav): tone-color conversion, then voice conversion
    Variant 4 (*_vc_tc.wav): voice conversion, then tone-color conversion

    Uses the module-level converters, target_se, reference_speaker, src_path
    and output_dir configured above. `name_prefix` selects the output names,
    e.g. 'output_v1_en_angry' or 'output_v2_en-us'.
    """
    intermediate = f'{output_dir}/{name_prefix}_intermediate.wav'
    save_path_tc = f'{output_dir}/{name_prefix}_tc.wav'
    save_path_vc = f'{output_dir}/{name_prefix}_vc.wav'
    save_path_tc_vc = f'{output_dir}/{name_prefix}_tc_vc.wav'
    save_path_vc_tc = f'{output_dir}/{name_prefix}_vc_tc.wav'

    # Variant 1 -> tone conversion
    tone_color_converter.convert(
        audio_src_path=src_path,
        src_se=source_se,
        tgt_se=target_se,
        output_path=save_path_tc,
        message=encode_message)

    # Variant 2 -> voice conversion
    voice_converter.infer_file(src_path, reference_speaker, save_path_vc)

    # Variant 3 -> tone conversion, then voice conversion
    tone_color_converter.convert(
        audio_src_path=src_path,
        src_se=source_se,
        tgt_se=target_se,
        output_path=intermediate,
        message=encode_message)
    voice_converter.infer_file(intermediate, reference_speaker, save_path_tc_vc)

    # Variant 4 -> voice conversion, then tone conversion
    voice_converter.infer_file(src_path, reference_speaker, intermediate)
    tone_color_converter.convert(
        audio_src_path=intermediate,
        src_se=source_se,
        tgt_se=target_se,
        output_path=save_path_vc_tc,
        message=encode_message)


for language, text in texts.items():
    if language == "EN_V1":
        # --- OpenVoice V1 English base speaker with explicit emotion styles ---
        styles = ['default', 'whispering', 'shouting', 'excited', 'cheerful',
                  'terrified', 'angry', 'sad', 'friendly']
        en_ckpt_base = 'checkpoints/base_speakers/EN'
        en_source_default_se = torch.load(f'{en_ckpt_base}/en_default_se.pth').to(device)
        en_source_style_se = torch.load(f'{en_ckpt_base}/en_style_se.pth').to(device)
        en_base_speaker_tts = BaseSpeakerTTS(f'{en_ckpt_base}/config.json', device=device)
        en_base_speaker_tts.load_ckpt(f'{en_ckpt_base}/checkpoint.pth')
        for style in styles:
            # 'default' has its own source embedding; every other style shares
            # the single "style" embedding.
            source_se = en_source_default_se if style == 'default' else en_source_style_se
            style_text = style_texts.get(style, text)
            # Render with the V1 model, then produce all four variants.
            en_base_speaker_tts.tts(style_text, src_path, speaker=style,
                                    language='English', speed=speed)
            _render_variants(source_se, f'output_v1_en_{style}')
    else:
        # --- OpenVoice V2 (MeloTTS) base speakers, one render per speaker ---
        model = TTS(language=language, device=device)
        speaker_ids = model.hps.data.spk2id
        for speaker_key in speaker_ids.keys():
            speaker_id = speaker_ids[speaker_key]
            # Speaker-embedding checkpoints are named with lowercase,
            # dash-separated keys (e.g. EN_US -> en-us).
            speaker_key = speaker_key.lower().replace('_', '-')
            source_se = torch.load(f'checkpoints_v2/base_speakers/ses/{speaker_key}.pth',
                                   map_location=device)
            model.tts_to_file(text, speaker_id, src_path, speed=speed)
            _render_variants(source_se, f'output_v2_{speaker_key}')
# End of gist.