Open Voice Experiments
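"""Open Voice experiments: render each test sentence with the OpenVoice V1 and
V2 (MeloTTS) base speakers, then clone a reference voice by producing several
variants per style or speaker: tone-color conversion only, voice conversion
only, and both conversions chained in either order."""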
import os

import torch
from melo.api import TTS
from openvoice import se_extractor
from openvoice.api import BaseSpeakerTTS, ToneColorConverter
from speechtoolkit.vc import NS3VCModel

ckpt_converter = 'checkpoints_v2/converter'
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# device = "cpu"
output_dir = 'reference4_variants_2'

tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')
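
# Voice converter from speechtoolkit (presumably a NaturalSpeech 3-style
# any-to-any VC model); used below for the *_vc output variants.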
voice_converter = NS3VCModel(device=device)

os.makedirs(output_dir, exist_ok=True)

reference_speaker = 'resources/reference4.wav'  # This is the voice you want to clone
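# Extract the tone-color embedding of the reference (target) speaker.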
target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, vad=False)

texts = {
    'EN_V1': "Finally, we will be together.",
    'EN': "Did you ever hear a folk tale about a giant turtle?",
    'EN_NEWEST': "Did you ever hear a folk tale about a giant turtle?",  # The newest English base speaker model
    'ES': "El resplandor del sol acaricia las olas, pintando el cielo con una paleta deslumbrante.",
    'FR': "La lueur dorée du soleil caresse les vagues, peignant le ciel d'une palette éblouissante.",
    'ZH': "在这次vacation中,我们计划去Paris欣赏埃菲尔铁塔和卢浮宫的美景。",
    'JP': "彼は毎朝ジョギングをして体を健康に保っています。",
    'KR': "안녕하세요! 오늘은 날씨가 정말 좋네요.",
}

src_path = f'{output_dir}/tmp.wav'

# Speed is adjustable
speed = 1.0
for language, text in texts.items():
    if language == "EN_V1":
        # OpenVoice V1 English base speaker with emotion/style presets.
        styles = ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']
        en_ckpt_base = 'checkpoints/base_speakers/EN'
        en_source_default_se = torch.load(f'{en_ckpt_base}/en_default_se.pth').to(device)
        en_source_style_se = torch.load(f'{en_ckpt_base}/en_style_se.pth').to(device)
        en_base_speaker_tts = BaseSpeakerTTS(f'{en_ckpt_base}/config.json', device=device)
        en_base_speaker_tts.load_ckpt(f'{en_ckpt_base}/checkpoint.pth')
        for style in styles:
            if style == 'default':
                source_se = en_source_default_se
            else:
                source_se = en_source_style_se
            # Override with a different sentence for some emotions.
            style_texts = {
                'angry': "I cannot let myself get too close to you. If I did, I might end up wanting to kill you.",
                'whispering': "Your love is both a curse and a blessing, but it's mine to bear.",
                'sad': "Despite the chaos and darkness that surrounds me, my love for you burns like a twisted wildfire.",
                'shouting': "You have to submit. Just do what I tell you to do and know your place.",
            }
            # Use a local variable so an override does not leak into the next style.
            style_text = style_texts.get(style, text)

            # Render with the V1 model.
            en_base_speaker_tts.tts(style_text, src_path, speaker=style, language='English', speed=speed)
            save_path_intermediate = f'{output_dir}/output_v1_en_{style}_intermediate.wav'
            save_path_tc = f'{output_dir}/output_v1_en_{style}_tc.wav'
            save_path_vc = f'{output_dir}/output_v1_en_{style}_vc.wav'
            save_path_tc_vc = f'{output_dir}/output_v1_en_{style}_tc_vc.wav'
            save_path_vc_tc = f'{output_dir}/output_v1_en_{style}_vc_tc.wav'

            # Run the tone color converter; the message is watermarked into the output audio.
            encode_message = "@MyShell"
            # Variant 1 -> Tone Conversion
            tone_color_converter.convert(
                audio_src_path=src_path,
                src_se=source_se,
                tgt_se=target_se,
                output_path=save_path_tc,
                message=encode_message)

            # Variant 2 -> Voice Conversion
            voice_converter.infer_file(
                src_path,
                reference_speaker,
                save_path_vc
            )

            # Variant 3 -> Tone Conversion -> Voice Conversion
            tone_color_converter.convert(
                audio_src_path=src_path,
                src_se=source_se,
                tgt_se=target_se,
                output_path=save_path_intermediate,
                message=encode_message)
            voice_converter.infer_file(
                save_path_intermediate,
                reference_speaker,
                save_path_tc_vc
            )

            # Variant 4 -> Voice Conversion -> Tone Conversion
            voice_converter.infer_file(
                src_path,
                reference_speaker,
                save_path_intermediate
            )
            tone_color_converter.convert(
                audio_src_path=save_path_intermediate,
                src_se=source_se,
                tgt_se=target_se,
                output_path=save_path_vc_tc,
                message=encode_message)
    else:
        # OpenVoice V2: MeloTTS base speakers for the current language.
        model = TTS(language=language, device=device)
        speaker_ids = model.hps.data.spk2id
        for speaker_key in speaker_ids.keys():
            speaker_id = speaker_ids[speaker_key]
            speaker_key = speaker_key.lower().replace('_', '-')
            source_se = torch.load(f'checkpoints_v2/base_speakers/ses/{speaker_key}.pth', map_location=device)
            model.tts_to_file(text, speaker_id, src_path, speed=speed)
            save_path_intermediate = f'{output_dir}/output_v2_{speaker_key}_intermediate.wav'
            save_path_tc = f'{output_dir}/output_v2_{speaker_key}_tc.wav'
            save_path_vc = f'{output_dir}/output_v2_{speaker_key}_vc.wav'
            save_path_tc_vc = f'{output_dir}/output_v2_{speaker_key}_tc_vc.wav'
            save_path_vc_tc = f'{output_dir}/output_v2_{speaker_key}_vc_tc.wav'

            # Run the tone color converter
            encode_message = "@MyShell"
            # Variant 1 -> Tone Conversion
            tone_color_converter.convert(
                audio_src_path=src_path,
                src_se=source_se,
                tgt_se=target_se,
                output_path=save_path_tc,
                message=encode_message)

            # Variant 2 -> Voice Conversion
            voice_converter.infer_file(
                src_path,
                reference_speaker,
                save_path_vc
            )

            # Variant 3 -> Tone Conversion -> Voice Conversion
            tone_color_converter.convert(
                audio_src_path=src_path,
                src_se=source_se,
                tgt_se=target_se,
                output_path=save_path_intermediate,
                message=encode_message)
            voice_converter.infer_file(
                save_path_intermediate,
                reference_speaker,
                save_path_tc_vc
            )

            # Variant 4 -> Voice Conversion -> Tone Conversion
            voice_converter.infer_file(
                src_path,
                reference_speaker,
                save_path_intermediate
            )
            tone_color_converter.convert(
                audio_src_path=save_path_intermediate,
                src_se=source_se,
                tgt_se=target_se,
                output_path=save_path_vc_tc,
                message=encode_message)
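
A quick way to sanity-check which variant lands closest to the reference voice is to compare speaker embeddings of the generated files against reference4.wav. The sketch below is not part of the original experiment script; it assumes the resemblyzer package is installed and that the output directory already contains the rendered variants.

import os

import numpy as np
from resemblyzer import VoiceEncoder, preprocess_wav

output_dir = 'reference4_variants_2'
reference_speaker = 'resources/reference4.wav'

encoder = VoiceEncoder()
ref_embed = encoder.embed_utterance(preprocess_wav(reference_speaker))

for name in sorted(os.listdir(output_dir)):
    if not name.endswith('.wav') or name == 'tmp.wav':
        continue
    embed = encoder.embed_utterance(preprocess_wav(os.path.join(output_dir, name)))
    # Embeddings are L2-normalised, so the dot product is the cosine similarity.
    print(f'{name}: {np.dot(ref_embed, embed):.3f}')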