Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Jackiexiao/b2193b2390eeab2366cfe7488884a74d to your computer and use it in GitHub Desktop.
Save Jackiexiao/b2193b2390eeab2366cfe7488884a74d to your computer and use it in GitHub Desktop.
使用ModelScope生成带CMU音素标注的美式英文数据1男1女
"""
使用Modelscope开源美式英文TTS生成语音数据,仅供学习分享交流之用,如有侵权,请联系我删除
Use Modelscope's OpenSource TTS to generate English(en-us) speech data, for learning and sharing only, if there is any infringement, please contact me to delete it
下载地址 Download link
https://pan.baidu.com/s/1qUjBhCVknOTV-xm4VBEuDQ?pwd=uqbd
数据示例data example:
annie|annie_LJ001-0002_0.wav|IH0 N #1 B IY1 IH0 NG #1 K AH0 M P EH1 R AH0 T IH0 V L IY0 #1 M AA1 D ER0 N #4|in being comparatively modern.
约40小时,16khz单声道,29912句,一男一女,美式英语
About 40 hours, 16khz mono, 29912 sentences
优点是带了精确的CMU标注,缺点是阿里开源的TTS生成的语音音质不太好,有杂音
The advantage is that it has accurate CMU annotations, but the sound quality of the TTS generated by Ali open source is not very good, with noise
来源Source:
- text: ljspeech
- https://modelscope.cn/models/damo/speech_sambert-hifigan_tts_andy_en-us_16k/summary
- https://modelscope.cn/models/damo/speech_sambert-hifigan_tts_annie_en-us_16k/summary
Usage:
pip install "modelscope[audio]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
pip install --find-links https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html ttsfrd
CUDA_VISIBLE_DEVICES=0 python local/generate_ali_en_tts_data.py --outdir ${your_dir} --file LJSpeech-1.1-metadata.csv --spk annie
CUDA_VISIBLE_DEVICES=0 python local/generate_ali_en_tts_data.py --outdir ${your_dir} --file LJSpeech-1.1-metadata.csv --spk andy
"""
import re
import json
from pathlib import Path
from scipy.io.wavfile import write
from tqdm import tqdm
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
# from modelscope.models.audio.tts.sambert_hifi import SambertHifigan
# tts_engine = SambertHifigan(model_dir, am='xx', vocoder = 'xx', lang_type='xx')
def format_ttsfrd_output(res, text):
    """Convert raw ttsfrd frontend output into a space-separated phoneme string.

    Args:
        res: Frontend output, one ``"<idx>\\t<phoneme infos>"`` line per
            sentence; each phoneme info looks like
            ``'{h_c$tone3$s_begin$word_middle$emotion_neutral$F7}'``.
        text: Original input text; used only in warning messages.

    Returns:
        All phonemes joined by single spaces.  Vowel phonemes get their tone
        digit appended (e.g. ``"ih1"``); prosodic break markers such as
        ``"#1"`` are kept unchanged.
    """
    phone_list = []
    for sen in res.split("\n"):
        if not sen:
            continue
        sen_idx, phoneme_infos = sen.split("\t")
        for info_str in phoneme_infos.split():
            # info_str example: '{h_c$tone3$s_begin$word_middle$emotion_neutral$F7}'
            infos = info_str[1:-1].split("$")
            phone = infos[0]
            tone = infos[1].replace("tone", "")
            if phone.startswith("#"):
                # Prosodic break marker (#1..#4): keep as-is.
                pass
            elif "_c" in phone:  # Chinese pinyin, suffixed to distinguish from English cmudict
                if re.search(r"[aeiouvr]", phone):
                    # Chinese vowels carry a tone digit 0-5.
                    if not re.match(r"[0-5]", tone):
                        print(f"非法中文声调 {text} error: {phone} {tone}")
                    phone += tone
            elif re.search(r"g[a-z]", phone):
                # Consonant of a single-vowel syllable takes no tone, e.g.
                # 嗯: 有 ge$tone_5 en_c$tone_5, and many similar go/ga cases.
                pass
            elif re.search(r"[a-z]", phone):  # English phoneme
                if re.search(r"[aeiou]", phone):  # vowels take a stress digit 0-2
                    if not re.match(r"[0-2]", tone):
                        print(f"非法英文声调 {text} error: {phone} {tone}")
                    phone += tone
            else:
                print(f"{text} 含有未知音素 error: {info_str}, {phone}, {tone}")
            phone_list.append(phone)
    return " ".join(phone_list)
# Inventory of CMUdict (ARPAbet) phonemes used to validate the generated
# labels: vowels carry a stress digit (0/1/2), consonants carry none.
CMU_PHONEMES = [
"AA0",
"AA1",
"AA2",
"AE0",
"AE1",
"AE2",
"AH0",
"AH1",
"AH2",
"AO0",
"AO1",
"AO2",
"AW0",
"AW1",
"AW2",
"AY0",
"AY1",
"AY2",
"B",
"CH",
"D",
"DH",
"EH0",
"EH1",
"EH2",
"ER0",
"ER1",
"ER2",
"EY0",
"EY1",
"EY2",
"F",
"G",
"HH",
"IH0",
"IH1",
"IH2",
"IY0",
"IY1",
"IY2",
"JH",
"K",
"L",
"M",
"N",
"NG",
"OW0",
"OW1",
"OW2",
"OY0",
"OY1",
"OY2",
"P",
"R",
"S",
"SH",
"T",
"TH",
"UH0",
"UH1",
"UH2",
"UW",  # NOTE(review): bare "UW" has no stress digit; standard CMUdict only has UW0/1/2 — presumably the frontend can emit it unstressed; confirm
"UW0",
"UW1",
"UW2",
"V",
"W",
"Y",
"Z",
"ZH",
]
# Set form for O(1) membership checks in the validation loop.
SET_CMU_PHONEMES = set(CMU_PHONEMES)
# Maps the --spk CLI value to the corresponding ModelScope model id.
spk_model_id = {
"annie": "damo/speech_sambert-hifigan_tts_annie_en-us_16k",
"andy": "damo/speech_sambert-hifigan_tts_andy_en-us_16k",
}
def clean_text(text: str):
    """删除不表示停顿的 句号 (drop periods that do not mark a sentence pause)."""
    # "i.e." is replaced outright so its dots are not read as sentence breaks.
    text = text.replace("i.e. ", "IE ")
    # Titles and honorifics: strip the trailing period, keep the word.
    titles = (
        "Mr", "Mrs", "Ms", "Dr", "Prof", "St", "Sr", "Jr",
        "Maj", "Gen", "Col", "Lt", "Capt", "Hon", "Sen", "Rep", "Gov",
    )
    for title in titles:
        text = text.replace(f"{title}. ", f"{title} ")
    # "U.S." collapses to "US" so neither dot is treated as a stop.
    return text.replace("U.S.", "US")
def split_by_punc(text: str, puncs: str = ".!?;"):
    """Split *text* after each major punctuation mark in *puncs*.

    A sentinel character is inserted after every punctuation mark in a single
    pass, then the text is split on it, so every chunk keeps its trailing
    punctuation.  A trailing mark yields a final empty chunk, exactly like a
    per-punctuation ``str.replace`` loop would.
    """
    marked = "".join(ch + "▁" if ch in puncs else ch for ch in text)
    return marked.split("▁")
def main(args):
    """Synthesize LJSpeech texts with the ModelScope TTS and dump wavs + labels.

    Reads an LJSpeech-style metadata file (``id|raw_text|normalized_text``),
    splits each normalized text on major punctuation, synthesizes every chunk
    to ``outdir/wavs_16k/<spk>_<id>_<j>.wav``, and writes:
      * ``<spk>_ali_en_us_16k.csv`` — ``spk|wav_path|phonemes|text`` lines
      * ``<spk>_ling_info.json``    — raw frontend output per wav
    Finally sanity-checks that every non-break phoneme is a known CMU symbol.
    """
    wav_dir = args.outdir / "wavs_16k"
    wav_dir.mkdir(parents=True, exist_ok=True)
    model_id = spk_model_id[args.spk]
    spk = args.spk

    # Build the synthesis work list: one entry per punctuation-delimited chunk.
    total_lines = []
    with open(args.file, "r", encoding="utf-8") as f:
        lines = f.readlines()
    for line in lines:
        # LJSpeech metadata: id|transcription|normalized transcription.
        # BUG FIX: the original unpacked both text fields into the same name
        # (`wav_id, ori_text, ori_text = ...`); keep the normalized text.
        wav_id, _raw_text, ori_text = line.strip().split("|")
        # 按大标点符号分割,因为这里的 tts_engine 拼接句子时中间没有加入停顿
        # (split on major punctuation — the engine inserts no pause between
        # concatenated sentences).
        sub_texts = split_by_punc(clean_text(ori_text))
        sub_texts = [x.strip() for x in sub_texts if x.strip()]
        for j, sub_text in enumerate(sub_texts):
            wav_path = str(wav_dir / f"{spk}_{wav_id}_{j}.wav")
            total_lines.append([wav_path, sub_text])

    tts_engine = pipeline(task=Tasks.text_to_speech, model=model_id)
    # Private attribute access to the SambertHifigan frontend, needed for
    # gen_tacotron_symbols (no public API exposes it).
    frontend = tts_engine.model._SambertHifigan__frontend

    ling_info = {}
    data = []
    with open(args.outdir / f"{spk}_ali_en_us_16k.csv", "w", encoding="utf8") as wf:
        for wav_path, text in tqdm(total_lines):
            result = tts_engine(input=text)
            wav = result[OutputKeys.OUTPUT_PCM]
            write(wav_path, 16000, wav)
            linguistic_info = frontend.gen_tacotron_symbols(text)
            ling_info[wav_path] = {"info": linguistic_info, "text": text}
            phonemes = format_ttsfrd_output(linguistic_info, text).upper()
            item = [spk, wav_path, phonemes, text]
            data.append(item)
            wf.write("|".join(item) + "\n")

    with open(args.outdir / f"{spk}_ling_info.json", "w", encoding="utf-8") as wf:
        json.dump(ling_info, wf, indent=4, ensure_ascii=False)

    # Sanity check: every non-break phoneme must be in the CMU inventory.
    # BUG FIX: the original unpacked 6 names from the 4-element item
    # (`spk, _, _, wav_path, phonemes, text = line`), which raises ValueError.
    for spk, wav_path, phonemes, text in data:
        for phone in phonemes.split():
            if phone.startswith("#"):
                continue
            if phone not in SET_CMU_PHONEMES:
                print(f"{phone} not in CMU_PHONEMES: {text=}")
                break
if __name__ == "__main__":
    # CLI entry point: metadata file, output directory, and speaker choice.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--file", default=None)
    parser.add_argument("--outdir", default="./temp", type=Path)
    parser.add_argument("--spk", default="annie", type=str)
    main(parser.parse_args())
@Jackiexiao
Copy link
Author

阿里开源的前端也带有 TN、G2P、Prosody 预测功能,也可以用于生成 TTS前端数据

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment