Python sample code for the docomo text-to-speech (音声合成) API (https://dev.smt.docomo.ne.jp/?p=docs.api.page&api_name=text_to_speech&p_name=api_reference)
#!/usr/bin/env python | |
import os
import subprocess

import ffmpeg
import requests
class DTALK:
    """Client for the docomo aiTalk text-to-speech HTTP API.

    Posts SSML to the API, stores the returned raw audio in ``tmp_dir``,
    converts it to WAV with ffmpeg and plays the result.

    This script depends on the 'aplay' command for playback.
    """

    # Voice/prosody settings that every user-supplied ``param_dict`` must
    # contain; each value has to be a string because it is concatenated
    # straight into the SSML attributes.
    REQUIRED_PARAMS = ('speaker', 'rate', 'pitch', 'range', 'volume')

    def __init__(self, key, param_dict=None, tmp_dir="./tmp"):
        """Store the API key, the voice settings and the working directory.

        Args:
            key: API key; appended to the request URL as ``APIKEY``.
            param_dict: Optional dict with the string-valued elements
                'speaker', 'rate', 'pitch', 'range' and 'volume'. When it
                is None or empty, built-in default settings are used.
            tmp_dir: Directory that receives the intermediate audio files.

        Raises:
            ValueError: If ``param_dict`` lacks a required element or one
                of the required elements is not a string.
        """
        self.key = key
        self.url = "https://api.apigw.smt.docomo.ne.jp/aiTalk/v1/textToSpeech?APIKEY=" + key
        self.tmp_dir = tmp_dir
        if param_dict:
            for req_element in self.REQUIRED_PARAMS:
                if req_element not in param_dict:
                    raise ValueError(
                        "The element " + req_element + " was not found. It must be included.")
                if not isinstance(param_dict[req_element], str):
                    # Name the offending element so the caller can fix it.
                    raise ValueError(
                        "The element " + req_element + " must be a string.")
            self.param_dict = param_dict
        else:
            self.param_dict = {
                'speaker': 'sumire',
                'pitch': '1.2',
                'range': '1',
                'rate': '1.3',
                'volume': '2.0'
            }

    def _tmp_path(self, filename):
        """Return the path of ``filename`` inside the temporary directory."""
        # os.path.join inserts the separator that plain string concatenation
        # dropped ("./tmp" + "output.wav" used to give "./tmpoutput.wav").
        return os.path.join(self.tmp_dir, filename)

    def _ensure_tmp_dir(self):
        """Create the temporary directory if it does not exist yet."""
        # An empty tmp_dir means "current directory"; nothing to create.
        if self.tmp_dir:
            os.makedirs(self.tmp_dir, exist_ok=True)

    def generate_xml(self, text):
        """Build the SSML request body for ``text``.

        ``text`` is inserted verbatim, so it may contain SSML markup but any
        literal '&' or '<' must already be XML-escaped by the caller.

        Returns:
            The SSML document encoded as UTF-8 bytes.
        """
        params = self.param_dict
        voice = '<voice name="' + params["speaker"] + '">'
        prosody = (
            '<prosody rate="' + params["rate"]
            + '" pitch="' + params["pitch"]
            + '" range="' + params["range"]
            + '" volume="' + params["volume"] + '">'
        )
        xml = (
            '<?xml version="1.0" encoding="utf-8" ?>'
            + '<speak version="1.1">' + voice + prosody + text
            + '</prosody></voice></speak>'
        )
        return xml.encode("UTF-8")

    def pcm2wav(self, pcm_path):
        """Convert the raw PCM file at ``pcm_path`` to ``output.wav``.

        The input is read as 16 kHz, mono, signed 16-bit big-endian samples
        and the WAV file is written into the temporary directory,
        overwriting any previous output.
        """
        self._ensure_tmp_dir()
        stream = ffmpeg.input(pcm_path, f="s16be", ar="16000", ac="1")
        stream = ffmpeg.output(stream, self._tmp_path('output.wav'), loglevel=0)
        stream = ffmpeg.overwrite_output(stream)
        ffmpeg.run(stream)

    def get_wav(self, text):
        """Request synthesized audio for ``text`` from the API.

        An empty ``text`` is replaced by a fixed fallback sentence.

        Returns:
            The response body as bytes, or None when the HTTP status is
            not 200.
        """
        if not text:
            text = "ちょっと何言っているかわからないですね!"
        xml = self.generate_xml(text)
        response = requests.post(
            self.url,
            data=xml,
            headers={
                'Content-Type': 'application/ssml+xml',
                'Accept': 'audio/L16',
                'Content-Length': str(len(xml))
            })
        if response.status_code != 200:
            return None
        return response.content

    def talk(self, text):
        """Synthesize ``text`` and play it through 'aplay'.

        Does nothing when the API request fails or returns no audio.
        """
        content = self.get_wav(text)
        if not content:
            return
        self._ensure_tmp_dir()
        raw_path = self._tmp_path("raw_file")
        with open(raw_path, 'wb') as f:
            f.write(content)
        self.pcm2wav(raw_path)
        # Argument list without a shell: paths containing spaces or shell
        # metacharacters are passed to aplay unchanged.
        subprocess.call(["aplay", "-q", self._tmp_path("output.wav")])
def main():
    """Speak a sample Japanese sentence using explicit voice settings."""
    voice_settings = dict(
        speaker='sumire',
        pitch='1',
        range='1',
        rate='2.0',
        volume='2.0',
    )
    # Replace the placeholder with a real API key before running.
    speaker = DTALK(key="<your-api-key-goes-here>", param_dict=voice_settings)
    speaker.talk("今日はいい天気です。お腹が空きましたか?")


# Run the demo only when this file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment