# anuragmishra1/microsoft_speech_api_tts.py

## microsoft_speech_api_tts.py
import http.client
import urllib.parse
import json
from xml.etree import ElementTree
import os
import sys
import wave
import time
from datetime import timedelta
import argparse


# The Speech API subscription key is kept out of the source: it is read from the
# KEY_SPEECH environment variable and is None when that variable is not set.
Ocp_Apim_Subscription_Key = os.getenv("KEY_SPEECH")

# From this point on, everything printed is appended to a log file instead of
# being shown on the console.
sys.stdout = open('speech_api_test_text_tts_output.txt', mode='a')

# Reference point for the execution-time report printed at the end of the script.
start_time = time.monotonic()


def input_file(text_file_path):
    """Read the text to be synthesized and publish it as the module-level ``text``.

    Args:
        text_file_path: Path of the text file to read.

    Returns:
        The file's contents as a string, or ``None`` when ``text_file_path``
        is not an existing regular file. In that case a message is printed
        and the global ``text`` is left untouched.
    """
    global text
    if not os.path.isfile(text_file_path):
        print("File doesn't exist in the directory!")
        return None

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(text_file_path, 'r', encoding='utf-8') as text_file:
        text = text_file.read()
    return text


def _post(host, path, body, headers):
    """POST ``body`` to ``https://<host><path>`` and return ``(status, reason, payload)``.

    The connection is closed even if sending the request or reading the
    response raises.
    """
    conn = http.client.HTTPSConnection(host)
    try:
        conn.request("POST", path, body, headers)
        response = conn.getresponse()
        return response.status, response.reason, response.read()
    finally:
        conn.close()


def _strip_riff_header(data):
    """Return the sample bytes of ``data``, dropping a RIFF/WAVE container if present.

    Bytes that do not start with a RIFF/WAVE signature are returned unchanged.
    An empty result is returned for a RIFF/WAVE container without a ``data``
    chunk.
    """
    if data[:4] != b'RIFF' or data[8:12] != b'WAVE':
        return data

    offset = 12
    while offset + 8 <= len(data):
        chunk_id = data[offset:offset + 4]
        chunk_size = int.from_bytes(data[offset + 4:offset + 8], 'little')
        offset += 8
        if chunk_id == b'data':
            end = offset + chunk_size
            # Tolerate a zero or oversized length field by taking every
            # byte that follows the chunk header.
            if chunk_size == 0 or end > len(data):
                end = len(data)
            return data[offset:end]
        # RIFF chunks are word-aligned: an odd-sized chunk is followed by a pad byte.
        offset += chunk_size + (chunk_size & 1)
    return b''


def speech_tts():
    """Synthesize the module-level ``text`` to ``bing_test_text_tts.wav``.

    Exchanges the subscription key for an access token, posts the text as
    SSML to the synthesis endpoint and, when the text is at least three
    characters long, stores the returned audio as a 16 kHz, 16-bit, mono
    WAV file. Progress and HTTP status lines are printed.

    Nothing is requested or written when the input text or the subscription
    key is missing, and nothing is written when either HTTP request is not
    answered with status 200.
    """
    # Looked up through globals() so that a missing input (input_file() never
    # succeeded) is reported here instead of surfacing as a NameError.
    ssml_text = globals().get('text')
    if ssml_text is None:
        print("No input text has been loaded; nothing to synthesize.")
        return
    if not Ocp_Apim_Subscription_Key:
        print("The KEY_SPEECH subscription key is not set; cannot authenticate.")
        return

    params = ""
    headers = {
        'Ocp-Apim-Subscription-Key': Ocp_Apim_Subscription_Key,
    }

    AccessTokenHost = "api.cognitive.microsoft.com"
    path = "/sts/v1.0/issueToken"

    # Connect to server to get the Access Token
    print("Connect to server to get the Access Token")
    status, reason, data = _post(AccessTokenHost, path, params, headers)
    print(status, reason)
    if status != 200:
        # An error body is not a token; using it as one can only fail.
        print("Access token request failed; synthesis skipped.")
        return

    accesstoken = data.decode("UTF-8")

    # Build the SSML document: <speak><voice ...>text</voice></speak>
    body = ElementTree.Element('speak', version='1.0')
    body.set('{http://www.w3.org/XML/1998/namespace}lang', 'en-us')
    voice = ElementTree.SubElement(body, 'voice')
    voice.set('{http://www.w3.org/XML/1998/namespace}lang', 'en-US')
    voice.set('{http://www.w3.org/XML/1998/namespace}gender', 'Female')
    voice.set('name', 'Microsoft Server Speech Text to Speech Voice (en-US, ZiraRUS)')
    voice.text = ssml_text

    headers = {"Content-type": "application/ssml+xml",
               "X-Microsoft-OutputFormat": "riff-16khz-16bit-mono-pcm",
               "Authorization": "Bearer " + accesstoken,
               "X-Search-AppId": "07D3234E49CE426DAA29772419F436CA",
               "X-Search-ClientID": "1ECFAE91408841A480F00935DC390960",
               "User-Agent": "TTSForPython"}

    # Connect to server to synthesize the wave
    # NOTE(review): this looks like the legacy Bing Speech endpoint - confirm
    # it is still the intended service.
    print("\nConnect to server to synthesize the wave")
    status, reason, data = _post("speech.platform.bing.com", "/synthesize",
                                 ElementTree.tostring(body), headers)
    print(status, reason)
    print("The synthesized wave length: %d" % (len(data)))
    if status != 200:
        # Do not store an error payload as if it were audio.
        print("Synthesis request failed; no audio file written.")
        return

    if len(ssml_text) >= 3:
        # The requested "riff-..." output format is already a RIFF/WAVE
        # container, so only its sample data is handed to the wave writer;
        # otherwise the service's header would be stored as audio samples
        # behind a second header.
        with wave.open('bing_test_text_tts.wav', 'wb') as wf:
            wf.setframerate(16000)
            wf.setnchannels(1)
            wf.setsampwidth(2)
            wf.writeframes(_strip_riff_header(data))


if __name__ == '__main__':
    # Command-line entry point: read the given text file and synthesize it.
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        'text_file_path',
        help='The complete file path of the text file you want to convert from text to speech.')
    args = parser.parse_args()

    input_file(args.text_file_path)
    # input_file() only prints a message when the file is missing and leaves
    # the global ``text`` undefined; skip synthesis then instead of failing
    # with a NameError after the access-token request has been made.
    if 'text' in globals():
        speech_tts()


# Report how long the run took. These statements sit outside the __main__
# guard, so they also execute when the module is imported.
end_time = time.monotonic()
print("Execution_Time:", timedelta(seconds=end_time - start_time))
print('\n')
	import http.client
	import urllib.parse
	import json
	from xml.etree import ElementTree
	import os
	import sys
	import wave
	import time
	from datetime import timedelta
	import argparse


	# The Speech API subscription key is kept out of the source: it is read from the
	# KEY_SPEECH environment variable and is None when that variable is not set.
	Ocp_Apim_Subscription_Key = os.getenv("KEY_SPEECH")

	# From this point on, everything printed is appended to a log file instead of
	# being shown on the console.
	sys.stdout = open('speech_api_test_text_tts_output.txt', mode='a')

	# Reference point for the execution-time report printed at the end of the script.
	start_time = time.monotonic()


	def input_file(text_file_path):
	    """Read the text to be synthesized and publish it as the module-level ``text``.

	    Args:
	        text_file_path: Path of the text file to read.

	    Returns:
	        The file's contents as a string, or ``None`` when ``text_file_path``
	        is not an existing regular file. In that case a message is printed
	        and the global ``text`` is left untouched.
	    """
	    global text
	    if not os.path.isfile(text_file_path):
	        print("File doesn't exist in the directory!")
	        return None

	    # Decode explicitly as UTF-8 so the result does not depend on the
	    # platform's default locale encoding.
	    with open(text_file_path, 'r', encoding='utf-8') as text_file:
	        text = text_file.read()
	    return text


	def _post(host, path, body, headers):
	    """POST ``body`` to ``https://<host><path>`` and return ``(status, reason, payload)``.

	    The connection is closed even if sending the request or reading the
	    response raises.
	    """
	    conn = http.client.HTTPSConnection(host)
	    try:
	        conn.request("POST", path, body, headers)
	        response = conn.getresponse()
	        return response.status, response.reason, response.read()
	    finally:
	        conn.close()


	def _strip_riff_header(data):
	    """Return the sample bytes of ``data``, dropping a RIFF/WAVE container if present.

	    Bytes that do not start with a RIFF/WAVE signature are returned unchanged.
	    An empty result is returned for a RIFF/WAVE container without a ``data``
	    chunk.
	    """
	    if data[:4] != b'RIFF' or data[8:12] != b'WAVE':
	        return data

	    offset = 12
	    while offset + 8 <= len(data):
	        chunk_id = data[offset:offset + 4]
	        chunk_size = int.from_bytes(data[offset + 4:offset + 8], 'little')
	        offset += 8
	        if chunk_id == b'data':
	            end = offset + chunk_size
	            # Tolerate a zero or oversized length field by taking every
	            # byte that follows the chunk header.
	            if chunk_size == 0 or end > len(data):
	                end = len(data)
	            return data[offset:end]
	        # RIFF chunks are word-aligned: an odd-sized chunk is followed by a pad byte.
	        offset += chunk_size + (chunk_size & 1)
	    return b''


	def speech_tts():
	    """Synthesize the module-level ``text`` to ``bing_test_text_tts.wav``.

	    Exchanges the subscription key for an access token, posts the text as
	    SSML to the synthesis endpoint and, when the text is at least three
	    characters long, stores the returned audio as a 16 kHz, 16-bit, mono
	    WAV file. Progress and HTTP status lines are printed.

	    Nothing is requested or written when the input text or the subscription
	    key is missing, and nothing is written when either HTTP request is not
	    answered with status 200.
	    """
	    # Looked up through globals() so that a missing input (input_file() never
	    # succeeded) is reported here instead of surfacing as a NameError.
	    ssml_text = globals().get('text')
	    if ssml_text is None:
	        print("No input text has been loaded; nothing to synthesize.")
	        return
	    if not Ocp_Apim_Subscription_Key:
	        print("The KEY_SPEECH subscription key is not set; cannot authenticate.")
	        return

	    params = ""
	    headers = {
	        'Ocp-Apim-Subscription-Key': Ocp_Apim_Subscription_Key,
	    }

	    AccessTokenHost = "api.cognitive.microsoft.com"
	    path = "/sts/v1.0/issueToken"

	    # Connect to server to get the Access Token
	    print("Connect to server to get the Access Token")
	    status, reason, data = _post(AccessTokenHost, path, params, headers)
	    print(status, reason)
	    if status != 200:
	        # An error body is not a token; using it as one can only fail.
	        print("Access token request failed; synthesis skipped.")
	        return

	    accesstoken = data.decode("UTF-8")

	    # Build the SSML document: <speak><voice ...>text</voice></speak>
	    body = ElementTree.Element('speak', version='1.0')
	    body.set('{http://www.w3.org/XML/1998/namespace}lang', 'en-us')
	    voice = ElementTree.SubElement(body, 'voice')
	    voice.set('{http://www.w3.org/XML/1998/namespace}lang', 'en-US')
	    voice.set('{http://www.w3.org/XML/1998/namespace}gender', 'Female')
	    voice.set('name', 'Microsoft Server Speech Text to Speech Voice (en-US, ZiraRUS)')
	    voice.text = ssml_text

	    headers = {"Content-type": "application/ssml+xml",
	               "X-Microsoft-OutputFormat": "riff-16khz-16bit-mono-pcm",
	               "Authorization": "Bearer " + accesstoken,
	               "X-Search-AppId": "07D3234E49CE426DAA29772419F436CA",
	               "X-Search-ClientID": "1ECFAE91408841A480F00935DC390960",
	               "User-Agent": "TTSForPython"}

	    # Connect to server to synthesize the wave
	    # NOTE(review): this looks like the legacy Bing Speech endpoint - confirm
	    # it is still the intended service.
	    print("\nConnect to server to synthesize the wave")
	    status, reason, data = _post("speech.platform.bing.com", "/synthesize",
	                                 ElementTree.tostring(body), headers)
	    print(status, reason)
	    print("The synthesized wave length: %d" % (len(data)))
	    if status != 200:
	        # Do not store an error payload as if it were audio.
	        print("Synthesis request failed; no audio file written.")
	        return

	    if len(ssml_text) >= 3:
	        # The requested "riff-..." output format is already a RIFF/WAVE
	        # container, so only its sample data is handed to the wave writer;
	        # otherwise the service's header would be stored as audio samples
	        # behind a second header.
	        with wave.open('bing_test_text_tts.wav', 'wb') as wf:
	            wf.setframerate(16000)
	            wf.setnchannels(1)
	            wf.setsampwidth(2)
	            wf.writeframes(_strip_riff_header(data))


	if __name__ == '__main__':
	    # Command-line entry point: read the given text file and synthesize it.
	    parser = argparse.ArgumentParser(
	        description=__doc__,
	        formatter_class=argparse.RawDescriptionHelpFormatter)
	    parser.add_argument(
	        'text_file_path',
	        help='The complete file path of the text file you want to convert from text to speech.')
	    args = parser.parse_args()

	    input_file(args.text_file_path)
	    # input_file() only prints a message when the file is missing and leaves
	    # the global ``text`` undefined; skip synthesis then instead of failing
	    # with a NameError after the access-token request has been made.
	    if 'text' in globals():
	        speech_tts()


	# Report how long the run took. These statements sit outside the __main__
	# guard, so they also execute when the module is imported.
	end_time = time.monotonic()
	print("Execution_Time:", timedelta(seconds=end_time - start_time))
	print('\n')