Skip to content

Instantly share code, notes, and snippets.

@Ending2015a
Created September 16, 2018 03:41
Show Gist options
  • Save Ending2015a/24951808e947348ef9a7d5bdb659ae9f to your computer and use it in GitHub Desktop.
Save Ending2015a/24951808e947348ef9a7d5bdb659ae9f to your computer and use it in GitHub Desktop.
A simple text-to-speech script to recite English vocabulary & Chinese definition using Microsoft SAPI
# you should install pypiwin32 first: pip install pypiwin32
import win32com.client
import re
import os
# Settings shared by get_wordsets() and create_wav().
config = {
    'delimiter': '\n\n',  # separator between consecutive word entries
    'group': 3,  # number of regex groups extracted from each entry
    # Raw string so that \s and \S reach the regex engine untouched instead of
    # being treated as (invalid) string escape sequences.
    # word-set index 0 = vocab / 1 = Chinese definition / 2 = English definition
    'pattern': r'([^\t]+)\t([^\n]+)\n?([\s\S]*)',
    'voice': [1, 0, 1],  # voice index used for each word-set index
    'volume': 100,
    'spelling': 0,  # word-set index that is spelled out for the 's' step
    # Speaking order: 0 = vocab / 1 = Chinese def / 2 = English def / 's' = spelling
    'sequence': [0, 0, 's', 1, 2, 0, 0, 0, 1, 2, 0, 0, 0, 's', 1, 2, 0, 0],
    # Speech rate for each position of 'sequence' (same length as 'sequence')
    'speed': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
}
'''
if you want to check how many kinds of voices your system has:
---
speaker = win32com.client.Dispatch('SAPI.SpVoice')
voices = speaker.GetVoices()
print('there are ', voices.Count, ' kinds of voices in your system')
for i in range(voices.Count):
    print(i, ': ' + voices.Item(i).GetDescription())
---
BTW, my system has 2 kinds of voices
0 : Microsoft Hanhan Desktop - Chinese (Taiwan)
1 : Microsoft Zira Desktop - English (United States)
'''
output_filename = 'english_vocabulary_001.wav'

# Content to recite.
# Entry format: [Eng Vocab] \t [Chi Def] \n [remaining lines: Eng Def]
# Entries must be separated by a blank line, i.e. the '\n\n' delimiter that the
# text is split on; without it the whole text is parsed as a single entry.
text = '''insolent\t粗魯無禮的、傲慢的
adj. extremely rude, esp in expressing contempt

conspicuous\t顯著的、顯而易見的
adj. easily seen; noticeable; remarkable

distortion\t扭曲、曲解
n. distorting or being distorted
'''
# split vocab & def
def get_wordsets(text, config=config):
    """Split ``text`` into word entries and extract the groups of each entry.

    Args:
        text: Full vocabulary text; entries are separated by
            ``config['delimiter']``.
        config: Settings dict. Reads ``pattern``, ``delimiter`` and ``group``.
            The compiled pattern is stored back under ``config['pat_comp']``,
            so the passed dict is modified in place.

    Returns:
        A ``(word_sets, config)`` tuple. ``word_sets`` contains, for every
        entry that matched the pattern, a list with the first
        ``config['group']`` captured groups. Entries that do not match are
        reported on stdout and skipped.
    """
    pattern = re.compile(config['pattern'])
    config['pat_comp'] = pattern  # kept in the config that is returned to the caller

    words = text.split(config['delimiter'])  # one chunk per word entry
    print('Total: ', len(words), 'words')

    word_sets = []
    for w in words:
        m = pattern.match(w)
        if m is None:
            # The entry does not follow the expected format.
            print('no match: ' + w)
            continue
        try:
            # Regex groups are 1-based, word-set indices are 0-based.
            wset = [m.group(n + 1) for n in range(config['group'])]
        except IndexError:
            # config['group'] asks for more groups than the pattern defines.
            print('no match: ' + w)
            continue
        word_sets.append(wset)
    return word_sets, config
# text-to-speech
def create_wav(filename, word_sets, config=config):
    """Speak every word set through SAPI and write the audio to a WAV file.

    Args:
        filename: Path of the WAV file to create. If a file already exists at
            that path it is removed first.
        word_sets: Iterable of word sets; each one is indexed with the group
            indices found in ``config['sequence']`` and ``config['spelling']``.
        config: Settings dict. Reads ``volume``, ``voice`` (voice index per
            group), ``spelling`` (index of the group that gets spelled out),
            ``sequence`` (group indices, or ``'s'`` for the spelling, in
            speaking order) and ``speed`` (speech rate for each position of
            ``sequence``).
    """
    stream = win32com.client.Dispatch('SAPI.SpFileStream')  # output file stream
    speaker = win32com.client.Dispatch('SAPI.SpVoice')  # text-to-speech handle

    if os.path.isfile(filename):  # start from a clean file
        print('file existed: ', filename)
        os.remove(filename)

    stream.Open(filename, 3)  # create & write file / 3 = SSFMCreateForWrite
    try:
        speaker.AudioOutputStream = stream  # send speech to the file stream
        speaker.Volume = config['volume']
        voices = speaker.GetVoices()

        def speak_once(text, volume, rate, voice):
            """Speak ``text`` with the given volume, rate and voice index."""
            speaker.Voice = voices.Item(voice)
            speaker.Volume = volume
            speaker.Rate = rate
            speaker.Speak(text)
            speaker.WaitUntilDone(-1)  # -1 = infinite timeout limit

        # Print the available voices so the indices in config['voice'] can be checked.
        print('Voices: ', voices.Count)
        for i in range(voices.Count):
            print(i, ': ' + voices.Item(i).GetDescription())

        for w in word_sets:
            # Letters of the spelled group separated by ', ', e.g. "c, a, t".
            spell = ', '.join(w[config['spelling']])
            for idx, s in enumerate(config['sequence']):
                # The spelling step uses the voice of the group being spelled.
                voice = s if s != 's' else config['spelling']
                speak_word = spell if s == 's' else w[s]
                speak_word += ', '  # trailing comma, intended as a short halt
                speak_voice = config['voice'][voice]
                speak_rate = config['speed'][idx]
                speak_once(speak_word, config['volume'], speak_rate, speak_voice)
        speaker.WaitUntilDone(-1)
    finally:
        # Close the stream even if a SAPI call or a config lookup fails, so the
        # file handle is not left open.
        stream.Close()
# Parse the vocabulary text into word sets (the returned config is not needed
# here), then synthesize the whole recitation into the output WAV file.
word_sets, _ = get_wordsets(text, config)
create_wav(filename=output_filename, word_sets=word_sets, config=config)
# To convert the resulting WAV file to MP3,
# you can use ffmpeg with a command such as the following:
# ffmpeg -i "input.wav" -vn -ar 44100 -ac 2 -ab 192k -af "volume=1.5" -f mp3 "output.mp3"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment