Last active
October 17, 2024 11:08
-
-
Save willwade/93e709147d7ce9a5f80a3c3944f8e331 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import logging

import objc
from AVFoundation import (
    AVAudioEngine,
    AVAudioFile,
    AVAudioSession,
    AVSpeechSynthesisVoice,
    AVSpeechSynthesizer,
    AVSpeechUtterance,
)
from Foundation import NSURL, NSObject
class AVSpeechDriver(NSObject):
    """Text-to-speech driver built on ``AVSpeechSynthesizer``.

    Progress is reported to a proxy object through ``proxy.setBusy(bool)``
    and ``proxy.notify(topic, **kwargs)`` with the topics
    ``started-utterance``, ``started-word`` and ``finished-utterance``.

    Create instances with ``AVSpeechDriver.alloc().initWithProxy(proxy)``.
    All per-instance state is set up in ``initWithProxy`` because PyObjC
    does not run ``__init__`` for objects created through
    ``alloc().init...()``.

    Only one ``save_to_file`` request is tracked at a time (a single
    ``_audio_file`` slot is used).
    """

    @objc.python_method
    def initWithProxy(self, proxy):
        """Initialise the driver and attach it to *proxy*.

        Returns the initialised driver, or ``None`` when the Objective-C
        superclass initialiser fails.
        """
        self = objc.super(AVSpeechDriver, self).init()
        if self is None:
            return None
        self._proxy = proxy
        self._current_text = ""
        # Utterances handed to the synthesizer and not yet reported as
        # finished; also keeps their Python proxies alive.
        self._pending = []
        # Output file of the save_to_file request in flight, if any, and
        # the utterance it belongs to.
        self._audio_file = None
        self._file_utterance = None
        self._tts = AVSpeechSynthesizer.alloc().init()
        self._tts.setDelegate_(self)
        return self

    @objc.python_method
    def _make_utterance(self, text, ssml):
        """Build an utterance for *text*; return ``None`` if SSML fails.

        SSML utterances need ``speechUtteranceWithSSMLRepresentation:``,
        which is missing on older systems; that case is treated like
        invalid SSML.
        """
        if ssml:
            factory = getattr(
                AVSpeechUtterance, "speechUtteranceWithSSMLRepresentation_", None
            )
            utterance = factory(text) if factory is not None else None
            if utterance is None:
                return None
        else:
            utterance = AVSpeechUtterance.speechUtteranceWithString_(text)
        # Objective-C properties must be written through their setters;
        # plain attribute assignment does not reach the native object.
        utterance.setVoice_(AVSpeechSynthesisVoice.voiceWithLanguage_("en-US"))
        utterance.setRate_(0.5)
        return utterance

    @objc.python_method
    def _begin_utterance(self, text, ssml):
        """Mark the proxy busy, announce the utterance and create it.

        Returns the utterance, or ``None`` after reporting the failure and
        releasing the busy flag so the proxy is never left stuck.
        """
        self._proxy.setBusy(True)
        self._current_text = text
        self._proxy.notify("started-utterance")
        utterance = self._make_utterance(text, ssml)
        if utterance is None:
            logging.getLogger(__name__).error("Failed to create SSML utterance")
            self._proxy.notify("finished-utterance", completed=False)
            self._proxy.setBusy(False)
            return None
        self._pending.append(utterance)
        return utterance

    @objc.python_method
    def _finish(self, utterance, completed):
        """Report *utterance* as finished exactly once.

        Completion can be signalled from more than one place (delegate
        callbacks, end of the audio buffers, write errors); anything after
        the first signal for a given utterance is ignored.
        """
        try:
            self._pending.remove(utterance)
        except ValueError:
            return
        if self._file_utterance == utterance:
            # Dropping the last reference releases (and thereby closes)
            # the file before listeners are told the utterance is done.
            self._audio_file = None
            self._file_utterance = None
        self._proxy.notify("finished-utterance", completed=completed)
        self._proxy.setBusy(False)

    @objc.python_method
    def say(self, text, ssml=False):
        """Speak *text* aloud.

        When *ssml* is true, *text* is interpreted as SSML markup. If the
        utterance cannot be created the failure is logged and
        ``finished-utterance`` is sent with ``completed=False``.
        """
        utterance = self._begin_utterance(text, ssml)
        if utterance is not None:
            self._tts.speakUtterance_(utterance)

    @objc.python_method
    def save_to_file(self, text, filename, ssml=False):
        """Render *text* into the audio file *filename* instead of speaking.

        The synthesizer delivers the audio as PCM buffers which are
        appended to an ``AVAudioFile`` created from the first buffer's
        format. The container type is derived by AVAudioFile from the
        extension of *filename*.
        """
        utterance = self._begin_utterance(text, ssml)
        if utterance is None:
            return
        output_url = NSURL.fileURLWithPath_(filename)

        def on_buffer(buffer):
            self._write_buffer(utterance, output_url, buffer)

        # NOTE(review): the thread this callback runs on is not specified
        # here; confirm the proxy tolerates notifications from it.
        self._tts.writeUtterance_toBufferCallback_(utterance, on_buffer)

    @objc.python_method
    def _write_buffer(self, utterance, output_url, buffer):
        """Append one synthesized audio buffer to the output file."""
        if utterance not in self._pending:
            # Already finished, cancelled or failed: drop late buffers.
            return
        # An empty buffer is how the synthesizer marks the end of the audio.
        if buffer is None or buffer.frameLength() == 0:
            self._finish(utterance, True)
            return
        log = logging.getLogger(__name__)
        if self._audio_file is None:
            fmt = buffer.format()
            # Use the buffer's own sample format as the processing format
            # so writeFromBuffer:error: accepts the buffers unchanged.
            # Methods with an NSError** argument return (result, error).
            audio_file, error = (
                AVAudioFile.alloc()
                .initForWriting_settings_commonFormat_interleaved_error_(
                    output_url,
                    fmt.settings(),
                    fmt.commonFormat(),
                    fmt.isInterleaved(),
                    None,
                )
            )
            if audio_file is None:
                log.error(
                    "Failed to create audio file %s: %s", output_url.path(), error
                )
                self._finish(utterance, False)
                return
            self._audio_file = audio_file
            self._file_utterance = utterance
        ok, error = self._audio_file.writeFromBuffer_error_(buffer, None)
        if not ok:
            log.error("Failed to write audio to %s: %s", output_url.path(), error)
            self._finish(utterance, False)

    def stop(self):
        """Stop speaking (or rendering) immediately.

        Completion is reported through the cancel delegate callback.
        """
        if self._tts.isSpeaking() or self._pending:
            # 0 is AVSpeechBoundaryImmediate.
            self._tts.stopSpeakingAtBoundary_(0)

    def speechSynthesizer_didFinishSpeechUtterance_(self, synthesizer, utterance):
        """Delegate callback: *utterance* was spoken to the end."""
        self._finish(utterance, True)

    def speechSynthesizer_didCancelSpeechUtterance_(self, synthesizer, utterance):
        """Delegate callback: *utterance* was stopped before its end."""
        self._finish(utterance, False)

    def speechSynthesizer_didFinishSpeaking_(self, synthesizer, success):
        """Legacy completion entry point kept for existing callers.

        AVSpeechSynthesizer itself reports completion through the
        ``...didFinishSpeechUtterance_`` / ``...didCancelSpeechUtterance_``
        callbacks; this variant finishes the oldest pending utterance.
        """
        if self._pending:
            self._finish(self._pending[0], bool(success))

    def speechSynthesizer_willSpeakRangeOfSpeechString_utterance_(
        self, synthesizer, characterRange, utterance
    ):
        """Delegate callback: notify the proxy of the word about to be spoken.

        *characterRange* counts UTF-16 code units of the utterance's speech
        string, so it is converted before slicing; otherwise characters
        outside the Basic Multilingual Plane (e.g. emoji) shift every
        following word. ``location`` and ``length`` are reported as Python
        string offsets.
        """
        start, length = characterRange.location, characterRange.length
        spoken = utterance.speechString() if utterance is not None else None
        if spoken is None:
            spoken = self._current_text
        units = str(spoken).encode("utf-16-le", "surrogatepass")
        # Two bytes per UTF-16 code unit.
        prefix = units[: 2 * start].decode("utf-16-le", "surrogatepass")
        current_word = units[2 * start : 2 * (start + length)].decode(
            "utf-16-le", "surrogatepass"
        )
        self._proxy.notify(
            "started-word",
            name=current_word,
            location=len(prefix),
            length=len(current_word),
        )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment