@masadchattha
Created March 7, 2023 19:36
Transcribing Live Audio Speech to Text in SwiftUI.
/*
Transcribing speech to text
Captures and logs meeting transcripts.
You’ll request access to device hardware like the microphone and
integrate the Speech framework to transcribe live audio to text.
*/
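// Note: the app's Info.plist must include usage descriptions for speech
// recognition and the microphone (NSSpeechRecognitionUsageDescription and
// NSMicrophoneUsageDescription) before iOS will present the permission
// prompts that the code below relies on.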
import AVFoundation
import Foundation
import Speech
import SwiftUI
/// A helper for transcribing speech to text using SFSpeechRecognizer and AVAudioEngine.
class SpeechRecognizer: ObservableObject {
    enum RecognizerError: Error {
        case nilRecognizer
        case notAuthorizedToRecognize
        case notPermittedToRecord
        case recognizerIsUnavailable

        var message: String {
            switch self {
            case .nilRecognizer: return "Can't initialize speech recognizer"
            case .notAuthorizedToRecognize: return "Not authorized to recognize speech"
            case .notPermittedToRecord: return "Not permitted to record audio"
            case .recognizerIsUnavailable: return "Recognizer is unavailable"
            }
        }
    }

    @Published var transcript: String = ""

    private var audioEngine: AVAudioEngine?
    private var request: SFSpeechAudioBufferRecognitionRequest?
    private var task: SFSpeechRecognitionTask?
    private let recognizer: SFSpeechRecognizer?
    /**
     Initializes a new speech recognizer. If this is the first time you've used the class, it
     requests access to the speech recognizer and the microphone.
     */
    init() {
        // SFSpeechRecognizer() uses the device's current locale and is nil
        // when that locale isn't supported for recognition.
        recognizer = SFSpeechRecognizer()

        Task(priority: .background) {
            do {
                guard recognizer != nil else {
                    throw RecognizerError.nilRecognizer
                }
                guard await SFSpeechRecognizer.hasAuthorizationToRecognize() else {
                    throw RecognizerError.notAuthorizedToRecognize
                }
                guard await AVAudioSession.sharedInstance().hasPermissionToRecord() else {
                    throw RecognizerError.notPermittedToRecord
                }
            } catch {
                speakError(error)
            }
        }
    }
    // An earlier, synchronous version of init(), kept for reference. It doesn't
    // work: the permission callbacks fire asynchronously, so these guards would
    // read the results before the user has responded (hence the async/await
    // extensions at the bottom of the file).
    /*
    init() {
        recognizer = SFSpeechRecognizer()
        do {
            guard recognizer != nil else {
                throw RecognizerError.nilRecognizer
            }
            guard SFSpeechRecognizer.hasAuthorizationToRecognize() else {
                throw RecognizerError.notAuthorizedToRecognize
            }
            guard AVAudioSession.sharedInstance().hasPermissionToRecord() else {
                throw RecognizerError.notPermittedToRecord
            }
        } catch {
            speakError(error)
        }
    }
    */

    deinit {
        reset()
    }
    /**
     Begin transcribing audio.

     Creates an `SFSpeechRecognitionTask` that transcribes speech to text until you call `stopTranscribing()`.
     The resulting transcription is continuously written to the published `transcript` property.
     */
    func transcribe() {
        DispatchQueue(label: "Speech Recognizer Queue", qos: .background).async { [weak self] in
            guard let self = self, let recognizer = self.recognizer, recognizer.isAvailable else {
                self?.speakError(RecognizerError.recognizerIsUnavailable)
                return
            }

            do {
                let (audioEngine, request) = try Self.prepareEngine()
                self.audioEngine = audioEngine
                self.request = request

                self.task = recognizer.recognitionTask(with: request) { result, error in
                    let receivedFinalResult = result?.isFinal ?? false
                    let receivedError = error != nil
                    // Stop capturing audio once the recognizer finishes or fails.
                    if receivedFinalResult || receivedError {
                        audioEngine.stop()
                        audioEngine.inputNode.removeTap(onBus: 0)
                    }
                    if let result = result {
                        self.speak(result.bestTranscription.formattedString)
                    }
                }
            } catch {
                self.reset()
                self.speakError(error)
            }
        }
    }
    /// Stop transcribing audio.
    func stopTranscribing() {
        reset()
    }

    /// Reset the speech recognizer, tearing down the audio engine and recognition task.
    func reset() {
        task?.cancel()
        audioEngine?.stop()
        audioEngine = nil
        request = nil
        task = nil
    }
    private static func prepareEngine() throws -> (AVAudioEngine, SFSpeechAudioBufferRecognitionRequest) {
        let audioEngine = AVAudioEngine()

        let request = SFSpeechAudioBufferRecognitionRequest()
        request.shouldReportPartialResults = true

        // Configure the shared audio session for recording.
        let audioSession = AVAudioSession.sharedInstance()
        try audioSession.setCategory(.record, mode: .measurement, options: .duckOthers)
        try audioSession.setActive(true, options: .notifyOthersOnDeactivation)
        let inputNode = audioEngine.inputNode

        // Feed microphone buffers into the recognition request as they arrive.
        let recordingFormat = inputNode.outputFormat(forBus: 0)
        inputNode.installTap(onBus: 0, bufferSize: 1024, format: recordingFormat) { (buffer: AVAudioPCMBuffer, when: AVAudioTime) in
            request.append(buffer)
        }
        audioEngine.prepare()
        try audioEngine.start()

        return (audioEngine, request)
    }
    /// Publishes the latest transcription. Dispatches to the main queue because
    /// `transcript` is a published property observed by the UI.
    private func speak(_ message: String) {
        DispatchQueue.main.async {
            self.transcript = message
        }
    }

    private func speakError(_ error: Error) {
        var errorMessage = ""
        if let error = error as? RecognizerError {
            errorMessage += error.message
        } else {
            errorMessage += error.localizedDescription
        }
        DispatchQueue.main.async {
            self.transcript = "<< \(errorMessage) >>"
        }
    }
}
/*
// Synchronous versions of the permission helpers, kept for reference. These
// don't work: the completion handlers run after the functions have already
// returned, so the flags are read before they're ever set.
extension SFSpeechRecognizer {
    static func hasAuthorizationToRecognize() -> Bool {
        var isAuthorized = false
        requestAuthorization { status in
            isAuthorized = status == .authorized
        }
        return isAuthorized
    }
}

extension AVAudioSession {
    func hasPermissionToRecord() -> Bool {
        var isPermitted = false
        requestRecordPermission { authorized in
            isPermitted = authorized
        }
        return isPermitted
    }
}
*/
/// Bridges the callback-based speech-authorization API into async/await.
extension SFSpeechRecognizer {
    static func hasAuthorizationToRecognize() async -> Bool {
        await withCheckedContinuation { continuation in
            requestAuthorization { status in
                continuation.resume(returning: status == .authorized)
            }
        }
    }
}

/// Bridges the callback-based record-permission API into async/await.
extension AVAudioSession {
    func hasPermissionToRecord() async -> Bool {
        await withCheckedContinuation { continuation in
            requestRecordPermission { authorized in
                continuation.resume(returning: authorized)
            }
        }
    }
}
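
// A usage sketch (startIfAuthorized is an illustrative helper, not part of the
// original gist): checking both permissions up front with the async helpers
// above before starting a transcription session.
func startIfAuthorized(_ recognizer: SpeechRecognizer) {
    Task {
        let canRecognize = await SFSpeechRecognizer.hasAuthorizationToRecognize()
        let canRecord = await AVAudioSession.sharedInstance().hasPermissionToRecord()
        if canRecognize && canRecord {
            recognizer.transcribe()
        }
    }
}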
// MARK: - Example of calling the code above for speech recognition
// (Wrapped in a View struct so the snippet compiles; the MeetingView name
// and empty ZStack are placeholders.)

struct MeetingView: View {
    @StateObject var speechRecognizer = SpeechRecognizer()

    var body: some View {
        ZStack {
            // Your view content here.
        }
        .onAppear {
            // Start transcribing speech.
            speechRecognizer.reset()
            speechRecognizer.transcribe()
        }
        .onDisappear {
            // Stop transcribing.
            speechRecognizer.stopTranscribing()
        }
    }
}
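
A minimal sketch of rendering the live transcript (the TranscriptView name and layout are illustrative additions, not part of the original gist):

// MARK: - Displaying the live transcript (illustrative)

struct TranscriptView: View {
    @StateObject private var speechRecognizer = SpeechRecognizer()

    var body: some View {
        ScrollView {
            // `transcript` is @Published, so this Text updates as partial
            // results stream in from the recognizer.
            Text(speechRecognizer.transcript)
                .padding()
        }
        .onAppear {
            speechRecognizer.reset()
            speechRecognizer.transcribe()
        }
        .onDisappear {
            speechRecognizer.stopTranscribing()
        }
    }
}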