@ttunguz
Created December 15, 2023 05:50
Dictation with Whisper.cpp
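
A push-to-talk dictation script: hold the left Option/Alt key to record from the default microphone, release it to have whisper.cpp transcribe the clip, and the transcript is typed into whatever window has focus. It assumes a built whisper.cpp checkout as the working directory (the ./main binary and the models/ggml-small.en-q5_1.bin model) plus the pynput, sounddevice, numpy, and scipy Python packages.
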
from pynput.keyboard import Key, Listener, Controller
import sounddevice as sd
import numpy as np
from scipy.io.wavfile import write
import subprocess
import os
import re
import datetime

# Set the sample rate to 16 kHz (what whisper.cpp expects)
sample_rate = 16000
recording = False
# Expand ~ so the path works when opening/writing files, and make sure the directory exists
filename_prefix = os.path.expanduser("~/Documents/transcripts/")
os.makedirs(filename_prefix, exist_ok=True)
keyboard = Controller()


def on_press(key):
    global recording, recording_audio
    if key == Key.alt_l and not recording:
        recording_audio = []  # Reset the buffer for a new recording
        recording = True
        print("Starting recording")


def on_release(key):
    global recording
    if key == Key.alt_l:
        recording = False
        print("Stopped recording")
        sd.wait()
        if recording_audio:
            # Build timestamped output paths (to the microsecond)
            now = datetime.datetime.now()
            date_time_str = now.strftime("%Y%m%d-%H%M%S-%f")
            filename = filename_prefix + "output_" + date_time_str + ".wav"
            # Path of the transcript whisper.cpp will write (-otxt appends .txt to the prefix)
            output_file = filename_prefix + "output_" + date_time_str + ".txt"
            output_file_main = filename_prefix + "output_" + date_time_str
            # Convert to a numpy array and scale to int16
            recording_np = np.concatenate(recording_audio, axis=0)
            recording_np = np.int16(recording_np * np.iinfo(np.int16).max)
            # Write to a wav file
            write(filename, sample_rate, recording_np)
            process_audio(filename, output_file, output_file_main)
        else:
            print("No audio recorded")


def recording_callback(indata, frames, time, status):
    if recording:
        recording_audio.append(indata.copy())


def process_audio(filename, output_file, output_file_main):
    print("Processing audio")
    # Run whisper.cpp: -m FNAME selects the model, --output-file sets the output path prefix
    # subprocess.run(["./main", "-otxt", "--output-file", output_file_main, "-m", "models/ggml-small.en-q5_1.bin", filename])
    subprocess.run(["./main", "-otxt", "--output-file", output_file_main, "-m", "models/ggml-small.en-q5_1.bin", "--split-on-word", filename])
    print(output_file)
    # Read the processed text from the file
    if os.path.exists(output_file):
        with open(output_file, 'r') as file:
            print("opening: " + output_file)
            processed_text = file.read().strip()
        # Remove timestamps from the processed text
        processed_text = re.sub(r'\[\d{2}:\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}:\d{2}\.\d{3}\]', '', processed_text)
        # Remove any text contained within square brackets
        processed_text = re.sub(r'\[.*?\]', '', processed_text)
        # Remove any text contained within parentheses
        processed_text = re.sub(r'\(.*?\)', '', processed_text)
        # Remove line breaks
        processed_text = processed_text.replace("\n", " ")
        # Type out the processed text
        keyboard.type(processed_text)
    # Clean up the transcript files if they exist
    if os.path.exists(output_file):
        os.remove(output_file)
    if os.path.exists(output_file_main):
        os.remove(output_file_main)
    # Clean up by removing the recorded audio file
    os.remove(filename)


if __name__ == "__main__":
    recording_audio = []
    with Listener(on_press=on_press, on_release=on_release) as listener:
        with sd.InputStream(samplerate=sample_rate, channels=1, callback=recording_callback):
            listener.join()
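
A note on running it (these are assumptions about the setup, not stated in the gist): start the script from the root of a built whisper.cpp checkout so that ./main and models/ggml-small.en-q5_1.bin resolve, and make sure ~/Documents/transcripts/ is writable. On macOS, the terminal running the script typically also needs Accessibility permission (so pynput can watch the Option key and type the transcript) and Microphone access (for sounddevice).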