Skip to content

Instantly share code, notes, and snippets.

Forked from gavrilov/
Created January 27, 2024 07:08
Show Gist options
  • Save CGAlei/df9d5d77d40b6bd250bc5a838357a844 to your computer and use it in GitHub Desktop.
Save CGAlei/df9d5d77d40b6bd250bc5a838357a844 to your computer and use it in GitHub Desktop.
Obsidian voice recognition with local Whisper model

Obsidian voice recognition with local Whisper model

Install plugin Whisper for Obsidian

plugin's settings:

create folders rec and rec_notes in Obsidian.

Create whisper-api-server folder for local server
create python venv

python -m venv venv

create a Python file in this folder containing the server code shown below
create folder tmp
put ffmpeg.exe in venv/Scripts/

activate venv and install:

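pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
(use the PyTorch index URL that matches your installed CUDA version)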
pip install uvicorn fastapi
pip install git+https://github.com/openai/whisper.git
pip install python-multipart

create file start_whisper_server.bat and start the server

You're awesome!

win11+gpu tested

from fastapi import FastAPI, Form, UploadFile, File, Request
from fastapi import HTTPException, status
from fastapi.middleware.cors import CORSMiddleware
import os
import shutil
import uvicorn
from functools import lru_cache
from pathlib import Path
from typing import Any, List, Union, Optional
from datetime import timedelta
import numpy as np
import whisper
# Application instance that the route decorators in this file attach to.
app = FastAPI()

# Origins allowed to call the API. The server is meant to be used locally by
# the Obsidian Whisper plugin, so every origin is accepted.
origins = ["*"]

# Register the CORS policy. Without this call both `origins` and the imported
# CORSMiddleware are unused and cross-origin clients are rejected.
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Example request (OpenAI-compatible):
# curl http://localhost:8000/v1/audio/transcriptions \
# -H "Authorization: Bearer $OPENAI_API_KEY" \
# -H "Content-Type: multipart/form-data" \
# -F model="whisper-1" \
# -F file="@/path/to/file/openai.mp3"
# "text": "Imagine the wildest idea that you've ever had, and you're curious about how it might scale to something that's a 100, a 1,000 times bigger..."
# -----
# copied from
# Whisper transcription functions
# ----------------
@lru_cache(maxsize=1)
def get_whisper_model(whisper_model: str):
    """Get a whisper model from the cache or load it if it is not cached.

    Args:
        whisper_model: model name handed to ``whisper.load_model``.

    Returns:
        The object returned by ``whisper.load_model``.
    """
    # Loading is the expensive step, so the result is memoised. Only the most
    # recently requested model is kept to bound memory use.
    return whisper.load_model(whisper_model)
def transcribe(audio_path: str, whisper_model: str, **whisper_args):
    """Transcribe the audio file using whisper.

    Args:
        audio_path: path of the audio file passed to the model.
        whisper_model: model name passed to ``get_whisper_model``.
        **whisper_args: keyword arguments forwarded to the model's
            ``transcribe`` method. ``temperature`` (default 0.0) and
            ``temperature_increment_on_fallback`` are consumed here and
            replaced by a single ``temperature`` sequence.

    Returns:
        Whatever the model's ``transcribe`` method returns.
    """
    # Get whisper model
    # NOTE: If multiple models are selected, this may keep all of them in memory depending on the cache size
    transcriber = get_whisper_model(whisper_model)

    # Expand the starting temperature into the fallback schedule
    # start, start + step, ... up to and including 1.0 (hence the epsilon).
    start_temperature = whisper_args.get("temperature", 0.0)
    step = whisper_args.pop("temperature_increment_on_fallback", None)
    if step is not None:
        whisper_args["temperature"] = tuple(
            np.arange(start_temperature, 1.0 + 1e-6, step)
        )
    else:
        # No fallback requested: a single temperature only.
        whisper_args["temperature"] = [start_temperature]

    return transcriber.transcribe(audio_path, **whisper_args)
# Default arguments used for every transcription request. The dict is unpacked
# into transcribe(), which consumes "whisper_model", "temperature" and
# "temperature_increment_on_fallback" and forwards the remaining keys to the
# model's transcribe call.
WHISPER_DEFAULT_SETTINGS = {
    # Alternative model name: "base"
    "whisper_model": "large-v2",
    "temperature": 0.0,
    "temperature_increment_on_fallback": 0.2,
    "no_speech_threshold": 0.6,
    "logprob_threshold": -1.0,
    "compression_ratio_threshold": 2.4,
    "condition_on_previous_text": True,
    # Set to True for progress output while transcribing.
    "verbose": False,
    # NOTE(review): Whisper's other task name is "translate" — confirm before switching.
    "task": "transcribe",
}

# Directory where uploaded audio is written before it is transcribed.
# NOTE(review): value assumed from the setup notes ("create folder tmp") — confirm.
UPLOAD_DIR = "tmp"
# -----


def _format_timestamp(seconds: float, decimal_marker: str = ".") -> str:
    """Render a position in seconds as ``HH:MM:SS<marker>mmm``."""
    # Whole milliseconds, truncated; hours are not wrapped at 24.
    total_ms = timedelta(milliseconds=seconds * 1000) // timedelta(milliseconds=1)
    hours, remainder = divmod(total_ms, 3600 * 1000)
    minutes, remainder = divmod(remainder, 60 * 1000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02}:{minutes:02}:{secs:02}{decimal_marker}{millis:03}"


def _bad_request(detail: str) -> HTTPException:
    """Build the 400 error raised for every rejected form field."""
    return HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=detail)


@app.post('/v1/audio/transcriptions')
async def transcriptions(model: str = Form(...),
                         file: UploadFile = File(...),
                         response_format: Optional[str] = Form(None),
                         prompt: Optional[str] = Form(None),
                         temperature: Optional[float] = Form(None),
                         language: Optional[str] = Form(None)):
    """Transcribe an uploaded audio file.

    The upload is saved under ``UPLOAD_DIR`` and transcribed with
    ``WHISPER_DEFAULT_SETTINGS``.

    Args:
        model: must be ``"whisper-1"``.
        file: the uploaded audio file.
        response_format: one of ``json`` (default), ``text``, ``srt``,
            ``verbose_json`` or ``vtt``.
        prompt: accepted but not forwarded to the transcription call.
        temperature: validated to lie in [0.0, 1.0] but not forwarded.
        language: accepted but not forwarded to the transcription call.

    Returns:
        A string for ``text``/``srt``/``vtt``, the full transcript dict for
        ``verbose_json``, otherwise ``{"text": ...}``.

    Raises:
        HTTPException: 400 when a form field fails validation.
    """
    # Explicit check instead of `assert`, which is stripped under `python -O`
    # and would otherwise surface as a 500.
    if model != "whisper-1":
        raise _bad_request("Bad Request, bad model")
    if file is None:
        raise _bad_request("Bad Request, bad file")

    if response_format is None:
        response_format = 'json'
    if response_format not in ('json', 'text', 'srt', 'verbose_json', 'vtt'):
        raise _bad_request("Bad Request, bad response_format")

    if temperature is None:
        temperature = 0.0
    if temperature < 0.0 or temperature > 1.0:
        raise _bad_request("Bad Request, bad temperature")

    # The filename is client-controlled: keep only its final component so the
    # upload cannot be written outside UPLOAD_DIR.
    filename = os.path.basename(file.filename or "")
    if filename in ("", ".", ".."):
        raise _bad_request("Bad Request, bad file")
    upload_name = os.path.join(UPLOAD_DIR, filename)

    # The context manager flushes and closes the file before Whisper reads it.
    with open(upload_name, 'wb') as upload_file:
        shutil.copyfileobj(file.file, upload_file)

    transcript = transcribe(audio_path=upload_name, **WHISPER_DEFAULT_SETTINGS)

    if response_format == 'text':
        return transcript['text']

    if response_format == 'srt':
        # SubRip separates seconds from milliseconds with a comma.
        cues = [
            '{}\n{} --> {}\n{}\n\n'.format(
                seg["id"],
                _format_timestamp(seg["start"], ","),
                _format_timestamp(seg["end"], ","),
                seg["text"],
            )
            for seg in transcript['segments']
        ]
        return "".join(cues) + '\n'

    if response_format == 'vtt':
        cues = [
            "{} --> {}\n{}\n\n".format(
                _format_timestamp(seg["start"]),
                _format_timestamp(seg["end"]),
                seg["text"],
            )
            for seg in transcript['segments']
        ]
        return "WEBVTT\n\n" + "".join(cues)

    if response_format == 'verbose_json':
        segments = transcript['segments']
        transcript.setdefault('task', WHISPER_DEFAULT_SETTINGS['task'])
        # With no segments there is no last end time to report.
        transcript.setdefault('duration', segments[-1]['end'] if segments else 0.0)
        if transcript['language'] == 'ja':
            transcript['language'] = 'japanese'
        return transcript

    return {'text': transcript['text']}
if __name__ == "__main__":
    # Serve the API on port 8000.
    # NOTE(review): the original host value is not recoverable from this
    # listing; loopback is used because the server is meant for local use.
    # Change it to "0.0.0.0" to accept connections from other machines.
    uvicorn.run(app, host="127.0.0.1", port=8000)
rem Launcher: echo commands, switch to this script's folder, activate the venv.
@echo on
rem %~dp0 expands to the drive and directory of this batch file.
cd %~dp0
call .\venv\Scripts\activate.bat
rem NOTE(review): the command that starts the Python server is not shown here; add it after activation.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment