A simple Livebook for doing audio-to-text transcription using OpenAI's Whisper model.
Last active
May 30, 2024 14:43
-
-
Save DeedleFake/23b27bf3b90b46567a02da99b0fdf666 to your computer and use it in GitHub Desktop.
Whisper Audio Transcription Livebook
# Notebook setup: install Kino, Bumblebee and EXLA at the pinned versions and
# configure Nx to use EXLA.Backend as its default backend.
Mix.install(
[
{:kino, "~> 0.12.3"},
{:bumblebee, "~> 0.5.3"},
{:exla, "~> 0.7.2"}
],
# Application config applied at install time: Nx's default backend is EXLA.
config: [nx: [default_backend: EXLA.Backend]]
)
# Show a temporary "Loading model..." notice while the Whisper artifacts are
# loaded and the serving process is started. `Kino.render/1` returns its
# argument, so the frame itself ends up bound to `progress`.
progress =
  [placeholder: false]
  |> Kino.Frame.new()
  |> tap(&Kino.Frame.render(&1, Kino.Text.new("Loading model...")))
  |> Kino.render()

# Model, featurizer, tokenizer and generation config all come from the same
# Hugging Face repository.
repo = {:hf, "openai/whisper-base"}

{:ok, whisper} = Bumblebee.load_model(repo)
{:ok, featurizer} = Bumblebee.load_featurizer(repo)
{:ok, tokenizer} = Bumblebee.load_tokenizer(repo)
{:ok, generation_config} = Bumblebee.load_generation_config(repo)

# Speech-to-text serving: audio is processed in 30-second chunks and each
# returned segment carries a timestamp.
serving =
  Bumblebee.Audio.speech_to_text_whisper(
    whisper,
    featurizer,
    tokenizer,
    generation_config,
    compile: [batch_size: 16],
    defn_options: [compiler: EXLA],
    chunk_num_seconds: 30,
    timestamps: :segments
  )

# Run the serving as a named process so other cells can reach it as `Serving`.
Kino.start_child!({Nx.Serving, [name: Serving, serving: serving]})

# Loading is finished: remove the notice and suppress this cell's own output.
Kino.Frame.clear(progress)
Kino.nothing()
# A one-field form: an audio input sampled at the rate the featurizer expects,
# submitted with a "Transcribe" button.
form =
  Kino.Control.form(
    [audio: Kino.Input.audio("Audio", sampling_rate: featurizer.sampling_rate)],
    submit: "Transcribe"
  )

# The form is shown inside a frame so its contents can be replaced later
# (the Transcribe server swaps in a busy notice while it is working).
input = Kino.Frame.new() |> tap(&Kino.Frame.render(&1, form))

# Make the frame this cell's output.
input
defmodule Transcribe do
  @moduledoc """
  GenServer that turns submissions of the audio form into transcripts.

  On start it subscribes to the given form and renders an output frame. For
  every submission it replaces the form with a busy notice, runs the audio
  through the `Serving` process, and appends a CSV download button plus an
  HTML table of `{time, text}` rows to the output frame. The form is put back
  into the input frame once the submission has been handled, even if
  transcription raised.
  """

  use GenServer

  require EEx
  require Logger

  @input_busy Kino.Markdown.new("## Transcribing...")
  @output_header Kino.Markdown.new("## Transcript")
  @output_no_input Kino.Markdown.new("## No input given.")

  # Characters that must not reach the HTML table verbatim.
  @html_escapes %{
    "&" => "&amp;",
    "<" => "&lt;",
    ">" => "&gt;",
    "\"" => "&quot;",
    "'" => "&#39;"
  }

  @table_template """
  <table>
    <tr>
      <th>Seconds</th>
      <th>Text</th>
    </tr>
    <%= for {time, text} <- chunks do %>
      <tr>
        <td><%= time %></td>
        <td><%= html_escape(text) %></td>
      </tr>
    <% end %>
  </table>
  """

  # Compile the template once into `render_table/1` instead of evaluating a
  # quoted expression with `Code.eval_quoted/2` on every submission.
  EEx.function_from_string(:defp, :render_table, @table_template, [:chunks])

  @doc """
  Starts the server, registered under the module name.

  `opts` is a keyword list that may only contain:

    * `:form`  - the `Kino.Control` form whose submissions are transcribed
    * `:input` - the `Kino.Frame` in which that form is displayed

  Any other key raises `ArgumentError` (via `Keyword.validate!/2`) in `init/1`.
  """
  def start_link(opts) do
    GenServer.start_link(__MODULE__, opts, name: __MODULE__)
  end

  @impl true
  def init(opts) do
    state = opts |> Keyword.validate!([:form, :input]) |> Map.new()
    {:ok, state, {:continue, :init}}
  end

  @impl true
  def handle_continue(:init, state) do
    # Form events arrive in handle_info/2 tagged with :transcribe.
    Kino.Control.subscribe(state.form, :transcribe)

    state = Map.put(state, :output, Kino.Frame.new())
    Kino.render(state.output)

    {:noreply, state}
  end

  @impl true
  def handle_info({:transcribe, %{type: :submit, data: %{audio: nil}}}, state) do
    # Submitted without any audio attached.
    Kino.Frame.render(state.output, @output_no_input)
    {:noreply, state}
  end

  def handle_info({:transcribe, %{type: :submit, data: data}}, state) do
    Kino.Frame.render(state.input, @input_busy)
    Kino.Frame.render(state.output, @output_header)

    # Only raw 32-bit float PCM is supported; anything else is a bug, so crash.
    %{format: :pcm_f32} = audio = data.audio

    chunks =
      Serving
      |> Nx.Serving.batched_run(audio_to_mono_tensor(audio))
      |> Map.fetch!(:chunks)
      |> Enum.map(&clean_chunk/1)

    Kino.Frame.append(
      state.output,
      Kino.Download.new(fn -> chunks_to_csv(chunks) end,
        filename: "transcript.csv",
        label: "Download as CSV"
      )
    )

    Kino.Frame.append(state.output, Kino.HTML.new(render_table(chunks)))

    {:noreply, state}
  after
    # Always put the form back, whether transcription succeeded or raised.
    Kino.Frame.render(state.input, state.form)
  end

  # Defining handle_info/2 replaces the default clause from `use GenServer`,
  # so without this any unrelated message would crash the server.
  def handle_info(message, state) do
    Logger.warning("Transcribe ignoring unexpected message: #{inspect(message)}")
    {:noreply, state}
  end

  # Reads the uploaded samples and averages the channels into a rank-1 tensor.
  defp audio_to_mono_tensor(audio) do
    audio.file_ref
    |> Kino.Input.file_path()
    |> File.read!()
    |> Nx.from_binary(:f32)
    |> Nx.reshape({:auto, audio.num_channels})
    |> Nx.mean(axes: [1])
  end

  # Reduces a serving chunk to `{time, text}`, where `time` is the chunk's
  # start offset rounded to whole seconds and expressed as a `Time`.
  defp clean_chunk(%{text: text, start_timestamp_seconds: time}) do
    time =
      time
      |> round()
      |> Time.from_seconds_after_midnight()

    {time, text}
  end

  # One `time,"text"` line per chunk, newline separated, no header row.
  defp chunks_to_csv(chunks) do
    Enum.map_join(chunks, "\n", fn {time, text} ->
      "#{time},#{csv_escape(text)}"
    end)
  end

  # Quotes a CSV field, doubling any embedded double quotes.
  defp csv_escape(text) do
    text = String.replace(text, "\"", "\"\"")
    "\"#{text}\""
  end

  # Escapes HTML-significant characters in a single pass so transcript text
  # cannot break or inject markup in the rendered table.
  defp html_escape(text) do
    String.replace(text, Map.keys(@html_escapes), &Map.fetch!(@html_escapes, &1))
  end
end
# Start the Transcribe server, handing it the form to subscribe to and the
# frame that displays it.
Kino.start_child!({Transcribe, [input: input, form: form]})

# This cell has no output of its own.
Kino.nothing()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment