Skip to content

Instantly share code, notes, and snippets.

@DeedleFake
Last active May 30, 2024 14:43
Show Gist options
  • Save DeedleFake/23b27bf3b90b46567a02da99b0fdf666 to your computer and use it in GitHub Desktop.
Save DeedleFake/23b27bf3b90b46567a02da99b0fdf666 to your computer and use it in GitHub Desktop.
Whisper Audio Transcription Livebook

Audio Transcription

Run in Livebook

A simple Livebook for doing audio-to-text transcription using OpenAI's Whisper model.

Audio Transcription

# Install the notebook's dependencies (Kino for the UI widgets, Bumblebee for
# the Whisper model, EXLA as the Nx compiler/backend) and, through the
# `config:` option, make `EXLA.Backend` the default Nx backend.
Mix.install(
  [
    {:kino, "~> 0.12.3"},
    {:bumblebee, "~> 0.5.3"},
    {:exla, "~> 0.7.2"}
  ],
  config: [nx: [default_backend: EXLA.Backend]]
)

Section

# Show a temporary "Loading model..." message while the model is loaded and
# the serving is started; the frame is cleared again at the end of this cell.
progress = Kino.Frame.new(placeholder: false)
Kino.Frame.render(progress, Kino.Text.new("Loading model..."))
Kino.render(progress)

# Load every Whisper component from the same `openai/whisper-base` repository.
# Each match is assertive: a failed load raises a MatchError and aborts the cell.
repo = {:hf, "openai/whisper-base"}
{:ok, whisper} = Bumblebee.load_model(repo)
{:ok, featurizer} = Bumblebee.load_featurizer(repo)
{:ok, tokenizer} = Bumblebee.load_tokenizer(repo)
{:ok, generation_config} = Bumblebee.load_generation_config(repo)

# Build the speech-to-text serving. The options are passed straight through to
# Bumblebee (EXLA as the defn compiler, 30 second chunks, segment timestamps,
# batch size 16); see the Bumblebee documentation for their exact semantics.
serving =
  Bumblebee.Audio.speech_to_text_whisper(whisper, featurizer, tokenizer, generation_config,
    defn_options: [compiler: EXLA],
    chunk_num_seconds: 30,
    timestamps: :segments,
    compile: [batch_size: 16]
  )

# Start the serving as a named process; the `Transcribe` module reaches it
# under the name `Serving` via `Nx.Serving.batched_run/2`.
Kino.start_child!({Nx.Serving, serving: serving, name: Serving})

# Loading is done: remove the progress message.
Kino.Frame.clear(progress)

# Render no output for this cell.
Kino.nothing()
# A single-field form: one audio input that uses the featurizer's sampling
# rate, plus a "Transcribe" submit button.
form =
  [audio: Kino.Input.audio("Audio", sampling_rate: featurizer.sampling_rate)]
  |> Kino.Control.form(submit: "Transcribe")

# The form is rendered inside a frame rather than directly, so that the frame's
# content can be replaced later (the `Transcribe` module swaps in a busy
# message while a transcription is running).
input = tap(Kino.Frame.new(), &Kino.Frame.render(&1, form))

input
defmodule Transcribe do
  @moduledoc """
  GenServer that reacts to submissions of the audio form.

  On every submission the recorded audio is read from disk, down-mixed to a
  single channel, sent to the `Serving` speech-to-text serving and the
  resulting chunks are rendered into an output frame as a CSV download button
  followed by an HTML table.

  While a transcription is running, the `:input` frame shows a busy message
  instead of the form; the form is restored afterwards, even if the
  transcription raised.

  ## Options

    * `:form` - the `Kino.Control.form/2` control to subscribe to. Its data is
      expected to contain an `:audio` field.
    * `:input` - the `Kino.Frame` in which the form is displayed.
  """

  use GenServer

  require EEx

  @input_busy Kino.Markdown.new("## Transcribing...")
  @output_header Kino.Markdown.new("## Transcript")
  @output_no_input Kino.Markdown.new("## No input given.")

  # The table template is compiled into `render_table/1` once, at compile time,
  # instead of being kept as quoted code and evaluated on every submission.
  # Transcript text is HTML-escaped because it is model output and may contain
  # characters such as `&` or `<` that would otherwise be treated as markup.
  EEx.function_from_string(
    :defp,
    :render_table,
    """
    <table>
      <tr>
        <th>Seconds</th>
        <th>Text</th>
      </tr>
      <%= for {time, text} <- chunks do %>
        <tr>
          <td><%= time %></td>
          <td><%= html_escape(text) %></td>
        </tr>
      <% end %>
    </table>
    """,
    [:chunks]
  )

  @doc """
  Starts the transcription server, registered under the module name.

  `opts` must be a keyword list containing only the `:form` and `:input` keys
  (see the module documentation); any other key raises in `init/1`.
  """
  @spec start_link(keyword()) :: GenServer.on_start()
  def start_link(opts) do
    GenServer.start_link(__MODULE__, opts, name: __MODULE__)
  end

  @impl true
  def init(opts) do
    state = opts |> Keyword.validate!([:form, :input]) |> Map.new()
    # The subscription and the output frame are set up in handle_continue/2.
    {:ok, state, {:continue, :init}}
  end

  @impl true
  def handle_continue(:init, state) do
    # Form events arrive as `{:transcribe, event}` messages in handle_info/2.
    Kino.Control.subscribe(state.form, :transcribe)

    output = Kino.Frame.new()
    Kino.render(output)

    {:noreply, Map.put(state, :output, output)}
  end

  @impl true
  def handle_info({:transcribe, %{type: :submit, data: %{audio: nil}}}, state) do
    # The form was submitted without any audio attached.
    Kino.Frame.render(state.output, @output_no_input)
    {:noreply, state}
  end

  def handle_info({:transcribe, %{type: :submit, data: %{audio: audio}}}, state) do
    try do
      Kino.Frame.render(state.input, @input_busy)
      Kino.Frame.render(state.output, @output_header)

      chunks = audio |> audio_to_tensor() |> transcribe()

      Kino.Frame.append(
        state.output,
        Kino.Download.new(fn -> chunks_to_csv(chunks) end,
          filename: "transcript.csv",
          label: "Download as CSV"
        )
      )

      Kino.Frame.append(state.output, Kino.HTML.new(render_table(chunks)))

      {:noreply, state}
    after
      # Always put the form back, even when the transcription raised.
      Kino.Frame.render(state.input, state.form)
    end
  end

  def handle_info(_message, state) do
    # Ignore anything that is not a form submission rather than crashing the
    # server with a FunctionClauseError.
    {:noreply, state}
  end

  # Reads the raw 32-bit float PCM samples of the submitted audio and averages
  # the channels into a single one-dimensional tensor. Any other audio format
  # is rejected by the pattern in the function head.
  defp audio_to_tensor(%{format: :pcm_f32} = audio) do
    audio.file_ref
    |> Kino.Input.file_path()
    |> File.read!()
    |> Nx.from_binary(:f32)
    |> Nx.reshape({:auto, audio.num_channels})
    |> Nx.mean(axes: [1])
  end

  # Runs the tensor through the `Serving` process and returns the transcript as
  # a list of `{time, text}` tuples.
  defp transcribe(tensor) do
    Serving
    |> Nx.Serving.batched_run(tensor)
    |> Map.fetch!(:chunks)
    |> Enum.map(&clean_chunk/1)
  end

  # Reduces a chunk to `{time, text}`, where `time` is the chunk's start offset
  # rounded to whole seconds and represented as a `Time`, so that it prints as
  # HH:MM:SS.
  defp clean_chunk(%{text: text, start_timestamp_seconds: time}) do
    time =
      time
      |> round()
      |> Time.from_seconds_after_midnight()

    {time, text}
  end

  # Renders the chunks as CSV, one `time,"text"` row per chunk, without a
  # header row.
  defp chunks_to_csv(chunks) do
    Enum.map_join(chunks, "\n", fn {time, text} ->
      "#{time},#{csv_escape(text)}"
    end)
  end

  # Wraps a CSV field in double quotes, doubling any embedded double quote.
  defp csv_escape(text) do
    text = String.replace(text, "\"", "\"\"")
    "\"#{text}\""
  end

  # Escapes the characters that are significant in HTML. `&` must be replaced
  # first so that the entities produced by the later steps are not re-escaped.
  defp html_escape(text) do
    text
    |> String.replace("&", "&amp;")
    |> String.replace("<", "&lt;")
    |> String.replace(">", "&gt;")
    |> String.replace("\"", "&quot;")
    |> String.replace("'", "&#39;")
  end
end

# Start the transcription server, handing it the form to subscribe to and the
# frame that the form is displayed in.
Kino.start_child!({Transcribe, [form: form, input: input]})

# Render no output for this cell.
Kino.nothing()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment