# Install script dependencies (Livebook / Mix.install style):
# - req: HTTP client used to fetch the RSS feed and download audio
# - fast_rss: RSS XML parser
# - bumblebee + exla: Whisper speech-to-text model and XLA-backed compiler
# - kino: Livebook rendering widgets
Mix.install([
{:req, "~> 0.4.14"},
{:fast_rss, "~> 0.5.0"},
{:bumblebee, "~> 0.5.3"},
{:exla, "~> 0.7.1"},
{:kino, "~> 0.12.3"}
])
# Fetch the podcast feed and parse it into a map of RSS fields.
rss_feed_url = "https://feeds.fireside.fm/elixiroutlaws/rss"

%{body: rss_body} = Req.get!(rss_feed_url)
{:ok, rss_feed} = FastRSS.parse_rss(rss_body)

# Keep only the title and audio-enclosure URL for each episode.
episodes =
  for item <- rss_feed["items"] do
    %{title: item["title"], url: item["enclosure"]["url"]}
  end
# For demonstration, limit the number of episodes to download and process.
episode_limit = 2

# Downloaded audio files live under a scratch directory in the system tmp dir.
download_directory = Path.join(System.tmp_dir!(), "podcast-downloads")
File.mkdir_p!(download_directory)

episodes =
  episodes
  |> Enum.take(episode_limit)
  |> Enum.map(fn episode ->
    # Derive a local filename from the last segment of the episode URL's path.
    %URI{path: url_path} = URI.parse(episode.url)
    out_path = Path.join(download_directory, Path.basename(url_path))

    # Stream the response body straight to disk instead of buffering it.
    Req.get!(url: episode.url, into: File.stream!(out_path))
    Map.put(episode, :local_path, out_path)
  end)
# Download and initialize Whisper model
# Note that other models may have higher accuracy at a cost of slower runtime
{:ok, whisper} = Bumblebee.load_model({:hf, "openai/whisper-tiny"})
{:ok, featurizer} = Bumblebee.load_featurizer({:hf, "openai/whisper-tiny"})
{:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "openai/whisper-tiny"})
{:ok, generation_config} = Bumblebee.load_generation_config({:hf, "openai/whisper-tiny"})

# Build an Nx serving for speech-to-text, compiled via EXLA.
# chunk_num_seconds splits long audio into windows; timestamps: :segments
# makes each output chunk carry start/end timestamps, which the transcript
# rendering below relies on (start_timestamp_seconds / end_timestamp_seconds).
serving =
Bumblebee.Audio.speech_to_text_whisper(whisper, featurizer, tokenizer, generation_config,
defn_options: [compiler: EXLA],
chunk_num_seconds: 30,
timestamps: :segments
)
# Ensure Homebrew's bin directory is on PATH so ffmpeg (used for audio
# decoding on macOS) can be found. Use a default of "" because
# System.get_env/1 returns nil when PATH is unset, which would crash
# String.contains?/2 with a FunctionClauseError.
os_path = System.get_env("PATH", "")
homebrew_bin_path = "/opt/homebrew/bin"

if :os.type() == {:unix, :darwin} and not String.contains?(os_path, homebrew_bin_path) do
  System.put_env("PATH", os_path <> ":" <> homebrew_bin_path)
end
# Run Whisper over each downloaded file, recording wall-clock inference time.
episodes =
  Enum.map(episodes, fn episode ->
    started_at = DateTime.utc_now()
    %{chunks: chunks} = Nx.Serving.run(serving, {:file, episode.local_path})
    finished_at = DateTime.utc_now()

    episode
    |> Map.put(:transcription, chunks)
    |> Map.put(:transcription_processing_seconds, DateTime.diff(finished_at, started_at))
  end)
# Returns how many seconds of audio were transcribed per second of processing
# time (e.g. 2.0 means twice realtime). Audio length is taken as the largest
# end timestamp across all transcript chunks.
calculate_transcription_speed_ratio = fn episode ->
  audio_length =
    episode.transcription
    |> Enum.map(& &1.end_timestamp_seconds)
    |> Enum.max()

  # DateTime.diff/2 truncates to whole seconds, so a very fast run can report
  # 0 and raise ArithmeticError on division; clamp to at least 1 second.
  # (Removed a leftover debug IO.inspect of the full episode map.)
  audio_length / max(episode.transcription_processing_seconds, 1)
end
# Render one transcript chunk as a Markdown list item: "- <start>: <text>".
chunk_to_markdown = fn %{start_timestamp_seconds: started_at, text: text} ->
  "- #{started_at}: #{text}"
end
# Render a full episode (title, transcription speed, transcript) as Markdown.
# Uses Enum.map_join/3 instead of map |> join (avoids an intermediate list),
# and passes chunk_to_markdown directly rather than re-wrapping it in a capture.
episode_to_markdown = fn episode ->
  speed_ratio = Float.round(calculate_transcription_speed_ratio.(episode), 2)

  """
  # #{episode.title}
  Transcribed by Whisper at #{speed_ratio}x speed.
  ## Transcript
  #{Enum.map_join(episode.transcription, "\n", chunk_to_markdown)}
  """
end
# Display the first transcribed episode as rendered Markdown in Livebook.
Kino.Markdown.new(episode_to_markdown.(Enum.at(episodes, 0)))
# Question (kept as a comment so the script still parses): does using XLA
# require more ahead-of-time compilation of the model than TensorFlow would,
# and is that why it takes more time and disk space?