Skip to content

Instantly share code, notes, and snippets.

@regstuff
Last active January 23, 2024 08:33
Show Gist options
  • Save regstuff/4c355550fcbd7f81fd9fcde206af8c39 to your computer and use it in GitHub Desktop.
Save regstuff/4c355550fcbd7f81fd9fcde206af8c39 to your computer and use it in GitHub Desktop.
Convert whisper.cpp colored terminal output into a color-coded HTML file, for transcript confidence scores
#!/bin/bash
# Use with a command such as: (./main -m models/ggml-medium.bin -p 1 -t 8 -pc -f input.wav | ./ansi2html.sh > whisper.html && ./match_colors.sh whisper.html)
# The opening and closing parentheses in the command are important.
# Assumes ansi2html.sh is in the same folder. ansi2html.sh is available here: https://github.com/pixelb/scripts/blob/master/scripts/ansi2html.sh & here: https://gist.github.com/regstuff/a9cb16df25c74d10608a6bff3a3df95d
# Will generate an output file called whisper.html, which can be parsed and used for the final transcript. All low-confidence text (i.e. text printed in red or orange) is converted to a span class named red, which can be used for further CSS styling downstream.
# This script assumes bc (Basic Calculator) is installed and available in your UNIX environment. It's a common calculator utility available on many Unix-like systems.
# Validate the command line: exactly one argument is required and it must name
# an existing regular file. Diagnostics are written to stderr so they are not
# mixed into the match report that the script prints on stdout.
if [ "$#" -ne 1 ]; then
  echo "Usage: $0 <input-html-file>" >&2
  exit 1
fi
# The first (and only) argument is the HTML file to process.
input_file=$1
# Bail out early when the file does not exist.
if [ ! -f "$input_file" ]; then
  echo "Error: File '$input_file' not found." >&2
  exit 1
fi
# Reference palette: the seven rainbow color names, each mapped to its
# six-digit RGB hex value (written without a leading '#').
declare -A rainbow_colors=(
  ["Red"]="FF0000"
  ["Orange"]="FFA500"
  ["Yellow"]="FFFF00"
  ["Green"]="008000"
  ["Blue"]="0000FF"
  ["Indigo"]="4B0082"
  ["Violet"]="EE82EE"
)
# Stylesheet text: select every line from the opening <style type="text/css">
# tag through </style>, then drop the first and last selected line (the tags).
css=$(
  awk '/<style type="text\/css">/,/<\/style>/' "$input_file" \
    | sed '1d;$d'
)
# Distinct class attribute values used anywhere in the HTML, one per line.
classes=$(
  grep -oE 'class="[^"]+"' "$input_file" \
    | awk -F'"' '{ print $2 }' \
    | sort -u
)
#######################################
# Convert a hex color to its decimal RGB components.
# Accepts RRGGBB or the RGB shorthand, upper or lower case, with or without a
# leading '#'. Characters after the first six digits are ignored.
# Globals:   R, G, B (written) - red, green and blue channel, 0-255
# Arguments: $1 - hex color string
# Returns:   0 on success; 1 (R, G, B left untouched) if $1 is not a hex color
#######################################
hex_to_rgb() {
  local hex_color="${1#\#}"

  if [[ "$hex_color" =~ ^[0-9A-Fa-f]{3}$ ]]; then
    # Expand the shorthand form: each digit is doubled (F80 -> FF8800).
    hex_color="${hex_color:0:1}${hex_color:0:1}${hex_color:1:1}${hex_color:1:1}${hex_color:2:1}${hex_color:2:1}"
  else
    hex_color=${hex_color:0:6}
  fi

  # Validate before the base-16 arithmetic below, which would otherwise abort
  # with a shell arithmetic error on non-hex input.
  if [[ ! "$hex_color" =~ ^[0-9A-Fa-f]{6}$ ]]; then
    printf 'hex_to_rgb: invalid hex color: %s\n' "$1" >&2
    return 1
  fi

  R=$((16#${hex_color:0:2}))
  G=$((16#${hex_color:2:2}))
  B=$((16#${hex_color:4:2}))
}
#######################################
# Euclidean distance between two points in RGB space, truncated to an integer
# (the same result bc gives at its default scale of 0).
# Plain non-negative integers are handled with shell arithmetic, so the common
# path spawns no external process; any other input is passed to bc.
# Arguments: $1 $2 $3 - first point (r g b)
#            $4 $5 $6 - second point (r g b)
# Outputs:   the distance on stdout
#######################################
get_distance() {
  # Canonical decimals of at most 9 digits: no leading zeros (which the shell
  # would read as octal) and small enough that the sum of squares fits in
  # 64-bit arithmetic.
  local -r small_uint='^(0|[1-9][0-9]{0,8})$'
  local arg
  for arg in "$1" "$2" "$3" "$4" "$5" "$6"; do
    if [[ ! "$arg" =~ $small_uint ]]; then
      echo "sqrt(($1 - $4)^2 + ($2 - $5)^2 + ($3 - $6)^2)" | bc
      return
    fi
  done

  local sum_sq=$(( ($1 - $4) ** 2 + ($2 - $5) ** 2 + ($3 - $6) ** 2 ))

  # Integer square root by Newton iteration: the estimate decreases until it
  # reaches floor(sqrt(sum_sq)).
  local root=$sum_sq
  local next=$(( (root + 1) / 2 ))
  while (( next < root )); do
    root=$next
    next=$(( (root + sum_sq / root) / 2 ))
  done
  echo "$root"
}
#######################################
# Find the rainbow_colors entry closest to a hex color in RGB space.
# Candidates are ranked by exact squared Euclidean distance using integer
# arithmetic. Ranking by a square root truncated to an integer can make two
# different distances look equal, leaving the winner to the associative
# array's iteration order. Exact ties go to the name that sorts first.
# Globals:   rainbow_colors (read); R, G, B (overwritten via hex_to_rgb)
# Arguments: $1 - hex color to classify
# Outputs:   name of the nearest palette color on stdout
# Returns:   1 if hex_to_rgb reports failure for $1
#######################################
get_nearest_rainbow_color() {
  local src_r src_g src_b
  local name dist_sq
  local best_sq=-1 # -1 means "no candidate seen yet"
  local nearest_color=""

  hex_to_rgb "$1" || return 1
  src_r=$R
  src_g=$G
  src_b=$B

  for name in "${!rainbow_colors[@]}"; do
    hex_to_rgb "${rainbow_colors[$name]}" || continue
    dist_sq=$(( (src_r - R) ** 2 + (src_g - G) ** 2 + (src_b - B) ** 2 ))
    if (( best_sq < 0 || dist_sq < best_sq )) ||
      { (( dist_sq == best_sq )) && [[ "$name" < "$nearest_color" ]]; }; then
      best_sq=$dist_sq
      nearest_color=$name
    fi
  done

  echo "$nearest_color"
}
#######################################
# Print the hex color (without '#') that the extracted stylesheet assigns to a
# class, or nothing when no rule for that class sets a color.
# The selector is searched for as a literal string and the color is read from
# that rule's own { ... } body. A regex lookup would treat the '.' as a
# wildcard, and a greedy ".*color:" would return the last color on the line
# rather than the one belonging to the class.
# Globals:   css (read)
# Arguments: $1 - class name, without the leading dot
# Outputs:   hex digits of the first "color:" value in the matching rule
#######################################
css_color_for_class() {
  awk -v sel=".$1" '
    {
      rest = $0
      while ((at = index(rest, sel)) > 0) {
        rest = substr(rest, at + length(sel))
        # Skip longer class names that merely begin with sel (.ef19 vs .ef196).
        if (rest !~ /^[ \t,{]/) continue
        lb = index(rest, "{")
        if (lb == 0) break
        body = substr(rest, lb + 1)
        rb = index(body, "}")
        if (rb > 0) body = substr(body, 1, rb - 1)
        if (match(body, /color:[ \t]*#[0-9A-Fa-f]+/)) {
          hex = substr(body, RSTART, RLENGTH)
          sub(/^[^#]*#/, "", hex)
          print hex
          exit
        }
      }
    }
  ' <<< "$css"
}

# Walk every distinct class attribute value once: look up its color, report
# the nearest rainbow color, and queue the matching rewrite of the HTML file.
# Only names made of these characters are interpolated into the sed script;
# anything else could be parsed as regex syntax or as the s/// delimiter.
safe_class_re='^[A-Za-z0-9_ -]+$'
sed_script=()
while IFS= read -r class; do
  [[ -n "$class" ]] || continue
  if [[ ! "$class" =~ $safe_class_re ]]; then
    printf 'Skipping class with unsupported characters: %s\n' "$class" >&2
    continue
  fi
  # Find the corresponding color in the extracted CSS
  color=$(css_color_for_class "$class")
  [[ -n "$color" ]] || continue
  # Convert color to uppercase for matching with defined values
  color=${color^^}
  # Find nearest rainbow color
  nearest_rainbow=$(get_nearest_rainbow_color "$color")
  # Output the class and its corresponding nearest rainbow color
  echo "$class matches nearest rainbow color $nearest_rainbow"
  if [[ "$nearest_rainbow" == "Red" || "$nearest_rainbow" == "Orange" ]]; then
    # Low-confidence colors: rename the class to "red"
    sed_script+=(-e "s/class=\"$class\"/class=\"red\"/g")
  else
    # Every other color: drop the class attribute altogether
    sed_script+=(-e "s/class=\"$class\"//g")
  fi
done <<< "$classes"

# Apply all queued substitutions in one pass instead of rewriting the file
# once per class.
if (( ${#sed_script[@]} > 0 )); then
  sed -i "${sed_script[@]}" "$input_file"
fi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment