Skip to content

Instantly share code, notes, and snippets.

@Thomvis

Thomvis/atdv.sh Secret

Created November 17, 2023 22:43
Show Gist options
  • Save Thomvis/474e16941959ece4d9d579c3f6e8c706 to your computer and use it in GitHub Desktop.
Save Thomvis/474e16941959ece4d9d579c3f6e8c706 to your computer and use it in GitHub Desktop.
Automated translation & dubbing of speech in video using AI
#!/bin/bash
# Automated translation & dubbing of speech in a video.
#
# Usage: atdv.sh <input-video>
#
# Stage 1: extract the audio track of the input video to a WAV file and
# transcribe it with whisper.cpp. All intermediate files are written to the
# current working directory and are named after the input video.
# set -x

# Fail early with a usage hint instead of letting ffmpeg choke on an empty path.
if [ -z "${1:-}" ]; then
  echo "Usage: $0 <input-video>" >&2
  exit 2
fi

# The input video file
INPUT_VIDEO="$1"

# Extract the filename without the directory and without the extension
FILENAME=$(basename -- "$INPUT_VIDEO")
FILENAME_WITHOUT_EXT="${FILENAME%.*}"

# The output audio file
OUTPUT_AUDIO="${FILENAME_WITHOUT_EXT}.wav"

# Use ffmpeg to extract audio as 16 kHz, 16-bit PCM, two channels.
# -n: never overwrite, so an existing WAV from a previous run is reused.
ffmpeg -n -i "${INPUT_VIDEO}" -vn -acodec pcm_s16le -ar 16000 -ac 2 "${OUTPUT_AUDIO}"

# The transcription is expected next to the audio file as "<audio>.json".
TRANSCRIPTION_JSON="${OUTPUT_AUDIO}.json"
LOG_FILE="${FILENAME_WITHOUT_EXT}.log"

# Start with a fresh log; -f keeps the first run (no log yet) quiet.
rm -f -- "$LOG_FILE"

# Transcribe only when there is no transcription from a previous run.
if [ ! -f "${TRANSCRIPTION_JSON}" ]; then
  ./whisper.cpp/main -m whisper.cpp/models/ggml-small.en-tdrz.bin -f "${OUTPUT_AUDIO}" -tdrz -oj
fi
# Stage 2: translate every transcription segment and synthesize it as speech.
#
# Reads:  TRANSCRIPTION_JSON (whisper.cpp JSON), OPENAI_API_KEY
# Writes: translation_<from>_<to>.txt  raw chat-completion response (cache)
#         speech_<from>_<to>.mp3       synthesized speech for the segment
# <from>/<to> are the segment offsets taken from the transcription. Existing
# files are reused so an interrupted run can be resumed.
system_prompt="Translate this English video dialogue into Dutch for a young audience, keeping the translation similar in length to the original. Ignore tokens in brackets."

# Only segments whose text does not start with "[" are processed.
jq -c '.transcription[] | select(.text | test("^[^\\[]"))' "${TRANSCRIPTION_JSON}" | while IFS= read -r segment; do
  # Extract the text and timestamps. The segment is passed quoted so that
  # whitespace and glob characters in the dialogue survive untouched.
  text=$(jq -r '.text' <<< "$segment")
  from_off=$(jq -r '.offsets.from' <<< "$segment" | tr -d '[:punct:]')
  to_off=$(jq -r '.offsets.to' <<< "$segment" | tr -d '[:punct:]')

  # Define the output file names based on the timestamps
  translation_output_file="translation_${from_off}_${to_off}.txt"
  speech_output_file="speech_${from_off}_${to_off}.mp3"

  if [ -f "${speech_output_file}" ]; then
    echo "Skipping ${speech_output_file}, already exists"
    continue
  fi

  if [ ! -f "${translation_output_file}" ]; then
    # Translate. The request body is built with jq so that quotes, backslashes
    # and newlines in the dialogue are escaped correctly.
    chat_payload=$(jq -n --arg system "$system_prompt" --arg text "$text" \
      '{model: "gpt-4-1106-preview",
        messages: [
          {role: "system", content: $system},
          {role: "user", content: $text}
        ]}')
    # To debug through a local proxy add: -x 'http://localhost:8888'
    # (do not put a comment line between the continuation lines below: a
    # comment ends the command and the remaining options would be dropped).
    curl -s "https://api.openai.com/v1/chat/completions" \
      -H "Authorization: Bearer $OPENAI_API_KEY" \
      -H "Content-Type: application/json" \
      -d "$chat_payload" \
      --output "${translation_output_file}"
  fi

  # Take the text of the first choice; empty when the response has none
  # (for example when the API returned an error object).
  text_translated=$(jq -r '.choices[0].message.content // empty' "${translation_output_file}")
  if [ -z "$text_translated" ]; then
    echo "No translation found in ${translation_output_file}; delete it to retry. Skipping segment." >&2
    continue
  fi
  printf '%s\n' "$text_translated"

  # Call the OpenAI TTS API. The audio is downloaded to a temporary name and
  # only renamed on success, so a failed request never leaves a bogus .mp3
  # behind that later runs would treat as finished.
  speech_payload=$(jq -n --arg input "$text_translated" \
    '{model: "tts-1", input: $input, voice: "alloy"}')
  if curl -s --fail -X POST "https://api.openai.com/v1/audio/speech" \
    -H "Authorization: Bearer $OPENAI_API_KEY" \
    -H "Content-Type: application/json" \
    -d "$speech_payload" \
    --output "${speech_output_file}.part"; then
    mv -- "${speech_output_file}.part" "${speech_output_file}"
  else
    echo "Speech synthesis failed for ${speech_output_file}" >&2
    rm -f -- "${speech_output_file}.part"
  fi
done
# Stage 3: lay all speech_<start>_<end>.mp3 segments on one timeline.
#
# Reads:  FILENAME_WITHOUT_EXT, LOG_FILE, speech_*.mp3 in the current directory
# Writes: MIXED_AUDIO ("<name>.mixed.mp3")
# For each segment, silence is appended up to the segment's start offset and
# then the speech itself. Speech that does not fit before the segment's end
# offset is sped up, but never by more than MAX_SPEED.
MIXED_AUDIO="${FILENAME_WITHOUT_EXT}.mixed.mp3"
MAX_SPEED=1.5 # set max 150%

# Remove leftovers of a previous run; -f keeps a first run quiet. temp.mp3 is
# included so ffmpeg never stops to ask whether it may overwrite it.
rm -f -- "$MIXED_AUDIO" silence.mp3 converted.mp3 temp.mp3

# create empty file of duration 0
ffmpeg -f lavfi -i anullsrc=r=44100:cl=stereo -t 0 "$MIXED_AUDIO"

# Iterate over the speech files in natural (numeric) order. The list is read
# from file descriptor 3 so the commands in the loop keep their normal stdin.
FILLED=0 # milliseconds of audio written to MIXED_AUDIO so far
SPEECH_NAME_RE='^speech_([0-9]+)_([0-9]+)\.mp3$'
while IFS= read -r FILE <&3; do
  # An unmatched glob is passed through literally; skip it.
  [ -f "$FILE" ] || continue
  if [[ ! "$FILE" =~ $SPEECH_NAME_RE ]]; then
    echo "Ignoring ${FILE}: unexpected file name" >&2
    continue
  fi
  echo "Processing ${FILE}..."

  # Extract the start (offset) and end times from the file name
  # (10# forces base 10 so leading zeros are not read as octal).
  START_MS=$(( 10#${BASH_REMATCH[1]} ))
  END_MS=$(( 10#${BASH_REMATCH[2]} ))

  # Duration of the speech file in whole milliseconds.
  DURATION_SECONDS=$(ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 "$FILE")
  if [[ ! "$DURATION_SECONDS" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
    echo "Skipping ${FILE}: could not determine its duration" >&2
    continue
  fi
  DURATION=$(bc <<< "scale=0; ${DURATION_SECONDS} * 1000" | sed -E "s/([0-9]+)\.[0-9]+/\1/")

  # Calculate the duration for the silent segment (at least 1 ms, also when
  # the previous speech already ran past this segment's start).
  SILENCE_DURATION=$(( START_MS - FILLED ))
  SILENCE_DURATION=$(( SILENCE_DURATION > 1 ? SILENCE_DURATION : 1 ))
  AVAILABLE_DURATION=$(( END_MS - FILLED - SILENCE_DURATION ))

  # Create a silent audio segment with the duration until the next speech segment
  SILENCE_SECONDS=$(printf "%.3f" "$(bc <<< "scale=3; $SILENCE_DURATION/1000")")
  ffmpeg -f lavfi -i anullsrc=r=44100:cl=stereo -t "$SILENCE_SECONDS" -q:a 9 silence.mp3 >> "${LOG_FILE}" 2>&1

  # Concat silence
  echo "Concatting ${SILENCE_DURATION}ms of silence"
  ffmpeg -i "concat:${MIXED_AUDIO}|silence.mp3" -acodec copy temp.mp3 >> "${LOG_FILE}" 2>&1
  mv temp.mp3 "$MIXED_AUDIO"

  SPEED=1
  # Speeding up if needed
  if (( AVAILABLE_DURATION < DURATION )); then
    if (( AVAILABLE_DURATION > 0 )); then
      SPEED=$(bc <<< "scale=3; $DURATION/$AVAILABLE_DURATION")
      SPEED=$(bc <<< "scale=3; if (${SPEED} < ${MAX_SPEED}) ${SPEED} else ${MAX_SPEED}")
    else
      # The timeline is already at or past this segment's end: there is no
      # slot to fit into, so use the maximum speed instead of dividing by a
      # zero or negative duration.
      SPEED=$MAX_SPEED
    fi
    DURATION=$(bc <<< "scale=3; ${DURATION}/${SPEED}" | sed -E "s/([0-9]+)\.[0-9]+/\1/") # round to ms
    echo "Speeding up speech by ${SPEED}x"
  fi

  # Convert (to correct audio format & speed) & concat segment
  echo "Concatting ${DURATION}ms of speech"
  ffmpeg -i "$FILE" -ar 44100 -ac 2 -af "atempo=$SPEED" converted.mp3 >> "${LOG_FILE}" 2>&1
  ffmpeg -i "concat:${MIXED_AUDIO}|converted.mp3" -acodec copy temp.mp3 >> "${LOG_FILE}" 2>&1
  mv temp.mp3 "$MIXED_AUDIO"

  (( FILLED += SILENCE_DURATION + DURATION ))
  echo "Running count ${FILLED}ms (end of speech segment in source: ${END_MS}ms)"
  # read -n 1 -s -r -p "Press any key to continue"; echo

  # Cleanup
  rm -f -- silence.mp3 converted.mp3
done 3< <(printf '%s\n' speech_*.mp3 | sort -V)
# Stage 4: mix the dubbed speech over the original background audio and mux
# the result into a copy of the input video.
#
# Reads:  FILENAME_WITHOUT_EXT, OUTPUT_AUDIO, MIXED_AUDIO, INPUT_VIDEO, LOG_FILE
# Writes: final_audio.mp3, final_video.mp4 (both replaced on every run)

# Separate voice from music. The vocal-free stem is looked up at the path
# below and demucs is only run when that file does not exist yet.
NO_VOCALS="separated/htdemucs/${FILENAME_WITHOUT_EXT}/no_vocals.wav"
if [ ! -f "${NO_VOCALS}" ]; then
  python3 -m demucs --two-stems=vocals "$OUTPUT_AUDIO"
fi

# Merge the new audio track with the original video's audio track.
# weights='1 3': the dubbed speech gets three times the weight of the
# background track in the mix.
rm -f -- final_audio.mp3
if ! ffmpeg -i "$NO_VOCALS" -i "$MIXED_AUDIO" -filter_complex amix=inputs=2:weights='1 3' final_audio.mp3 >> "${LOG_FILE}" 2>&1; then
  echo "Mixing final_audio.mp3 failed, see ${LOG_FILE}" >&2
  exit 1
fi

# Replace the video's audio with the mix; the video stream is copied as-is.
rm -f -- final_video.mp4
if ! ffmpeg -i "$INPUT_VIDEO" -i final_audio.mp3 -c:v copy -map 0:v:0 -map 1:a:0 -shortest final_video.mp4 >> "${LOG_FILE}" 2>&1; then
  echo "Creating final_video.mp4 failed, see ${LOG_FILE}" >&2
  exit 1
fi
set +x
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment