Gist: Thomvis/474e16941959ece4d9d579c3f6e8c706
Save this gist to your computer and use it in GitHub Desktop.
Automated translation & dubbing of speech in video using AI
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Automated translation & dubbing of speech in a video using AI:
# 1. extract audio, 2. transcribe with whisper.cpp, 3. translate and
# synthesize Dutch speech via OpenAI, 4. rebuild the audio track and
# mux it back into the video.
# Usage: ./dub.sh <input-video>
# Requires: ffmpeg, ffprobe, jq, curl, bc, whisper.cpp, demucs,
# and $OPENAI_API_KEY in the environment.
# set -x

# The input video file (first positional argument).
INPUT_VIDEO="$1"
if [[ -z "$INPUT_VIDEO" ]]; then
  echo "Usage: $0 <input-video>" >&2
  exit 1
fi

# Base name (no directory, no extension) used to derive output names.
FILENAME=$(basename -- "$INPUT_VIDEO")
FILENAME_WITHOUT_EXT="${FILENAME%.*}"

# Extract 16 kHz stereo PCM WAV — the input format whisper.cpp expects.
# -n: never overwrite, so an extraction from a previous run is reused.
OUTPUT_AUDIO="${FILENAME_WITHOUT_EXT}.wav"
ffmpeg -n -i "${INPUT_VIDEO}" -vn -acodec pcm_s16le -ar 16000 -ac 2 "${OUTPUT_AUDIO}"

TRANSCRIPTION_JSON="${OUTPUT_AUDIO}.json"
LOG_FILE="${FILENAME_WITHOUT_EXT}.log"
# -f: do not error when no log file exists yet (first run).
rm -f -- "$LOG_FILE"

# Transcribe with speaker-turn detection (-tdrz), writing JSON (-oj).
# Skipped when a transcription from a previous run already exists.
if [ ! -f "${TRANSCRIPTION_JSON}" ]; then
  ./whisper.cpp/main -m whisper.cpp/models/ggml-small.en-tdrz.bin -f "${OUTPUT_AUDIO}" -tdrz -oj
fi
# Iterate over transcription segments, skipping non-speech markers
# (segments whose text starts with "[", e.g. "[MUSIC]").
# NB: the while body runs in a pipeline subshell; it only writes files,
# so no variables need to survive the loop.
jq -c '.transcription[] | select(.text | test("^[^\\[]"))' "${TRANSCRIPTION_JSON}" | while read -r segment; do
  # Extract the text and the from/to offsets; tr strips punctuation so
  # "HH:MM:SS,mmm"-style offsets collapse into plain digit strings.
  text=$(printf '%s' "$segment" | jq -r '.text')
  from_off=$(printf '%s' "$segment" | jq -r '.offsets.from' | tr -d '[:punct:]')
  to_off=$(printf '%s' "$segment" | jq -r '.offsets.to' | tr -d '[:punct:]')

  # Per-segment output files, named by timestamps so reruns can resume.
  translation_output_file="translation_${from_off}_${to_off}.txt"
  speech_output_file="speech_${from_off}_${to_off}.mp3"

  if [ ! -f "${speech_output_file}" ]; then
    if [ ! -f "${translation_output_file}" ]; then
      # Translate the segment to Dutch via the chat completions API.
      # The payload is built with jq so quotes/backslashes/newlines in
      # the transcript cannot break (or inject into) the JSON body.
      # (The original inlined a commented-out proxy flag inside the
      # backslash-continued curl command, which silently terminated the
      # command at the comment line and broke the request.)
      payload=$(jq -n --arg text "$text" '{
        model: "gpt-4-1106-preview",
        messages: [
          {role: "system", content: "Translate this English video dialogue into Dutch for a young audience, keeping the translation similar in length to the original. Ignore tokens in brackets."},
          {role: "user", content: $text}
        ]
      }')
      # For local debugging, add: -x 'http://localhost:8888'
      curl -s "https://api.openai.com/v1/chat/completions" \
        -H "Authorization: Bearer $OPENAI_API_KEY" \
        -H "Content-Type: application/json" \
        -d "$payload" \
        --output "${translation_output_file}"
    fi

    # Deliberately NOT using jq -r: the JSON-quoted string is spliced
    # verbatim into the TTS request body below as a JSON string value.
    text_translated=$(jq '.choices[].message.content' "$translation_output_file")
    echo "$text_translated"

    # Synthesize Dutch speech for this segment with the OpenAI TTS API.
    # (No command substitution: --output writes the mp3, curl prints nothing.)
    curl -s -X POST "https://api.openai.com/v1/audio/speech" \
      -H "Authorization: Bearer $OPENAI_API_KEY" \
      -H "Content-Type: application/json" \
      -d "{\"model\": \"tts-1\", \"input\": $text_translated, \"voice\": \"alloy\"}" \
      --output "${speech_output_file}"
  else
    echo "Skipping ${speech_output_file}, already exists"
  fi
done
# Build the dubbed audio track: alternate silence and speech segments so
# each segment starts at (approximately) its original timestamp.
MIXED_AUDIO="${FILENAME_WITHOUT_EXT}.mixed.mp3"
# -f: leftovers from a previous run may or may not exist.
rm -f -- "$MIXED_AUDIO" silence.mp3 converted.mp3

# Seed the output with an empty (0-duration) file to concat onto.
ffmpeg -f lavfi -i anullsrc=r=44100:cl=stereo -t 0 "$MIXED_AUDIO"

# FILLED tracks how many milliseconds of output have been produced.
FILLED=0
# Natural sort orders segments by start offset (speech_<from>_<to>.mp3).
# Word-splitting the list is safe: these names contain no whitespace.
for FILE in $(printf '%s\n' speech_*.mp3 | sort -V); do
  echo "Processing ${FILE}..."
  # Start/end timestamps (ms in the source video) from the file name.
  START_MS=$(echo "$FILE" | sed -E 's/speech_([0-9]+)_([0-9]+)\.mp3/\1/')
  END_MS=$(echo "$FILE" | sed -E 's/speech_([0-9]+)_([0-9]+)\.mp3/\2/')
  # Actual duration of the synthesized clip, truncated to whole ms.
  DURATION=$(bc <<< "scale=0; $(ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 "$FILE") * 1000" | sed -E "s/([0-9]+)\.[0-9]+/\1/")

  # Silence needed to push this segment to its original start time
  # (minimum 1 ms so the generated concat input is never empty).
  SILENCE_DURATION=$(( START_MS - FILLED ))
  SILENCE_DURATION=$(( SILENCE_DURATION > 1 ? SILENCE_DURATION : 1 ))
  # Time available until the segment's original end point; clamp to
  # >= 1 so the speed-up division below can never divide by zero when
  # the running track has already overrun this segment's slot.
  AVAILABLE_DURATION=$(( END_MS - FILLED - SILENCE_DURATION ))
  AVAILABLE_DURATION=$(( AVAILABLE_DURATION > 1 ? AVAILABLE_DURATION : 1 ))

  # Generate the silent gap and append it to the mixed track.
  ffmpeg -f lavfi -i anullsrc=r=44100:cl=stereo -t "$(printf "%.3f" "$(bc <<< "scale=3; $SILENCE_DURATION/1000")")" -q:a 9 silence.mp3 >> "${LOG_FILE}" 2>&1
  echo "Concatting ${SILENCE_DURATION}ms of silence"
  ffmpeg -i "concat:${MIXED_AUDIO}|silence.mp3" -acodec copy temp.mp3 >> "${LOG_FILE}" 2>&1
  mv temp.mp3 "$MIXED_AUDIO"

  # Speed the clip up (capped at 1.5x) when it would overrun its slot.
  SPEED=1
  if (( AVAILABLE_DURATION < DURATION )); then
    SPEED=$(bc <<< "scale=3; $DURATION/$AVAILABLE_DURATION")
    SPEED=$(bc <<< "scale=3; if (${SPEED} < 1.5) ${SPEED} else 1.5") # set max 150%
    DURATION=$(bc <<< "scale=3; ${DURATION}/${SPEED}" | sed -E "s/([0-9]+)\.[0-9]+/\1/") # round to ms
    echo "Speeding up speech by ${SPEED}x"
  fi

  # Resample / speed-adjust the clip, then append it to the mixed track.
  echo "Concatting ${DURATION}ms of speech"
  ffmpeg -i "$FILE" -ar 44100 -ac 2 -af atempo="$SPEED" converted.mp3 >> "${LOG_FILE}" 2>&1
  ffmpeg -i "concat:${MIXED_AUDIO}|converted.mp3" -acodec copy temp.mp3 >> "${LOG_FILE}" 2>&1
  mv temp.mp3 "$MIXED_AUDIO"

  (( FILLED += SILENCE_DURATION + DURATION ))
  echo "Running count ${FILLED}ms (end of speech segment in source: ${END_MS}ms)"
  # read -n 1 -s -r -p "Press any key to continue"; echo

  # Per-iteration cleanup of intermediate files.
  rm -f silence.mp3 converted.mp3
done
# Separate voice from background music/effects in the original audio.
# demucs writes its output under separated/htdemucs/<name>/; reuse it
# when a previous run already produced the separation.
NO_VOCALS="separated/htdemucs/${FILENAME_WITHOUT_EXT}/no_vocals.wav"
if [ ! -f "${NO_VOCALS}" ]; then
  python3 -m demucs --two-stems=vocals "$OUTPUT_AUDIO"
fi

# Mix the original background track with the dubbed speech, weighted
# 1:3 so the new speech sits clearly above the music bed.
rm -f final_audio.mp3
ffmpeg -i "$NO_VOCALS" -i "$MIXED_AUDIO" -filter_complex amix=inputs=2:weights='1 3' final_audio.mp3 >> "${LOG_FILE}" 2>&1

# Remux: copy the original video stream, replace the audio track.
rm -f final_video.mp4
ffmpeg -i "$INPUT_VIDEO" -i final_audio.mp3 -c:v copy -map 0:v:0 -map 1:a:0 -shortest final_video.mp4 >> "${LOG_FILE}" 2>&1

# Pairs with the commented-out "set -x" near the top of the script.
set +x
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment