Thomvis/atdv.sh Secret

## atdv.sh
#!/bin/bash
# set -x
#
# Usage: atdv.sh <input-video>
#
# Dubs an English video into Dutch: extracts the audio, transcribes it with
# whisper.cpp, translates each segment and synthesises speech through the
# OpenAI API (needs OPENAI_API_KEY), then mixes the result back into the video.
# All intermediate and output files are written to the current directory.

if [ "$#" -lt 1 ] || [ -z "$1" ]; then
    echo "Usage: $0 <input-video>" >&2
    exit 1
fi

# The input video file
INPUT_VIDEO="$1"

if [ ! -f "${INPUT_VIDEO}" ]; then
    echo "Input video not found: ${INPUT_VIDEO}" >&2
    exit 1
fi

# Extract the filename without the extension
FILENAME=$(basename -- "$INPUT_VIDEO")
FILENAME_WITHOUT_EXT="${FILENAME%.*}"

# The output audio file
OUTPUT_AUDIO="${FILENAME_WITHOUT_EXT}.wav"

# Use ffmpeg to extract audio (16 kHz, 16-bit PCM).
# -n never overwrites, so an audio file from an earlier run is reused as-is.
ffmpeg -n -i "${INPUT_VIDEO}" -vn -acodec pcm_s16le -ar 16000 -ac 2 "${OUTPUT_AUDIO}"

if [ ! -f "${OUTPUT_AUDIO}" ]; then
    echo "Audio extraction failed: ${OUTPUT_AUDIO} was not created" >&2
    exit 1
fi

TRANSCRIPTION_JSON="${OUTPUT_AUDIO}.json"

# Start every run with a fresh log; -f keeps rm quiet when there is no old log.
LOG_FILE="${FILENAME_WITHOUT_EXT}.log"
rm -f -- "${LOG_FILE}"

# Transcribe only when no transcription exists yet. The script expects
# whisper.cpp (-oj) to write its JSON next to the audio as "<audio>.json".
if [ ! -f "${TRANSCRIPTION_JSON}" ]; then
    ./whisper.cpp/main -m whisper.cpp/models/ggml-small.en-tdrz.bin -f "${OUTPUT_AUDIO}" -tdrz -oj
fi

# For every transcribed segment whose text does not start with "[" (bracketed
# tokens are skipped), translate the text to Dutch and synthesise speech.
# Results are cached per segment in translation_<from>_<to>.txt and
# speech_<from>_<to>.mp3, so a re-run only requests what is still missing.
jq -c '.transcription[] | select(.text | test("^[^\\[]"))' "${TRANSCRIPTION_JSON}" | while IFS= read -r segment; do

    # Extract the text and timestamps. The segment is passed quoted so the
    # transcript is neither word-split nor glob-expanded by the shell.
    text=$(jq -r '.text' <<< "$segment")
    from_off=$(jq -r '.offsets.from' <<< "$segment" | tr -d '[:punct:]')
    to_off=$(jq -r '.offsets.to' <<< "$segment" | tr -d '[:punct:]')

    # Define the output file name based on the timestamps
    translation_output_file="translation_${from_off}_${to_off}.txt"
    speech_output_file="speech_${from_off}_${to_off}.mp3"

    if [ ! -f "${speech_output_file}" ]; then
        if [ ! -f "${translation_output_file}" ]; then
            # Build the request body with jq so quotes, backslashes and
            # newlines in the transcript are escaped as valid JSON.
            chat_payload=$(jq -n --arg text "$text" '{
                model: "gpt-4-1106-preview",
                messages: [
                    {role: "system", content: "Translate this English video dialogue into Dutch for a young audience, keeping the translation similar in length to the original. Ignore tokens in brackets."},
                    {role: "user", content: $text}
                ]
            }')

            # Translate.
            # To debug through a local proxy, add: -x 'http://localhost:8888'
            # (do not park it as a comment between the continuation lines:
            # a comment ends the command and drops every option after it).
            if ! curl -sS --fail "https://api.openai.com/v1/chat/completions" \
                -H "Authorization: Bearer $OPENAI_API_KEY" \
                -H "Content-Type: application/json" \
                -d "$chat_payload" \
                --output "${translation_output_file}"; then
                echo "Translation request failed for segment ${from_off}-${to_off}" >&2
                # Do not leave a partial file behind that would be taken for a cached result.
                rm -f -- "${translation_output_file}"
                continue
            fi
        fi

        # The content is kept JSON-encoded (with its quotes) so it can be
        # embedded in the next request unchanged; -e fails on a null result.
        if ! text_translated=$(jq -e '.choices[0].message.content' "${translation_output_file}"); then
            echo "No translation found in ${translation_output_file}" >&2
            continue
        fi
        printf '%s\n' "$text_translated"

        speech_payload=$(jq -n --argjson input "$text_translated" \
            '{model: "tts-1", input: $input, voice: "alloy"}')

        # Call the OpenAI TTS API. --fail stops an HTTP error body from being
        # saved as if it were audio.
        if ! curl -sS --fail -X POST "https://api.openai.com/v1/audio/speech" \
            -H "Authorization: Bearer $OPENAI_API_KEY" \
            -H "Content-Type: application/json" \
            -d "$speech_payload" \
            --output "${speech_output_file}"; then
            echo "Speech request failed for segment ${from_off}-${to_off}" >&2
            rm -f -- "${speech_output_file}"
            continue
        fi
    else
        echo "Skipping ${speech_output_file}, already exists"
    fi

done

# Build one continuous dubbed track: for every speech_<from>_<to>.mp3, pad with
# silence up to the segment's start offset, then append the speech, sped up
# (to at most 1.5x) when it would not fit before the segment's end offset.
MIXED_AUDIO="${FILENAME_WITHOUT_EXT}.mixed.mp3"

# Remove leftovers from an earlier run; -f keeps rm quiet when they do not exist.
rm -f -- "${MIXED_AUDIO}" silence.mp3 converted.mp3 temp.mp3

# create empty file of duration 0
ffmpeg -f lavfi -i anullsrc=r=44100:cl=stereo -t 0 "${MIXED_AUDIO}"

# Collect the speech files in natural (version) order without parsing ls.
# The -f test drops the literal pattern that the glob leaves when nothing matches.
SPEECH_FILES=()
while IFS= read -r FILE; do
    if [ -f "$FILE" ]; then
        SPEECH_FILES+=("$FILE")
    fi
done < <(printf '%s\n' speech_*.mp3 | sort -V)

SPEECH_NAME_RE='^speech_([0-9]+)_([0-9]+)\.mp3$'
MAX_SPEED_MILLI=1500 # speed-up cap in thousandths (150%)

# FILLED is the length in ms of the mixed track produced so far
FILLED=0
for FILE in "${SPEECH_FILES[@]}"; do
    echo "Processing ${FILE}..."

    # Extract the start (offset) and end times from the file name
    if [[ ! "$FILE" =~ $SPEECH_NAME_RE ]]; then
        echo "Skipping ${FILE}: unexpected file name" >&2
        continue
    fi
    # 10# forces base 10 so a leading zero is not read as octal
    START_MS=$(( 10#${BASH_REMATCH[1]} ))
    END_MS=$(( 10#${BASH_REMATCH[2]} ))

    # Speech length in whole milliseconds (the fractional part is cut off by sed)
    DURATION=$(bc <<< "scale=0; $(ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 "$FILE") * 1000" | sed -E "s/([0-9]+)\.[0-9]+/\1/")
    if [[ ! "$DURATION" =~ ^[0-9]+$ ]]; then
        echo "Skipping ${FILE}: could not determine its duration" >&2
        continue
    fi

    # Calculate the duration for the silent segment (at least 1 ms)
    SILENCE_DURATION=$(( START_MS - FILLED ))
    SILENCE_DURATION=$(( SILENCE_DURATION > 1 ? SILENCE_DURATION : 1 ))

    AVAILABLE_DURATION=$(( END_MS - FILLED - SILENCE_DURATION ))

    # Seconds with millisecond precision, formatted with integer math so the
    # decimal separator does not depend on the locale.
    SILENCE_SECONDS=$(printf '%d.%03d' $(( SILENCE_DURATION / 1000 )) $(( SILENCE_DURATION % 1000 )))

    # Create a silent audio segment with the duration until the next speech segment
    ffmpeg -y -f lavfi -i anullsrc=r=44100:cl=stereo -t "$SILENCE_SECONDS" -q:a 9 silence.mp3 >> "${LOG_FILE}" 2>&1

    # Concat silence (-y: a stale temp.mp3 must not trigger an overwrite prompt)
    echo "Concatting ${SILENCE_DURATION}ms of silence"
    ffmpeg -y -i "concat:${MIXED_AUDIO}|silence.mp3" -acodec copy temp.mp3 >> "${LOG_FILE}" 2>&1 \
        && mv -f temp.mp3 "${MIXED_AUDIO}"

    SPEED=1
    # Speeding up if needed
    if (( AVAILABLE_DURATION < DURATION )); then
        if (( AVAILABLE_DURATION > 0 )); then
            SPEED_MILLI=$(( DURATION * 1000 / AVAILABLE_DURATION ))
        else
            # The track already runs past this segment's end: nothing to
            # divide by, so use the maximum speed-up.
            SPEED_MILLI=$MAX_SPEED_MILLI
        fi
        if (( SPEED_MILLI > MAX_SPEED_MILLI )); then
            SPEED_MILLI=$MAX_SPEED_MILLI # set max 150%
        fi

        SPEED=$(printf '%d.%03d' $(( SPEED_MILLI / 1000 )) $(( SPEED_MILLI % 1000 )))
        DURATION=$(( DURATION * 1000 / SPEED_MILLI )) # truncated to whole ms
        echo "Speeding up speech by ${SPEED}x"
    fi

    # Convert (to correct audio format & speed) & concat segment
    echo "Concatting ${DURATION}ms of speech"
    ffmpeg -y -i "$FILE" -ar 44100 -ac 2 -af "atempo=${SPEED}" converted.mp3 >> "${LOG_FILE}" 2>&1

    ffmpeg -y -i "concat:${MIXED_AUDIO}|converted.mp3" -acodec copy temp.mp3 >> "${LOG_FILE}" 2>&1 \
        && mv -f temp.mp3 "${MIXED_AUDIO}"

    FILLED=$(( FILLED + SILENCE_DURATION + DURATION ))
    echo "Running count ${FILLED}ms (end of speech segment in source: ${END_MS}ms)"

    # read -n 1 -s -r -p "Press any key to continue"; echo

    # Cleanup
    rm -f -- silence.mp3 converted.mp3
done

# Separate voice from music. The script expects demucs to write the
# accompaniment stem to the path below; an existing stem is reused.
NO_VOCALS="separated/htdemucs/${FILENAME_WITHOUT_EXT}/no_vocals.wav"
if [ ! -f "${NO_VOCALS}" ]; then
    python3 -m demucs --two-stems=vocals "${OUTPUT_AUDIO}"
fi

# Mix the dubbed speech over the vocal-free stem; the speech input is
# weighted 3:1 against the accompaniment.
rm -f -- final_audio.mp3
if ! ffmpeg -i "$NO_VOCALS" -i "$MIXED_AUDIO" -filter_complex amix=inputs=2:weights='1 3' final_audio.mp3 >> "${LOG_FILE}" 2>&1; then
    # ffmpeg's own output goes to the log, so say where to look.
    echo "Mixing the final audio failed, see ${LOG_FILE}" >&2
    exit 1
fi

# Replace the video's audio with the mix; the video stream is copied, not re-encoded.
rm -f -- final_video.mp4
if ! ffmpeg -i "$INPUT_VIDEO" -i final_audio.mp3 -c:v copy -map 0:v:0 -map 1:a:0 -shortest final_video.mp4 >> "${LOG_FILE}" 2>&1; then
    echo "Muxing the final video failed, see ${LOG_FILE}" >&2
    exit 1
fi

set +x
	#!/bin/bash
	# set -x
	#
	# NOTE(review): everything from the shebang above down to the final
	# `set +x` repeats the script that precedes it in this file, indented by
	# one tab and with every pipe written as `\|`. A backslash-escaped pipe is
	# handed to the command as a literal `|` argument instead of starting a
	# pipeline, so this copy does not behave like the first one (for example,
	# the `\| while read -r segment; do` line no longer opens a loop).
	# Presumably an export/rendering artifact - confirm, and delete it rather
	# than maintain it.

	# The input video file
	INPUT_VIDEO="$1"

	# Extract the filename without the extension
	FILENAME=$(basename -- "$INPUT_VIDEO")
	FILENAME_WITHOUT_EXT="${FILENAME%.*}"

	# The output audio file
	OUTPUT_AUDIO="${FILENAME_WITHOUT_EXT}.wav"

	# Use ffmpeg to extract audio
	ffmpeg -n -i "${INPUT_VIDEO}" -vn -acodec pcm_s16le -ar 16000 -ac 2 "${OUTPUT_AUDIO}"

	TRANSCRIPTION_JSON="${OUTPUT_AUDIO}.json"

	LOG_FILE="${FILENAME_WITHOUT_EXT}.log"
	rm $LOG_FILE

	if [ ! -f "${TRANSCRIPTION_JSON}" ]; then
	./whisper.cpp/main -m whisper.cpp/models/ggml-small.en-tdrz.bin -f ${OUTPUT_AUDIO} -tdrz -oj
	fi

	# NOTE(review): with the escaped pipes, `while`, `read`, ... become extra
	# arguments to jq and the following `do` is left without a loop to open.
	jq -c '.transcription[] \| select(.text \| test("^[^\\[]"))' "${TRANSCRIPTION_JSON}" \| while read -r segment; do

	# Extract the text and timestamps
	text=$(echo $segment \| jq -r '.text')
	from_off=$(echo $segment \| jq -r '.offsets.from' \| tr -d '[:punct:]')
	to_off=$(echo $segment \| jq -r '.offsets.to' \| tr -d '[:punct:]')

	# Define the output file name based on the timestamps
	translation_output_file="translation_${from_off}_${to_off}.txt"
	speech_output_file="speech_${from_off}_${to_off}.mp3"

	if [ ! -f "${speech_output_file}" ]; then
	if [ ! -f "${translation_output_file}" ]; then
	# Translate
	curl -s "https://api.openai.com/v1/chat/completions" \
	# -x 'http://localhost:8888' \
	-H "Authorization: Bearer $OPENAI_API_KEY" \
	-H "Content-Type: application/json" \
	-d "{\"model\": \"gpt-4-1106-preview\",\"messages\": [\
	{\"role\": \"system\", \"content\": \"Translate this English video dialogue into Dutch for a young audience, keeping the translation similar in length to the original. Ignore tokens in brackets.\"}, \
	{\"role\": \"user\", \"content\": \"$text\"} \
	]}" \
	--output "${translation_output_file}"
	fi

	text_translated=$(cat $translation_output_file \| jq '.choices[]'.message.content)
	echo $text_translated

	# Call the OpenAI TTS API
	response=$(curl -s -X POST "https://api.openai.com/v1/audio/speech" \
	-H "Authorization: Bearer $OPENAI_API_KEY" \
	-H "Content-Type: application/json" \
	-d "{\"model\": \"tts-1\", \"input\": $text_translated, \"voice\": \"alloy\"}" \
	--output "${speech_output_file}")
	else
	echo "Skipping ${speech_output_file}, already exists"
	fi

	done

	MIXED_AUDIO="${FILENAME_WITHOUT_EXT}.mixed.mp3"
	rm $MIXED_AUDIO
	rm silence.mp3
	rm converted.mp3

	# create empty file of duration 0
	ffmpeg -f lavfi -i anullsrc=r=44100:cl=stereo -t 0 $MIXED_AUDIO

	# Iterate over the speech files in natural order
	FILLED=0
	for FILE in $(ls speech_*.mp3 \| sort -V); do
	echo "Processing ${FILE}..."

	# Extract the start (offset) and end times from the file name
	START_MS=$(echo "$FILE" \| sed -E 's/speech_([0-9]+)_([0-9]+)\.mp3/\1/')
	END_MS=$(echo "$FILE" \| sed -E 's/speech_([0-9]+)_([0-9]+)\.mp3/\2/')

	DURATION=$(bc <<< "scale=0; $(ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 $FILE) * 1000" \| sed -E "s/([0-9]+)\.[0-9]+/\1/")

	# Calculate the duration for the silent segment
	SILENCE_DURATION=$(( START_MS - FILLED ))
	SILENCE_DURATION=$(( SILENCE_DURATION > 1 ? SILENCE_DURATION : 1 ))

	AVAILABLE_DURATION=$(( END_MS - FILLED - SILENCE_DURATION ))

	# Create a silent audio segment with the duration until the next speech segment
	ffmpeg -f lavfi -i anullsrc=r=44100:cl=stereo -t $(printf "%.3f" $(bc <<< "scale=3; $SILENCE_DURATION/1000")) -q:a 9 silence.mp3 >> ${LOG_FILE} 2>&1

	# Concat silence
	echo "Concatting ${SILENCE_DURATION}ms of silence"
	ffmpeg -i "concat:${MIXED_AUDIO}\|silence.mp3" -acodec copy temp.mp3 >> ${LOG_FILE} 2>&1
	mv temp.mp3 $MIXED_AUDIO

	SPEED=1
	# Speeding up if needed
	if (( AVAILABLE_DURATION < DURATION )); then
	SPEED=$(bc <<< "scale=3; $DURATION/$AVAILABLE_DURATION")
	SPEED=$(bc <<< "scale=3; if (${SPEED} < 1.5) ${SPEED} else 1.5") # set max 150%

	DURATION=$(bc <<< "scale=3; ${DURATION}/${SPEED}" \| sed -E "s/([0-9]+)\.[0-9]+/\1/") # round to ms
	echo "Speeding up speech by ${SPEED}x"
	fi

	# Convert (to correct audio format & speed) & concat segment
	echo "Concatting ${DURATION}ms of speech"
	ffmpeg -i "$FILE" -ar 44100 -ac 2 -af atempo=$SPEED converted.mp3 >> ${LOG_FILE} 2>&1

	ffmpeg -i "concat:${MIXED_AUDIO}\|converted.mp3" -acodec copy temp.mp3 >> ${LOG_FILE} 2>&1
	mv temp.mp3 $MIXED_AUDIO

	(( FILLED += SILENCE_DURATION + DURATION ))
	echo "Running count ${FILLED}ms (end of speech segment in source: ${END_MS}ms)"

	# read -n 1 -s -r -p "Press any key to continue"; echo

	# Cleanup
	rm silence.mp3
	rm converted.mp3
	done

	# Separate voice from music
	NO_VOCALS="separated/htdemucs/${FILENAME_WITHOUT_EXT}/no_vocals.wav"
	if [ ! -f "${NO_VOCALS}" ]; then
	python3 -m demucs --two-stems=vocals $OUTPUT_AUDIO
	fi

	# Merge the new audio track with the original video's audio track
	rm final_audio.mp3
	ffmpeg -i "$NO_VOCALS" -i "$MIXED_AUDIO" -filter_complex amix=inputs=2:weights='1 3' final_audio.mp3 >> ${LOG_FILE} 2>&1

	rm final_video.mp4
	ffmpeg -i "$INPUT_VIDEO" -i final_audio.mp3 -c:v copy -map 0:v:0 -map 1:a:0 -shortest final_video.mp4 >> ${LOG_FILE} 2>&1

	set +x