sebbacon/gist:bee36e34e5aef06b8f0fbe7f310cd877

## gistfile1.txt
"""Hacky script to download audio from a BBC interview and transcribe, with OpenAI tools.

The script shells out to four external command-line tools, in this order:

1. ``get-iplayer`` downloads the programme given by ``--url``; the next step
   reads ``audio.m4a``, so the download is expected to produce that file.
2. ``ffmpeg`` clips the ``--start``..``--end`` range into ``audio_clipped.m4a``.
3. ``openai api audio.transcribe`` writes the raw transcript to
   ``transcription.txt``.
4. ``llm prompt`` labels the speakers and adds paragraph breaks, writing the
   result to ``cleaned_transcription.txt``.

All intermediate and output files live in the current working directory.
"""

import subprocess
from optparse import OptionParser

# Options that must all be supplied on the command line.
REQUIRED_OPTIONS = ("url", "start", "end", "interviewer", "interviewee", "topic")

# File names shared between the pipeline steps.
DOWNLOAD_PREFIX = "audio"
DOWNLOADED_AUDIO = "audio.m4a"
CLIPPED_AUDIO = "audio_clipped.m4a"
RAW_TRANSCRIPT = "transcription.txt"
CLEANED_TRANSCRIPT = "cleaned_transcription.txt"

LLM_MODEL = "gpt-3.5-turbo"


def parse_timecode(timecode: str) -> int:
    """Convert a ``HH:MM:SS`` timecode to a number of seconds.

    Shorter forms are accepted too: ``MM:SS`` and plain ``SS``.

    Raises:
        ValueError: if the timecode has more than three fields, or a field is
            not a non-negative integer.
    """
    fields = timecode.split(":")
    if len(fields) > 3:
        raise ValueError(f"Invalid timecode {timecode!r}; expected HH:MM:SS")

    seconds = 0
    for field in fields:
        try:
            value = int(field)
        except ValueError:
            raise ValueError(
                f"Invalid timecode {timecode!r}; expected HH:MM:SS"
            ) from None
        if value < 0:
            raise ValueError(f"Invalid timecode {timecode!r}; expected HH:MM:SS")
        # Each further field is one unit smaller: hours -> minutes -> seconds.
        seconds = seconds * 60 + value
    return seconds


def build_parser() -> OptionParser:
    """Return the command-line parser describing every supported option."""
    parser = OptionParser()
    parser.add_option(
        "-u", "--url", dest="url", help="URL of the BBC sound to download"
    )
    parser.add_option(
        "-s",
        "--start",
        dest="start",
        help="Start time to trim using ffmpeg in the format HH:MM:SS",
    )
    parser.add_option(
        "-e",
        "--end",
        dest="end",
        help="End time to trim using ffmpeg in the format HH:MM:SS",
    )
    parser.add_option(
        "-i", "--interviewer", dest="interviewer", help="Name of the interviewer"
    )
    parser.add_option(
        "-n", "--interviewee", dest="interviewee", help="Name of the interviewee"
    )
    parser.add_option("-t", "--topic", dest="topic", help="topic of the interview")
    return parser


def parse_options(argv=None):
    """Parse and validate the command line.

    Args:
        argv: list of argument strings (without the program name); ``None``
            makes optparse read ``sys.argv[1:]``.

    Returns:
        A ``(options, start_seconds, duration)`` tuple: the parsed optparse
        values, the clip start in seconds, and the clip length in seconds.

    Any problem (missing option, malformed timecode, end not after start) is
    reported through ``parser.error``, which prints usage and exits with
    status 2 before any external tool has been run.
    """
    parser = build_parser()
    options, _positional = parser.parse_args(argv)

    for name in REQUIRED_OPTIONS:
        if getattr(options, name) is None:
            parser.error(f"Required option --{name} not provided.")

    try:
        start_seconds = parse_timecode(options.start)
        end_seconds = parse_timecode(options.end)
    except ValueError as err:
        parser.error(str(err))

    duration = end_seconds - start_seconds
    if duration <= 0:
        # ffmpeg's -t needs a positive length; fail before downloading anything.
        parser.error("--end must be later than --start.")

    return options, start_seconds, duration


def build_system_prompt(topic: str, interviewer: str, interviewee: str) -> str:
    """Return the system prompt used to tidy up the raw transcript."""
    return f"""You are an expert subtitle transcriber.

The topic of this interview is {topic}.

It is a radio interview between two people; infer when the person talking changes,
and prefix each change of speaker with *{interviewer}:* and *{interviewee}:*.


Add some suitable paragraph breaks to this text to make it more readable.

Don't change any of the text itself.

Remove any text at the start and end which are not part of the interview"""


def main(argv=None):
    """Run the download -> clip -> transcribe -> clean-up pipeline.

    Every external command is run with ``check=True``, so a failing step
    raises ``subprocess.CalledProcessError`` and the later steps are skipped.
    """
    options, start_seconds, duration = parse_options(argv)

    # Download the programme. The prefix is passed without shell-style quotes:
    # no shell is involved, so quotes would be part of the argument itself.
    subprocess.run(
        [
            "get-iplayer",
            "--subtitles",
            options.url,
            f"--file-prefix={DOWNLOAD_PREFIX}",
        ],
        check=True,
    )

    # Clip the requested range, dropping any video stream (-vn) and
    # overwriting an earlier clip if one exists (-y).
    subprocess.run(
        [
            "ffmpeg",
            "-y",
            "-ss",
            str(start_seconds),
            "-t",
            str(duration),
            "-i",
            DOWNLOADED_AUDIO,
            "-vn",
            CLIPPED_AUDIO,
        ],
        check=True,
    )

    # Transcribe the clip; the tool's stdout is captured as the raw transcript.
    with open(RAW_TRANSCRIPT, "w") as raw_out:
        subprocess.run(
            ["openai", "api", "audio.transcribe", "-f", CLIPPED_AUDIO],
            stdout=raw_out,
            check=True,
        )

    # Feed the raw transcript to the LLM on stdin and save its cleaned output.
    system_prompt = build_system_prompt(
        options.topic, options.interviewer, options.interviewee
    )
    with open(RAW_TRANSCRIPT, "r") as raw_in, open(
        CLEANED_TRANSCRIPT, "w"
    ) as cleaned_out:
        subprocess.run(
            ["llm", "prompt", "--model", LLM_MODEL, "--system", system_prompt],
            stdin=raw_in,
            stdout=cleaned_out,
            check=True,
        )


if __name__ == "__main__":
    main()