ziyan0302/vtt2txt.py

## vtt2txt.py
"""
Convert YouTube subtitles(vtt) to human readable text.

Download only subtitles from YouTube with youtube-dl:
youtube-dl -o ytdl-subs --skip-download --write-sub --sub-format vtt <video_url>

Note that default subtitle format provided by YouTube is ass, which is hard
to process with simple regex. Luckily youtube-dl can convert ass to vtt, which
is easier to process.

To convert all vtt files inside a directory:
find . -name "*.vtt" -exec python vtt2txt.py {} \;
"""

import sys
import re
import pdb


def remove_tags(text):
    """
    Strip WebVTT inline markup from ``text`` and simplify cue timing lines.

    ``</c>`` / ``<c>`` / ``<c.colorXXX>`` tags and inline ``<NN:NN:NN.NNN>``
    timestamps are deleted. Cue timing lines that end in
    ``align:start position:0%`` are reduced to the first two two-digit
    fields of their start time, and whitespace-only lines are emptied.
    Returns the cleaned text as a single string.
    """
    markup_patterns = (
        r'</c>',
        r'<c(\.color\w+)?>',
        r'<\d{2}:\d{2}:\d{2}\.\d{3}>',
    )
    cleaned = text
    # Applied one after another, in this order, on the running result.
    for pattern in markup_patterns:
        cleaned = re.sub(pattern, '', cleaned)

    # Keep only the leading "NN:NN" of a cue timing line.
    cue_line = r'(\d{2}:\d{2}):\d{2}\.\d{3} --> .* align:start position:0%'
    cleaned = re.sub(cue_line, r'\g<1>', cleaned)

    # Blank out lines that contain nothing but whitespace.
    return re.sub(r'^\s+$', '', cleaned, flags=re.MULTILINE)

def remove_header(lines):
    """
    Drop the leading vtt header from a list of lines.

    Looks for the marker lines ``##`` and ``Language: en`` (exact matches).
    Everything up to and including the marker is removed; when both are
    present the position of ``Language: en`` wins because it is checked
    last. If neither marker exists the lines are returned in full.
    """
    header_end = -1
    for marker in ('##', 'Language: en'):
        try:
            header_end = lines.index(marker)
        except ValueError:
            # Marker not present; keep whatever position we already have.
            continue
    return lines[header_end + 1:]


def merge_duplicates(lines):
    """
    Remove empty lines and adjacent duplicated subtitle lines.

    Lines of the form ``NN:NN`` are treated as timestamps and are compared
    with the last timestamp yielded; every other non-empty line is compared
    with the last caption yielded. A line equal to its predecessor of the
    same kind is skipped.

    Args:
        lines: iterable of str, one subtitle line per item.

    Yields:
        str: the lines that survive de-duplication, in input order.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    timestamp_re = re.compile(r'^\d{2}:\d{2}$')
    last_timestamp = ''
    last_cap = ''
    for line in lines:
        if line == "":
            continue
        if timestamp_re.match(line):
            if line != last_timestamp:
                yield line
                last_timestamp = line
        elif line != last_cap:
            yield line
            last_cap = line

def rm_timestamp(lines):
    """
    Drop cue timing lines such as "00:00:06.913 --> 00:00:10.871".

    Any line that starts with ``NN:NN:`` is discarded; all other lines,
    including empty ones and short ``NN:NN`` timestamps, pass through.

    Args:
        lines: iterable of str.

    Yields:
        str: every line that does not start with ``NN:NN:``.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    cue_time_re = re.compile(r'^\d{2}:\d{2}:')
    for line in lines:
        if not cue_time_re.match(line):
            yield line


def merge_short_lines(lines):
    """
    Pack caption lines into longer output lines of up to about 80 characters.

    Empty lines and ``NN:NN`` timestamp lines are yielded immediately,
    prefixed with a newline character. Every other line is appended to a
    buffer while the combined length stays below 80; otherwise the stripped
    buffer is yielded once and restarted with the current line. Whatever is
    left in the buffer is yielded (unstripped) after the last input line.

    NOTE(review): buffered text is not flushed when a timestamp or empty
    line is passed through, so it is emitted after that marker -- confirm
    this ordering is intended.

    Args:
        lines: iterable of str.

    Yields:
        str: marker lines and merged caption lines.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    timestamp_re = re.compile(r'^\d{2}:\d{2}$')
    buffer = ''
    for line in lines:
        if line == "" or timestamp_re.match(line):
            yield '\n' + line
            continue

        if len(line + buffer) < 80:
            buffer += ' ' + line
        else:
            # Emit the filled buffer exactly once before starting a new one.
            yield buffer.strip()
            buffer = line
    yield buffer

def merge_short(lines):
    """
    Join consecutive non-empty lines in pairs.

    The first non-empty line is held back; if the next line is also
    non-empty the two are yielded as one line, separated by a single space
    unless the held line already ends in whitespace. An empty line flushes
    a held line on its own and is then passed through. A line still held
    when the input ends is yielded as well, so the last caption is not lost
    when the input has no trailing empty line.

    Args:
        lines: iterable of str.

    Yields:
        str: joined caption lines and the original empty lines.
    """
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    trailing_space_re = re.compile(r"\s$")
    buffer = ''
    pending = False  # True while `buffer` holds a line awaiting its partner
    for line in lines:
        if line == "":
            if pending:
                yield buffer
                pending = False
            yield ""
            continue
        if not pending:
            buffer = line
            pending = True
            continue
        separator = "" if trailing_space_re.search(buffer) else " "
        yield buffer + separator + line
        pending = False
    # Flush an unpaired final line instead of silently dropping it.
    if pending:
        yield buffer

def merge_sentence(lines):
    """
    Join a short line that ends mid-sentence with the line that follows it.

    A non-empty line whose length combined with the current buffer is below
    60 and which ends in an ASCII letter or digit is held back as
    "uncomplete". The next short line that does not end in a letter or
    digit is yielded as ``held + " " + line``. Empty lines are swallowed
    while a line is held and passed through otherwise. Lines that make the
    combined length 60 or more are yielded unchanged.

    NOTE(review): a held line is replaced, not extended, when another short
    line ending in a letter or digit arrives; the buffer is not cleared
    after being yielded; and a line still held at the end of input is never
    yielded. These behaviours are kept as they were -- confirm the intended
    semantics before relying on this function.

    Args:
        lines: iterable of str.

    Yields:
        str: merged or unchanged lines.
    """
    ends_alnum_re = re.compile(r"[A-Za-z0-9]$")
    buffer = ""
    uncomplete = False
    for line in lines:
        if line == "":
            if not uncomplete:
                yield line
            continue
        if len(buffer + line) >= 60:
            yield line
            continue
        if ends_alnum_re.search(line):
            # Hold this line until one with closing punctuation arrives.
            uncomplete = True
            buffer = line
            continue
        if uncomplete:
            uncomplete = False
            yield buffer + " " + line
        else:
            yield line


def main():
    """
    Convert the vtt file named on the command line to a text file.

    Reads ``sys.argv[1]``, strips markup, the header and cue timing lines,
    joins caption lines pairwise, and writes one line per item to the output
    file. The output name is the input name with a trailing ``.vtt``
    replaced by ``2.txt``; for any other input name ``2.txt`` is appended so
    the input file is never overwritten.

    Raises:
        SystemExit: if no input file name was given.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: python vtt2txt.py <file.vtt>")
    vtt_file_name = sys.argv[1]

    # Escape the dot so only a literal ".vtt" suffix is replaced.
    txt_name, replaced = re.subn(r'\.vtt$', '2.txt', vtt_file_name)
    if not replaced:
        txt_name = vtt_file_name + '2.txt'

    # WebVTT files are UTF-8; "utf-8-sig" also tolerates a leading BOM.
    with open(vtt_file_name, encoding='utf-8-sig') as f:
        text = f.read()

    lines = remove_header(remove_tags(text).splitlines())
    lines = merge_short(rm_timestamp(lines))

    with open(txt_name, 'w', encoding='utf-8') as f:
        f.writelines(line + "\n" for line in lines)


# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()
	"""
	Convert YouTube subtitles(vtt) to human readable text.

	Download only subtitles from YouTube with youtube-dl:
	youtube-dl -o ytdl-subs --skip-download --write-sub --sub-format vtt <video_url>

	Note that default subtitle format provided by YouTube is ass, which is hard
	to process with simple regex. Luckily youtube-dl can convert ass to vtt, which
	is easier to process.

	To convert all vtt files inside a directory:
	find . -name "*.vtt" -exec python vtt2txt.py {} \;
	"""

	import sys
	import re
	import pdb


	def remove_tags(text):
	    """
	    Strip WebVTT inline markup from ``text`` and simplify cue timing lines.

	    ``</c>`` / ``<c>`` / ``<c.colorXXX>`` tags and inline ``<NN:NN:NN.NNN>``
	    timestamps are deleted. Cue timing lines that end in
	    ``align:start position:0%`` are reduced to the first two two-digit
	    fields of their start time, and whitespace-only lines are emptied.
	    Returns the cleaned text as a single string.
	    """
	    markup_patterns = (
	        r'</c>',
	        r'<c(\.color\w+)?>',
	        r'<\d{2}:\d{2}:\d{2}\.\d{3}>',
	    )
	    cleaned = text
	    # Applied one after another, in this order, on the running result.
	    for pattern in markup_patterns:
	        cleaned = re.sub(pattern, '', cleaned)

	    # Keep only the leading "NN:NN" of a cue timing line.
	    cue_line = r'(\d{2}:\d{2}):\d{2}\.\d{3} --> .* align:start position:0%'
	    cleaned = re.sub(cue_line, r'\g<1>', cleaned)

	    # Blank out lines that contain nothing but whitespace.
	    return re.sub(r'^\s+$', '', cleaned, flags=re.MULTILINE)

	def remove_header(lines):
	    """
	    Drop the leading vtt header from a list of lines.

	    Looks for the marker lines ``##`` and ``Language: en`` (exact matches).
	    Everything up to and including the marker is removed; when both are
	    present the position of ``Language: en`` wins because it is checked
	    last. If neither marker exists the lines are returned in full.
	    """
	    header_end = -1
	    for marker in ('##', 'Language: en'):
	        if marker in lines:
	            header_end = lines.index(marker)
	    return lines[header_end + 1:]


	def merge_duplicates(lines):
	    """
	    Remove empty lines and adjacent duplicated subtitle lines.

	    Lines of the form ``NN:NN`` are treated as timestamps and are compared
	    with the last timestamp yielded; every other non-empty line is compared
	    with the last caption yielded. A line equal to its predecessor of the
	    same kind is skipped.

	    Args:
	        lines: iterable of str, one subtitle line per item.

	    Yields:
	        str: the lines that survive de-duplication, in input order.
	    """
	    # Raw string: '\d' in a plain literal is an invalid escape sequence.
	    timestamp_re = re.compile(r'^\d{2}:\d{2}$')
	    last_timestamp = ''
	    last_cap = ''
	    for line in lines:
	        if line == "":
	            continue
	        if timestamp_re.match(line):
	            if line != last_timestamp:
	                yield line
	                last_timestamp = line
	        elif line != last_cap:
	            yield line
	            last_cap = line

	def rm_timestamp(lines):
	    """
	    Drop cue timing lines such as "00:00:06.913 --> 00:00:10.871".

	    Any line that starts with ``NN:NN:`` is discarded; all other lines,
	    including empty ones and short ``NN:NN`` timestamps, pass through.

	    Args:
	        lines: iterable of str.

	    Yields:
	        str: every line that does not start with ``NN:NN:``.
	    """
	    # Raw string: '\d' in a plain literal is an invalid escape sequence.
	    cue_time_re = re.compile(r'^\d{2}:\d{2}:')
	    for line in lines:
	        if not cue_time_re.match(line):
	            yield line



	def merge_short_lines(lines):
	    """
	    Pack caption lines into longer output lines of up to about 80 characters.

	    Empty lines and ``NN:NN`` timestamp lines are yielded immediately,
	    prefixed with a newline character. Every other line is appended to a
	    buffer while the combined length stays below 80; otherwise the stripped
	    buffer is yielded once and restarted with the current line. Whatever is
	    left in the buffer is yielded (unstripped) after the last input line.

	    NOTE(review): buffered text is not flushed when a timestamp or empty
	    line is passed through, so it is emitted after that marker -- confirm
	    this ordering is intended.

	    Args:
	        lines: iterable of str.

	    Yields:
	        str: marker lines and merged caption lines.
	    """
	    # Raw string: '\d' in a plain literal is an invalid escape sequence.
	    timestamp_re = re.compile(r'^\d{2}:\d{2}$')
	    buffer = ''
	    for line in lines:
	        if line == "" or timestamp_re.match(line):
	            yield '\n' + line
	            continue

	        if len(line + buffer) < 80:
	            buffer += ' ' + line
	        else:
	            # Emit the filled buffer exactly once before starting a new one.
	            yield buffer.strip()
	            buffer = line
	    yield buffer

	def merge_short(lines):
	    """
	    Join consecutive non-empty lines in pairs.

	    The first non-empty line is held back; if the next line is also
	    non-empty the two are yielded as one line, separated by a single space
	    unless the held line already ends in whitespace. An empty line flushes
	    a held line on its own and is then passed through. A line still held
	    when the input ends is yielded as well, so the last caption is not lost
	    when the input has no trailing empty line.

	    Args:
	        lines: iterable of str.

	    Yields:
	        str: joined caption lines and the original empty lines.
	    """
	    # Raw string: '\s' in a plain literal is an invalid escape sequence.
	    trailing_space_re = re.compile(r"\s$")
	    buffer = ''
	    pending = False  # True while `buffer` holds a line awaiting its partner
	    for line in lines:
	        if line == "":
	            if pending:
	                yield buffer
	                pending = False
	            yield ""
	            continue
	        if not pending:
	            buffer = line
	            pending = True
	            continue
	        separator = "" if trailing_space_re.search(buffer) else " "
	        yield buffer + separator + line
	        pending = False
	    # Flush an unpaired final line instead of silently dropping it.
	    if pending:
	        yield buffer

	def merge_sentence(lines):
	    """
	    Join a short line that ends mid-sentence with the line that follows it.

	    A non-empty line whose length combined with the current buffer is below
	    60 and which ends in an ASCII letter or digit is held back as
	    "uncomplete". The next short line that does not end in a letter or
	    digit is yielded as ``held + " " + line``. Empty lines are swallowed
	    while a line is held and passed through otherwise. Lines that make the
	    combined length 60 or more are yielded unchanged.

	    NOTE(review): a held line is replaced, not extended, when another short
	    line ending in a letter or digit arrives; the buffer is not cleared
	    after being yielded; and a line still held at the end of input is never
	    yielded. These behaviours are kept as they were -- confirm the intended
	    semantics before relying on this function.

	    Args:
	        lines: iterable of str.

	    Yields:
	        str: merged or unchanged lines.
	    """
	    ends_alnum_re = re.compile(r"[A-Za-z0-9]$")
	    buffer = ""
	    uncomplete = False
	    for line in lines:
	        if line == "":
	            if not uncomplete:
	                yield line
	            continue
	        if len(buffer + line) >= 60:
	            yield line
	            continue
	        if ends_alnum_re.search(line):
	            # Hold this line until one with closing punctuation arrives.
	            uncomplete = True
	            buffer = line
	            continue
	        if uncomplete:
	            uncomplete = False
	            yield buffer + " " + line
	        else:
	            yield line





	def main():
	    """
	    Convert the vtt file named on the command line to a text file.

	    Reads ``sys.argv[1]``, strips markup, the header and cue timing lines,
	    joins caption lines pairwise, and writes one line per item to the output
	    file. The output name is the input name with a trailing ``.vtt``
	    replaced by ``2.txt``; for any other input name ``2.txt`` is appended so
	    the input file is never overwritten.

	    Raises:
	        SystemExit: if no input file name was given.
	    """
	    if len(sys.argv) < 2:
	        sys.exit("usage: python vtt2txt.py <file.vtt>")
	    vtt_file_name = sys.argv[1]

	    # Escape the dot so only a literal ".vtt" suffix is replaced.
	    txt_name, replaced = re.subn(r'\.vtt$', '2.txt', vtt_file_name)
	    if not replaced:
	        txt_name = vtt_file_name + '2.txt'

	    # WebVTT files are UTF-8; "utf-8-sig" also tolerates a leading BOM.
	    with open(vtt_file_name, encoding='utf-8-sig') as f:
	        text = f.read()

	    lines = remove_header(remove_tags(text).splitlines())
	    lines = merge_short(rm_timestamp(lines))

	    with open(txt_name, 'w', encoding='utf-8') as f:
	        f.writelines(line + "\n" for line in lines)



	# Run the conversion only when this file is executed as a script.
	if __name__ == "__main__":
	    main()