# pansila/cnn_10_maker.py

## cnn_10_maker.py
import os
import sys
import argparse
import requests
import shutil
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from vtt_to_srt import vtt_to_srt

# Downloads (playlist, subtitles, merged video) are written to OUTPUT_DIR;
# the individual .ts segments are kept in OUTPUT_TMP until they are merged.
OUTPUT_DIR = 'output'
OUTPUT_TMP = os.path.join(OUTPUT_DIR, 'tmp')

# Create both directories at import time, parent first. A path that already
# exists is left alone.
for _directory in (OUTPUT_DIR, OUTPUT_TMP):
    try:
        os.mkdir(_directory)
    except FileExistsError:
        continue
del _directory

def m3u8Downloader(m3u8):
    """Download the ``.ts`` segments listed in a playlist and merge them.

    Segments already present in ``OUTPUT_TMP`` are not downloaded again, so
    an interrupted run can be resumed. The merged file is written to
    ``OUTPUT_DIR`` under the playlist's base name with an ``.mp4`` extension;
    the segments are concatenated byte-for-byte, not re-encoded.

    Args:
        m3u8: Path of a local playlist file. Only lines ending with
            ``.ts?null=0`` are used, and each is passed to ``requests.get``
            unchanged, so they must be complete URLs.

    Returns:
        0 when every segment was available and merged, -1 when the playlist
        lists no segments or at least one segment could not be downloaded.
    """
    with open(m3u8, 'r') as f:
        tslist = [line.rstrip() for line in f if line.rstrip().endswith('.ts?null=0')]
    if not tslist:
        print('No ts file found.')
        return -1
    print('Total ' + str(len(tslist)) + ' files')

    baseName = os.path.splitext(os.path.basename(m3u8))[0]
    videoName = os.path.join(OUTPUT_DIR, baseName + '.mp4')

    tsFiles = []
    failed = 0
    for index, tsUrl in enumerate(tslist, start=1):
        videoNameTmp = baseName + '_' + str(index) + '.ts'
        tsFile = os.path.join(OUTPUT_TMP, videoNameTmp)
        tsFiles.append(tsFile)
        if os.path.exists(tsFile):
            continue
        res = requests.get(tsUrl, stream=True, headers={'Referer': 'https://cnnios-f.akamaihd.net'})
        try:
            if res.status_code != 200:
                print('\nConnection error for url {}: {}'.format(tsUrl, res.status_code))
                # Keep going so the remaining segments are still fetched; a
                # later run then only has to retry the ones that failed.
                failed += 1
                continue
            # Write to a side file first: a download that dies half-way must
            # not leave a truncated segment that the next run would skip.
            partFile = tsFile + '.part'
            with open(partFile, 'wb') as f:
                for chunk in res.iter_content(chunk_size=64 * 1024):
                    f.write(chunk)
            os.replace(partFile, tsFile)
        finally:
            res.close()
        print(videoNameTmp + ' downloaded\r')

    if failed:
        print('Merge failed, missing files.')
        return -1

    with open(videoName, 'wb') as f:
        for tsFile in tsFiles:
            with open(tsFile, 'rb') as mergefile:
                shutil.copyfileobj(mergefile, f)
    # Delete the segments only once the whole merge has succeeded.
    for tsFile in tsFiles:
        os.remove(tsFile)
    print(videoName + ' merged.')
    return 0

def getM3u8(url):
    """Download the playlist at ``url`` into ``OUTPUT_DIR``.

    The local file name is the last path component of the URL. When the
    component before it ends with ``.mp4.csmil``, the text before that
    component's first dot is used instead, with an ``.m3u8`` extension.

    Args:
        url: URL of the m3u8 playlist.

    Returns:
        Path of the saved playlist, or None when the server does not answer
        with HTTP 200.
    """
    path_parts = urlparse(url).path.split('/')
    # A URL with a very short path has fewer than two components; fall back
    # to a generic name instead of failing on the unpacking.
    name = path_parts[-2] if len(path_parts) > 1 else ''
    m3u8Name = path_parts[-1] or 'index.m3u8'
    if name.endswith('.mp4.csmil'):
        m3u8Name = name.split('.')[0] + '.m3u8'
    m3u8Path = os.path.join(OUTPUT_DIR, m3u8Name)

    # One streamed request is enough: the playlist URL is used directly.
    res = requests.get(url, stream=True)
    try:
        if res.status_code != 200:
            return None
        with open(m3u8Path, 'wb') as f:
            for chunk in res.iter_content(chunk_size=64 * 1024):
                f.write(chunk)
    finally:
        res.close()
    print(m3u8Name + ' downloaded')
    return m3u8Path

def format_srt(srt_file):
    """Rewrite an SRT file so that each cue holds one complete sentence.

    Lines containing both ``[`` and ``]`` are dropped, ``{\\an8}`` tags are
    removed, multi-line cues are joined, and consecutive cues are merged
    until the text ends with ``.``, ``?`` or ``!``. A merged cue runs from
    the start of its first source cue to the end of its last one.

    The result is written next to the input, named after the part of the
    input's file name before its first dot plus ``.srt``
    (``dir/a.b_en.srt`` -> ``dir/a.srt``).

    Args:
        srt_file: Path of the UTF-8 encoded SRT file to reformat.
    """
    # Only the file name is cut at its first dot; dots in directory names
    # must not truncate the path.
    directory, filename = os.path.split(srt_file)
    output_srt_path = os.path.join(directory, filename.split('.')[0] + '.srt')

    # Pass 1: reduce the file to alternating "time range" / "text" entries.
    LineList = []
    timestamp_line = False
    with open(srt_file, 'r', encoding="utf-8") as srtfile:
        for line in srtfile:
            # Lines of at most four characters (newline included) are taken
            # to be blank separators or cue counters and are skipped.
            if len(line) <= 4:
                continue
            # Strip the positioning tag and move a closing quote in front of
            # the period so that sentence-end detection sees the period.
            line = line.replace('{\\an8}', '').replace('."', '".')
            # Drop bracketed (audio description) subtitles.
            if '[' in line and ']' in line:
                continue
            if '-->' in line:
                # The previous time range never received any text: drop it.
                if timestamp_line:
                    LineList.pop()
                timestamp_line = True
                LineList.append(line.rstrip())
            elif timestamp_line:
                timestamp_line = False
                LineList.append(line.rstrip())
            elif LineList:
                # Continuation of a cue that is broken over several lines.
                LineList[-1] += ' ' + line.strip()
            # Otherwise: text before the first time range has no timing
            # information and is ignored.
    # A trailing time range without text must not stretch the last cue.
    if timestamp_line:
        LineList.pop()

    # Pass 2: merge cues until the collected text forms a full sentence.
    cues = []
    clauses = []
    time_range_start = None
    time_range_end = None
    for line in LineList:
        if '-->' in line:
            start, end = [part.strip() for part in line.split('-->', 1)]
            if time_range_start is None:
                time_range_start = start
            time_range_end = end
            continue
        clauses.append(line)
        if line.endswith(('.', '?', '!')):
            cues.append((time_range_start, time_range_end, ' '.join(clauses)))
            clauses = []
            time_range_start = None
            time_range_end = None
    # Keep text left over at the end of the file that lacks closing
    # punctuation.
    if clauses:
        cues.append((time_range_start, time_range_end, ' '.join(clauses)))

    results = []
    for lineNum, (start, end, text) in enumerate(cues, start=1):
        results.extend((str(lineNum), start + ' --> ' + end, text, ''))

    with open(output_srt_path, 'w', encoding="utf-8") as srtfileout:
        srtfileout.write('\n'.join(results))

    print('Formatted srt file is saved to "{}"'.format(output_srt_path))

def m3u8url_to_vtturl(url):
    """Derive the subtitle (.vtt) URL that belongs to a playlist URL.

    Everything from the first comma on is discarded, the first four
    ``/``-separated pieces (scheme, empty piece, host, first path piece) are
    dropped, and the remainder is placed on ``pmd.cdn.turner.com`` with
    ``_en.vtt`` appended. Example:

    https://cnnios-f.akamaihd.net/i/cnn/big/cnn10/2020/06/17/caption/ten-0619.cnn_3282394_ios_,440,650,840,1240,3000,5500,.mp4.csmil/index_1_av.m3u8?null=0
    becomes
    https://pmd.cdn.turner.com/cnn/big/cnn10/2020/06/17/caption/ten-0619.cnn_3282394_ios__en.vtt
    """
    before_comma = url.partition(',')[0]
    pieces = before_comma.split('/', 4)
    # With fewer than five pieces nothing is left after the dropped prefix.
    remainder = pieces[4] if len(pieces) > 4 else ''
    return 'https://pmd.cdn.turner.com/' + remainder + '_en.vtt'

def getVTT(vttURL):
    """Download the subtitle file at ``vttURL`` into ``OUTPUT_DIR``.

    The file keeps the name of the URL's last ``/``-separated component.

    Returns:
        Path of the saved file, or None when the server does not answer with
        HTTP 200.
    """
    vttName = vttURL.rsplit('/', 1)[-1]
    destination = os.path.join(OUTPUT_DIR, vttName)
    response = requests.get(vttURL, stream=True)
    if response.status_code != 200:
        return None
    with open(destination, 'wb') as out:
        for piece in response:
            out.write(piece)
    print(vttName + ' downloaded')
    return destination

if __name__ == '__main__':
    # Command-line entry point: fetch the playlist, fetch and reformat the
    # English subtitles, then download and merge the video segments.
    parser = argparse.ArgumentParser()
    parser.add_argument('m3u8URL', help='the m3u8 URL')
    args = parser.parse_args()

    m3u8 = getM3u8(args.m3u8URL)
    if m3u8 is None:
        # Without the playlist there is nothing to download.
        sys.exit('Failed to download the m3u8 playlist from {}'.format(args.m3u8URL))

    vttURL = m3u8url_to_vtturl(args.m3u8URL)
    vtt = getVTT(vttURL)
    if vtt is None:
        # Missing subtitles should not prevent the video download.
        print('Failed to download subtitles from {}, continuing without them'.format(vttURL))
    else:
        # vtt_to_srt is expected to write the .srt next to the .vtt file.
        vtt_to_srt(vtt)
        srt_file = os.path.splitext(vtt)[0] + '.srt'
        format_srt(srt_file)

    if m3u8Downloader(m3u8) != 0:
        sys.exit(1)
	import os
	import sys
	import argparse
	import requests
	import shutil
	from bs4 import BeautifulSoup
	from urllib.parse import urlparse
	from vtt_to_srt import vtt_to_srt

	# NOTE(review): this region repeats the script above with its indentation
	# flattened; the block structure is restored here.
	# Downloads (playlist, subtitles, merged video) are written to OUTPUT_DIR;
	# the individual .ts segments are kept in OUTPUT_TMP until they are merged.
	OUTPUT_DIR = 'output'
	OUTPUT_TMP = os.path.join(OUTPUT_DIR, 'tmp')

	# Create both directories at import time, parent first. A path that already
	# exists is left alone.
	for _directory in (OUTPUT_DIR, OUTPUT_TMP):
	    try:
	        os.mkdir(_directory)
	    except FileExistsError:
	        continue
	del _directory

	def m3u8Downloader(m3u8):
	    """Download the ``.ts`` segments listed in a playlist and merge them.

	    Segments already present in ``OUTPUT_TMP`` are not downloaded again, so
	    an interrupted run can be resumed. The merged file is written to
	    ``OUTPUT_DIR`` under the playlist's base name with an ``.mp4`` extension;
	    the segments are concatenated byte-for-byte, not re-encoded.

	    Args:
	        m3u8: Path of a local playlist file. Only lines ending with
	            ``.ts?null=0`` are used, and each is passed to ``requests.get``
	            unchanged, so they must be complete URLs.

	    Returns:
	        0 when every segment was available and merged, -1 when the playlist
	        lists no segments or at least one segment could not be downloaded.
	    """
	    with open(m3u8, 'r') as f:
	        tslist = [line.rstrip() for line in f if line.rstrip().endswith('.ts?null=0')]
	    if not tslist:
	        print('No ts file found.')
	        return -1
	    print('Total ' + str(len(tslist)) + ' files')

	    baseName = os.path.splitext(os.path.basename(m3u8))[0]
	    videoName = os.path.join(OUTPUT_DIR, baseName + '.mp4')

	    tsFiles = []
	    failed = 0
	    for index, tsUrl in enumerate(tslist, start=1):
	        videoNameTmp = baseName + '_' + str(index) + '.ts'
	        tsFile = os.path.join(OUTPUT_TMP, videoNameTmp)
	        tsFiles.append(tsFile)
	        if os.path.exists(tsFile):
	            continue
	        res = requests.get(tsUrl, stream=True, headers={'Referer': 'https://cnnios-f.akamaihd.net'})
	        try:
	            if res.status_code != 200:
	                print('\nConnection error for url {}: {}'.format(tsUrl, res.status_code))
	                # Keep going so the remaining segments are still fetched; a
	                # later run then only has to retry the ones that failed.
	                failed += 1
	                continue
	            # Write to a side file first: a download that dies half-way must
	            # not leave a truncated segment that the next run would skip.
	            partFile = tsFile + '.part'
	            with open(partFile, 'wb') as f:
	                for chunk in res.iter_content(chunk_size=64 * 1024):
	                    f.write(chunk)
	            os.replace(partFile, tsFile)
	        finally:
	            res.close()
	        print(videoNameTmp + ' downloaded\r')

	    if failed:
	        print('Merge failed, missing files.')
	        return -1

	    with open(videoName, 'wb') as f:
	        for tsFile in tsFiles:
	            with open(tsFile, 'rb') as mergefile:
	                shutil.copyfileobj(mergefile, f)
	    # Delete the segments only once the whole merge has succeeded.
	    for tsFile in tsFiles:
	        os.remove(tsFile)
	    print(videoName + ' merged.')
	    return 0

	def getM3u8(url):
	    """Download the playlist at ``url`` into ``OUTPUT_DIR``.

	    The local file name is the last path component of the URL. When the
	    component before it ends with ``.mp4.csmil``, the text before that
	    component's first dot is used instead, with an ``.m3u8`` extension.

	    Args:
	        url: URL of the m3u8 playlist.

	    Returns:
	        Path of the saved playlist, or None when the server does not answer
	        with HTTP 200.
	    """
	    path_parts = urlparse(url).path.split('/')
	    # A URL with a very short path has fewer than two components; fall back
	    # to a generic name instead of failing on the unpacking.
	    name = path_parts[-2] if len(path_parts) > 1 else ''
	    m3u8Name = path_parts[-1] or 'index.m3u8'
	    if name.endswith('.mp4.csmil'):
	        m3u8Name = name.split('.')[0] + '.m3u8'
	    m3u8Path = os.path.join(OUTPUT_DIR, m3u8Name)

	    # One streamed request is enough: the playlist URL is used directly.
	    res = requests.get(url, stream=True)
	    try:
	        if res.status_code != 200:
	            return None
	        with open(m3u8Path, 'wb') as f:
	            for chunk in res.iter_content(chunk_size=64 * 1024):
	                f.write(chunk)
	    finally:
	        res.close()
	    print(m3u8Name + ' downloaded')
	    return m3u8Path

	def format_srt(srt_file):
	    """Rewrite an SRT file so that each cue holds one complete sentence.

	    Lines containing both ``[`` and ``]`` are dropped, ``{\\an8}`` tags are
	    removed, multi-line cues are joined, and consecutive cues are merged
	    until the text ends with ``.``, ``?`` or ``!``. A merged cue runs from
	    the start of its first source cue to the end of its last one.

	    The result is written next to the input, named after the part of the
	    input's file name before its first dot plus ``.srt``
	    (``dir/a.b_en.srt`` -> ``dir/a.srt``).

	    Args:
	        srt_file: Path of the UTF-8 encoded SRT file to reformat.
	    """
	    # Only the file name is cut at its first dot; dots in directory names
	    # must not truncate the path.
	    directory, filename = os.path.split(srt_file)
	    output_srt_path = os.path.join(directory, filename.split('.')[0] + '.srt')

	    # Pass 1: reduce the file to alternating "time range" / "text" entries.
	    LineList = []
	    timestamp_line = False
	    with open(srt_file, 'r', encoding="utf-8") as srtfile:
	        for line in srtfile:
	            # Lines of at most four characters (newline included) are taken
	            # to be blank separators or cue counters and are skipped.
	            if len(line) <= 4:
	                continue
	            # Strip the positioning tag and move a closing quote in front of
	            # the period so that sentence-end detection sees the period.
	            line = line.replace('{\\an8}', '').replace('."', '".')
	            # Drop bracketed (audio description) subtitles.
	            if '[' in line and ']' in line:
	                continue
	            if '-->' in line:
	                # The previous time range never received any text: drop it.
	                if timestamp_line:
	                    LineList.pop()
	                timestamp_line = True
	                LineList.append(line.rstrip())
	            elif timestamp_line:
	                timestamp_line = False
	                LineList.append(line.rstrip())
	            elif LineList:
	                # Continuation of a cue that is broken over several lines.
	                LineList[-1] += ' ' + line.strip()
	            # Otherwise: text before the first time range has no timing
	            # information and is ignored.
	    # A trailing time range without text must not stretch the last cue.
	    if timestamp_line:
	        LineList.pop()

	    # Pass 2: merge cues until the collected text forms a full sentence.
	    cues = []
	    clauses = []
	    time_range_start = None
	    time_range_end = None
	    for line in LineList:
	        if '-->' in line:
	            start, end = [part.strip() for part in line.split('-->', 1)]
	            if time_range_start is None:
	                time_range_start = start
	            time_range_end = end
	            continue
	        clauses.append(line)
	        if line.endswith(('.', '?', '!')):
	            cues.append((time_range_start, time_range_end, ' '.join(clauses)))
	            clauses = []
	            time_range_start = None
	            time_range_end = None
	    # Keep text left over at the end of the file that lacks closing
	    # punctuation.
	    if clauses:
	        cues.append((time_range_start, time_range_end, ' '.join(clauses)))

	    results = []
	    for lineNum, (start, end, text) in enumerate(cues, start=1):
	        results.extend((str(lineNum), start + ' --> ' + end, text, ''))

	    with open(output_srt_path, 'w', encoding="utf-8") as srtfileout:
	        srtfileout.write('\n'.join(results))

	    print('Formatted srt file is saved to "{}"'.format(output_srt_path))

	def m3u8url_to_vtturl(url):
	    """Derive the subtitle (.vtt) URL that belongs to a playlist URL.

	    Everything from the first comma on is discarded, the first four
	    ``/``-separated pieces (scheme, empty piece, host, first path piece) are
	    dropped, and the remainder is placed on ``pmd.cdn.turner.com`` with
	    ``_en.vtt`` appended. Example:

	    https://cnnios-f.akamaihd.net/i/cnn/big/cnn10/2020/06/17/caption/ten-0619.cnn_3282394_ios_,440,650,840,1240,3000,5500,.mp4.csmil/index_1_av.m3u8?null=0
	    becomes
	    https://pmd.cdn.turner.com/cnn/big/cnn10/2020/06/17/caption/ten-0619.cnn_3282394_ios__en.vtt
	    """
	    before_comma = url.partition(',')[0]
	    pieces = before_comma.split('/', 4)
	    # With fewer than five pieces nothing is left after the dropped prefix.
	    remainder = pieces[4] if len(pieces) > 4 else ''
	    return 'https://pmd.cdn.turner.com/' + remainder + '_en.vtt'

	def getVTT(vttURL):
	    """Download the subtitle file at ``vttURL`` into ``OUTPUT_DIR``.

	    The file keeps the name of the URL's last ``/``-separated component.

	    Returns:
	        Path of the saved file, or None when the server does not answer with
	        HTTP 200.
	    """
	    vttName = vttURL.rsplit('/', 1)[-1]
	    destination = os.path.join(OUTPUT_DIR, vttName)
	    response = requests.get(vttURL, stream=True)
	    if response.status_code != 200:
	        return None
	    with open(destination, 'wb') as out:
	        for piece in response:
	            out.write(piece)
	    print(vttName + ' downloaded')
	    return destination

	if __name__ == '__main__':
	    # Command-line entry point: fetch the playlist, fetch and reformat the
	    # English subtitles, then download and merge the video segments.
	    parser = argparse.ArgumentParser()
	    parser.add_argument('m3u8URL', help='the m3u8 URL')
	    args = parser.parse_args()

	    m3u8 = getM3u8(args.m3u8URL)
	    if m3u8 is None:
	        # Without the playlist there is nothing to download.
	        sys.exit('Failed to download the m3u8 playlist from {}'.format(args.m3u8URL))

	    vttURL = m3u8url_to_vtturl(args.m3u8URL)
	    vtt = getVTT(vttURL)
	    if vtt is None:
	        # Missing subtitles should not prevent the video download.
	        print('Failed to download subtitles from {}, continuing without them'.format(vttURL))
	    else:
	        # vtt_to_srt is expected to write the .srt next to the .vtt file.
	        vtt_to_srt(vtt)
	        srt_file = os.path.splitext(vtt)[0] + '.srt'
	        format_srt(srt_file)

	    if m3u8Downloader(m3u8) != 0:
	        sys.exit(1)