Skip to content

Instantly share code, notes, and snippets.

@pansila
Last active May 22, 2021 08:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save pansila/0868305f0dbf6ba817264e5bfea0e0aa to your computer and use it in GitHub Desktop.
Save pansila/0868305f0dbf6ba817264e5bfea0e0aa to your computer and use it in GitHub Desktop.
Download the SRT subtitles and the video file from the official CNN website, for use with subs2srs to make Anki cards.
import os
import sys
import argparse
import requests
import shutil
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from vtt_to_srt import vtt_to_srt
# Downloaded playlists, subtitles and merged videos are written to OUTPUT_DIR;
# individual video segments are staged in OUTPUT_TMP until they are merged.
OUTPUT_DIR = 'output'
OUTPUT_TMP = os.path.join(OUTPUT_DIR, 'tmp')

# Create both folders at import time (parent first); folders that already
# exist are left untouched.
for _folder in (OUTPUT_DIR, OUTPUT_TMP):
    try:
        os.mkdir(_folder)
    except FileExistsError:
        pass
def m3u8Downloader(m3u8):
    """Download every segment listed in a playlist file and merge them.

    Args:
        m3u8: Path to a local playlist file. Only lines that end in
            ``.ts?null=0`` are treated as segment URLs.

    Returns:
        0 when every segment is available and has been concatenated into
        ``OUTPUT_DIR/<playlist base name>.mp4``; -1 when the playlist lists
        no segments or at least one segment could not be downloaded.
    """
    with open(m3u8, 'r') as f:
        tslist = [line.rstrip() for line in f if line.rstrip().endswith('.ts?null=0')]
    if not tslist:
        print('No ts file found.')
        return -1
    print('Total '+ str(len(tslist)) +' files')

    file, _ = os.path.splitext(os.path.basename(m3u8))
    videoName = os.path.join(OUTPUT_DIR, file + '.mp4')
    tsNames = []
    failed = 0
    for index, tsUrl in enumerate(tslist, start=1):
        videoNameTmp = file + '_' + str(index) + '.ts'
        tsFile = os.path.join(OUTPUT_TMP, videoNameTmp)
        # A segment file left over from an earlier run is reused as-is.
        if not os.path.exists(tsFile):
            res = requests.get(tsUrl, stream=True, headers={'Referer':'https://cnnios-f.akamaihd.net'})
            if res.status_code != 200:
                print('\nConnection error for url {}: {}'.format(tsUrl, res.status_code))
                # Keep fetching the remaining segments so that a rerun only
                # has to retry the ones that failed, but remember the gap.
                failed += 1
                continue
            with open(tsFile, 'wb') as f:
                for chunk in res:
                    f.write(chunk)
            print(videoNameTmp + ' downloaded\r')
        tsNames.append(videoNameTmp)

    # Merging an incomplete set would either crash on the missing file or
    # produce a video with holes, so refuse instead.
    if failed:
        print('Merge failed, missing files.')
        return -1

    with open(videoName, 'wb') as f:
        for ts in tsNames:
            tsFile = os.path.join(OUTPUT_TMP, ts)
            with open(tsFile, 'rb') as mergefile:
                shutil.copyfileobj(mergefile, f)
            # The staged segment is no longer needed once it is in the output.
            os.remove(tsFile)
    print(videoName + ' merged.')
    return 0
def getM3u8(url):
    """Download the playlist at ``url`` into OUTPUT_DIR.

    The local file name is the last component of the URL path. When the
    component before it ends in ``.mp4.csmil``, the name is instead that
    component's text up to its first dot, plus ``.m3u8``.

    Args:
        url: URL of the playlist to fetch.

    Returns:
        The path of the saved playlist file, or None when no file name can
        be derived from the URL or the server does not answer with 200.
    """
    parsed_tuple = urlparse(url)
    parts = parsed_tuple.path.split('/')
    m3u8Name = parts[-1]
    # A URL without a path yields a single (empty) component.
    name = parts[-2] if len(parts) > 1 else ''
    if name.endswith('.mp4.csmil'):
        m3u8Name = name.split('.')[0] + '.m3u8'
    if not m3u8Name:
        print('Cannot derive a playlist file name from url {}'.format(url))
        return None
    m3u8Path = os.path.join(OUTPUT_DIR, m3u8Name)

    # One streamed request is enough: the URL itself is the playlist.
    res = requests.get(url, stream=True)
    if res.status_code != 200:
        print('Connection error for url {}: {}'.format(url, res.status_code))
        return None
    with open(m3u8Path, 'wb') as f:
        for chunk in res:
            f.write(chunk)
    print(m3u8Name + ' downloaded')
    return m3u8Path
def format_srt(srt_file):
    """Regroup the cues of an .srt file so that each cue is one sentence.

    Consecutive cues are joined until a line ends in ``.``, ``?`` or ``!``;
    the joined cue spans from the start time of its first part to the end
    time of its last part. Lines containing both ``[`` and ``]`` are
    dropped, as are time ranges that end up with no text. Text left over at
    the end of the file without closing punctuation is written as a final
    cue.

    The result is written next to the input, named after the part of the
    input's file name before its first dot, with a ``.srt`` extension.

    Args:
        srt_file: Path of the UTF-8 encoded .srt file to read.
    """
    # Only the base name is cut at its first dot, so dots in directory names
    # do not truncate the output path.
    directory, base = os.path.split(srt_file)
    output_srt_path = os.path.join(directory, base.split('.')[0] + '.srt')

    # Pass 1: reduce the file to alternating "time range" / "text" entries.
    LineList = []
    timestamp_line = False
    with open(srt_file, 'r', encoding="utf-8") as srtfile:
        for line in srtfile:
            # Very short lines are skipped (presumably blank separators and
            # cue numbers).
            if len(line) <= 4:
                continue
            # Strip the {\an8} positioning tag and move a closing quote in
            # front of the full stop so the sentence check below sees '.'.
            line = line.replace('{\\an8}', '').replace('."', '".')
            # remove audio only subtitles
            if '[' in line and ']' in line:
                continue
            if '-->' in line:
                # remove time range only subtitles
                if timestamp_line:
                    LineList.pop(-1)
                timestamp_line = True
            elif timestamp_line:
                # first text line of a cue
                timestamp_line = False
            else:
                # merge lines that are broken up into multilines; text that
                # appears before any time range has nothing to attach to
                if LineList:
                    LineList[-1] += ' ' + line.strip()
                continue
            LineList.append(line.rstrip())

    # Pass 2: accumulate text until a sentence ends, widening the time range.
    cues = []
    clauses = []
    time_range_start = None
    time_range_end = None
    for line in LineList:
        # Same marker as in pass 1, so both passes agree on what a time
        # range line is.
        if '-->' in line:
            start, end = (part.strip() for part in line.split('-->', 1))
            if time_range_start is None:
                time_range_start = start
            time_range_end = end
            continue
        clauses.append(line)
        if line.endswith(('.', '?', '!')):
            cues.append((time_range_start, time_range_end, ' '.join(clauses)))
            clauses = []
            time_range_start = None
            time_range_end = None
    # Keep trailing text that never reached closing punctuation.
    if clauses:
        cues.append((time_range_start, time_range_end, ' '.join(clauses)))

    results = []
    for lineNum, (start, end, text) in enumerate(cues, start=1):
        results.extend((str(lineNum), start + ' --> ' + end, text, ''))

    with open(output_srt_path, 'w', encoding="utf-8") as srtfileout:
        srtfileout.write('\n'.join(results))
    print('Formatted srt file is saved to "{}"'.format(output_srt_path))
def m3u8url_to_vtturl(url):
    """Build the caption (.vtt) URL that corresponds to a playlist URL.

    Everything from the first comma onward is discarded, the first four
    '/'-separated pieces of what remains are dropped, and the rest is
    re-rooted on pmd.cdn.turner.com with ``_en.vtt`` appended.

    Example input:
    https://cnnios-f.akamaihd.net/i/cnn/big/cnn10/2020/06/17/caption/ten-0619.cnn_3282394_ios_,440,650,840,1240,3000,5500,.mp4.csmil/index_1_av.m3u8?null=0
    Example output:
    https://pmd.cdn.turner.com/cnn/big/cnn10/2020/06/17/caption/ten-0619.cnn_3282394_ios__en.vtt
    """
    before_comma = url.partition(',')[0]
    kept_pieces = before_comma.split('/')[4:]
    return 'https://pmd.cdn.turner.com/{}_en.vtt'.format('/'.join(kept_pieces))
def getVTT(vttURL):
    """Download the caption file at ``vttURL`` into OUTPUT_DIR.

    The local file keeps the last '/'-separated component of the URL as its
    name.

    Returns:
        The path of the saved file, or None when the server does not answer
        with status 200.
    """
    file_name = vttURL.rsplit('/', 1)[-1]
    destination = os.path.join(OUTPUT_DIR, file_name)
    response = requests.get(vttURL, stream=True)
    if response.status_code != 200:
        return None
    with open(destination, 'wb') as out:
        for piece in response:
            out.write(piece)
    print(file_name + ' downloaded')
    return destination
if __name__ == '__main__':
    # Command line entry point: fetch the playlist and captions for one
    # playlist URL, convert the captions to a sentence-per-cue .srt file, then
    # download and merge the video.
    parser = argparse.ArgumentParser()
    parser.add_argument('m3u8URL', help='the m3u8 URL')
    args = parser.parse_args()

    failed = False
    m3u8 = getM3u8(args.m3u8URL)
    vttURL = m3u8url_to_vtturl(args.m3u8URL)
    vtt = getVTT(vttURL)

    # Both download helpers return None on failure; handle that here instead
    # of passing None on to the next step.
    if vtt is None:
        print('Failed to download subtitles from {}'.format(vttURL), file=sys.stderr)
        failed = True
    else:
        vtt_to_srt(vtt)
        srt_file = os.path.splitext(vtt)[0] + '.srt'
        format_srt(srt_file)

    if m3u8 is None:
        print('Failed to download playlist from {}'.format(args.m3u8URL), file=sys.stderr)
        failed = True
    elif m3u8Downloader(m3u8) != 0:
        failed = True

    # Report any failure through the process exit status.
    if failed:
        sys.exit(1)
@pansila
Copy link
Author

pansila commented Jun 30, 2020

Prerequisites:

  1. python3

  2. install dependencies

pip install requests
pip install beautifulsoup4
pip install vtt_to_srt3

Usage:

python cnn_10_maker.py https://cnnios-f.akamaihd.net/i/cnn/big/cnn10/2020/06/17/caption/ten-0619.cnn_3282394_ios_,440,650,840,1240,3000,5500,.mp4.csmil/index_1_av.m3u8?null=0

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment