# ashutoshbsathe/download_videos_from_table.py

## download_videos_from_table.py
# Author : Ashutosh Sathe (2019)
from bs4 import BeautifulSoup
from tqdm import tqdm
import os
import requests

# --- Configuration ---------------------------------------------------------
# Path of the saved HTML page that holds the table of videos to download.
fname = 'table.html'
# Site root that proc_href() prepends to site-relative hrefs.
dl_prefix = 'https://archive.nptel.ac.in'
title_idx = -2 # Second last cell text should be title of the downloaded video
dl_link_idx = -1 # Last cell contains the href to the video
# Directory the downloaded files are written into.
save_dir = './videos/'

# Create the output directory up front; exist_ok=True keeps re-runs from failing.
os.makedirs(save_dir, exist_ok=True)

def proc_href(href: str, prefix: 'str | None' = None):
    """Turn an href taken from the table into an absolute download URL.

    Args:
        href: Raw ``href`` attribute value of a link.
        prefix: Site root used to resolve relative hrefs. Defaults to the
            module-level ``dl_prefix``.

    Returns:
        ``href`` (stripped of surrounding whitespace) when it already carries
        a scheme; otherwise an absolute URL built from ``prefix``.
    """
    # Function-scope import so this block does not depend on the file header.
    from urllib.parse import urlsplit

    base = dl_prefix if prefix is None else prefix
    href = href.strip()

    if urlsplit(href).scheme:
        # Already absolute (http:, https:, ...): nothing to resolve.
        return href
    if href.startswith('//'):
        # Protocol-relative link: it names its own host and only lacks a
        # scheme, so it must not be glued onto the site root.
        return (urlsplit(base).scheme or 'https') + ':' + href
    if href.startswith('/'):
        # Site-relative link: append to the root without doubling the slash.
        return base.rstrip('/') + href
    # Plain relative link ('content/a.mp4'): resolve against the site root.
    return base.rstrip('/') + '/' + href

def download_file(url: str, fname: str, chunk_size: int = 1024, timeout: float = 30):
    """Stream ``url`` into the local file ``fname`` with a tqdm progress bar.

    Args:
        url: Address of the file to fetch.
        fname: Path of the file to write; it is opened in ``'wb'`` mode, so an
            existing file is overwritten. Also used as the progress bar label.
        chunk_size: Number of bytes requested from the response per iteration.
        timeout: Value passed to ``requests.get`` as ``timeout`` so that a
            stalled server cannot hang the script indefinitely.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # https://gist.github.com/yanqd0/c13ed29e29432e3cf3e7c38467f42f51
    # Using the response as a context manager releases the connection even
    # when the download is interrupted part-way.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail before the output file is opened, so an HTML error page is
        # never saved under the video's name.
        response.raise_for_status()
        # A missing Content-Length header results in a total of 0.
        total_size = int(response.headers.get('content-length', 0))
        with open(fname, 'wb') as f, tqdm(
            desc=fname,
            total=total_size,
            unit='iB',
            unit_scale=True,
            # Binary (KiB/MiB) scaling is always 1024; it must not follow
            # whatever chunk size the caller picked.
            unit_divisor=1024,
        ) as bar:
            for data in response.iter_content(chunk_size=chunk_size):
                size = f.write(data)
                bar.update(size)

# --- Parse the saved table ---------------------------------------------------
# The context manager closes the file handle as soon as the page is read.
with open(fname) as table_file:
    html = table_file.read()
soup = BeautifulSoup(html, 'html.parser')

# soup.find() returns None when the saved markup has no explicit <tbody>;
# in that case look for rows anywhere in the document instead of crashing.
table_body = soup.find('tbody')
if table_body is None:
    table_body = soup
rows = table_body.find_all('tr')
download_info = []

for row in rows:
    cells = row.find_all('td')
    try:
        title_cell = cells[title_idx]
        link_cell = cells[dl_link_idx]
    except IndexError:
        # Header rows (<th> only) or malformed rows have too few <td> cells.
        continue
    anchor = link_cell.a
    if anchor is None or not anchor.get('href'):
        # No download link in this row: nothing to fetch.
        continue
    href = proc_href(anchor['href'])
    # Derive the extension from the path only, so a query string or fragment
    # ('video.mp4?dl=1') does not end up in the file name.
    path_only = href.split('#', 1)[0].split('?', 1)[0]
    ext = os.path.splitext(path_only)[1].lstrip('.')
    download_info.append(dict(fname=title_cell.get_text().strip(), url=href, ext=ext))

# --- Download every collected entry -----------------------------------------
for download in download_info:
    # Titles are free text; path separators inside them would otherwise be
    # interpreted as (non-existent) sub-directories of save_dir.
    safe_title = download['fname'].replace('/', '_').replace('\\', '_')
    file_name = safe_title + ('.' + download['ext'] if download['ext'] else '')
    # A dedicated name keeps the module-level `fname` (the table path) intact.
    target_path = os.path.join(save_dir, file_name)
    download_file(download['url'], target_path)
	# Author : Ashutosh Sathe (2019)
	from bs4 import BeautifulSoup
	from tqdm import tqdm
	import os
	import requests

	# --- Configuration ---------------------------------------------------------
	# Path of the saved HTML page that holds the table of videos to download.
	fname = 'table.html'
	# Site root that proc_href() prepends to site-relative hrefs.
	dl_prefix = 'https://archive.nptel.ac.in'
	title_idx = -2 # Second last cell text should be title of the downloaded video
	dl_link_idx = -1 # Last cell contains the href to the video
	# Directory the downloaded files are written into.
	save_dir = './videos/'

	# Create the output directory up front; exist_ok=True keeps re-runs from failing.
	os.makedirs(save_dir, exist_ok=True)

	def proc_href(href: str, prefix: 'str | None' = None):
	    """Turn an href taken from the table into an absolute download URL.

	    Args:
	        href: Raw ``href`` attribute value of a link.
	        prefix: Site root used to resolve relative hrefs. Defaults to the
	            module-level ``dl_prefix``.

	    Returns:
	        ``href`` (stripped of surrounding whitespace) when it already
	        carries a scheme; otherwise an absolute URL built from ``prefix``.
	    """
	    # Function-scope import so this block does not depend on the file header.
	    from urllib.parse import urlsplit

	    base = dl_prefix if prefix is None else prefix
	    href = href.strip()

	    if urlsplit(href).scheme:
	        # Already absolute (http:, https:, ...): nothing to resolve.
	        return href
	    if href.startswith('//'):
	        # Protocol-relative link: it names its own host and only lacks a
	        # scheme, so it must not be glued onto the site root.
	        return (urlsplit(base).scheme or 'https') + ':' + href
	    if href.startswith('/'):
	        # Site-relative link: append to the root without doubling the slash.
	        return base.rstrip('/') + href
	    # Plain relative link ('content/a.mp4'): resolve against the site root.
	    return base.rstrip('/') + '/' + href

	def download_file(url: str, fname: str, chunk_size: int = 1024, timeout: float = 30):
	    """Stream ``url`` into the local file ``fname`` with a tqdm progress bar.

	    Args:
	        url: Address of the file to fetch.
	        fname: Path of the file to write; it is opened in ``'wb'`` mode, so
	            an existing file is overwritten. Also used as the bar label.
	        chunk_size: Number of bytes requested from the response per iteration.
	        timeout: Value passed to ``requests.get`` as ``timeout`` so that a
	            stalled server cannot hang the script indefinitely.

	    Raises:
	        requests.HTTPError: If the server answers with an error status.
	    """
	    # https://gist.github.com/yanqd0/c13ed29e29432e3cf3e7c38467f42f51
	    # Using the response as a context manager releases the connection even
	    # when the download is interrupted part-way.
	    with requests.get(url, stream=True, timeout=timeout) as response:
	        # Fail before the output file is opened, so an HTML error page is
	        # never saved under the video's name.
	        response.raise_for_status()
	        # A missing Content-Length header results in a total of 0.
	        total_size = int(response.headers.get('content-length', 0))
	        with open(fname, 'wb') as f, tqdm(
	            desc=fname,
	            total=total_size,
	            unit='iB',
	            unit_scale=True,
	            # Binary (KiB/MiB) scaling is always 1024; it must not follow
	            # whatever chunk size the caller picked.
	            unit_divisor=1024,
	        ) as bar:
	            for data in response.iter_content(chunk_size=chunk_size):
	                size = f.write(data)
	                bar.update(size)

	# --- Parse the saved table ---------------------------------------------------
	# The context manager closes the file handle as soon as the page is read.
	with open(fname) as table_file:
	    html = table_file.read()
	soup = BeautifulSoup(html, 'html.parser')

	# soup.find() returns None when the saved markup has no explicit <tbody>;
	# in that case look for rows anywhere in the document instead of crashing.
	table_body = soup.find('tbody')
	if table_body is None:
	    table_body = soup
	rows = table_body.find_all('tr')
	download_info = []

	for row in rows:
	    cells = row.find_all('td')
	    try:
	        title_cell = cells[title_idx]
	        link_cell = cells[dl_link_idx]
	    except IndexError:
	        # Header rows (<th> only) or malformed rows have too few <td> cells.
	        continue
	    anchor = link_cell.a
	    if anchor is None or not anchor.get('href'):
	        # No download link in this row: nothing to fetch.
	        continue
	    href = proc_href(anchor['href'])
	    # Derive the extension from the path only, so a query string or
	    # fragment ('video.mp4?dl=1') does not end up in the file name.
	    path_only = href.split('#', 1)[0].split('?', 1)[0]
	    ext = os.path.splitext(path_only)[1].lstrip('.')
	    download_info.append(dict(fname=title_cell.get_text().strip(), url=href, ext=ext))

	# --- Download every collected entry -----------------------------------------
	for download in download_info:
	    # Titles are free text; path separators inside them would otherwise be
	    # interpreted as (non-existent) sub-directories of save_dir.
	    safe_title = download['fname'].replace('/', '_').replace('\\', '_')
	    file_name = safe_title + ('.' + download['ext'] if download['ext'] else '')
	    # A dedicated name keeps the module-level `fname` (the table path) intact.
	    target_path = os.path.join(save_dir, file_name)
	    download_file(download['url'], target_path)