Skip to content

Instantly share code, notes, and snippets.

@ashutoshbsathe
Created August 4, 2022 06:14
Show Gist options
  • Save ashutoshbsathe/aaafb80f62d098b1ae9aba6eecae0c21 to your computer and use it in GitHub Desktop.
Save ashutoshbsathe/aaafb80f62d098b1ae9aba6eecae0c21 to your computer and use it in GitHub Desktop.
Download the videos linked from an HTML table into a directory. Uses `bs4` for parsing the HTML, `requests` for downloading, and `tqdm` for the progress bar.
# Author : Ashutosh Sathe (2019)
from bs4 import BeautifulSoup
from tqdm import tqdm
import os
import requests
# Path of the saved HTML page that contains the table of videos to download.
fname = 'table.html'
# Site root that proc_href() prepends to hrefs starting with '/'.
dl_prefix = 'https://archive.nptel.ac.in'
title_idx = -2 # Second-to-last cell of a row: its text is used as the title of the downloaded video
dl_link_idx = -1 # Last cell of a row: contains the <a> whose href points to the video
# Directory the downloaded files are written to; created up front if missing.
save_dir = './videos/'
os.makedirs(save_dir, exist_ok=True)
def proc_href(href: str, prefix: 'str | None' = None):
    """Turn a root-relative href into an absolute download URL.

    Args:
        href: Value of an ``href`` attribute taken from the table.
        prefix: Site root to prepend to hrefs that start with ``/``.
            Defaults to the module-level ``dl_prefix``.

    Returns:
        ``prefix + href`` when ``href`` starts with ``/``; otherwise ``href``
        unchanged (it is assumed to be a complete URL already).
    """
    if not href.startswith('/'):
        return href
    if prefix is None:
        # Looked up lazily so the function works with an explicit prefix
        # even where the module-level default is not defined.
        prefix = dl_prefix
    # Drop a trailing slash on the prefix so the join never yields '//'.
    return prefix.rstrip('/') + href
def download_file(url: str, fname: str, chunk_size: int = 1024, timeout: float = 30.0):
    """Stream ``url`` into the local file ``fname`` while showing a progress bar.

    Adapted from https://gist.github.com/yanqd0/c13ed29e29432e3cf3e7c38467f42f51

    Args:
        url: Address of the file to download.
        fname: Destination path; the file is created or overwritten.
        chunk_size: Number of bytes requested per read from the response.
        timeout: Seconds to wait for the connection and for each read
            before giving up.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        OSError: If ``fname`` cannot be opened for writing.
    """
    # The context manager releases the connection even if writing fails.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail before touching the disk so an HTML error page is never
        # saved under the video's file name.
        response.raise_for_status()
        # Without a Content-Length header the size is unknown; passing None
        # lets tqdm show a running byte count instead of a total of 0.
        total_size = int(response.headers.get('content-length', 0)) or None
        with open(fname, 'wb') as f, tqdm(
            desc=fname,
            total=total_size,
            unit='iB',
            unit_scale=True,
            # 'iB' units are binary: scale by 1024 regardless of chunk_size.
            unit_divisor=1024,
        ) as bar:
            for data in response.iter_content(chunk_size=chunk_size):
                size = f.write(data)
                bar.update(size)
# --- Parse the saved table and collect one download entry per usable row ---

# Read raw bytes (and close the file) so Beautiful Soup detects the document's
# encoding itself instead of relying on the platform's default text encoding.
with open(fname, 'rb') as html_file:
    html = html_file.read()
soup = BeautifulSoup(html, 'html.parser')

# Fall back to searching the whole document when the markup has no explicit
# <tbody> element, instead of failing on None.
table_body = soup.find('tbody')
rows = (table_body if table_body is not None else soup).find_all('tr')

download_info = []
for row in rows:
    cells = row.find_all('td')
    try:
        title_cell, link_cell = cells[title_idx], cells[dl_link_idx]
    except IndexError:
        # Rows without enough <td> cells (e.g. header rows) carry no video.
        continue
    link = link_cell.a
    if link is None or not link.get('href'):
        # No usable link in this row; nothing to download.
        continue
    href = proc_href(link['href'])
    # Take the extension from the last path segment only, ignoring any query
    # string or fragment, so it can never contain '/' or '?'.
    last_segment = href.split('?', 1)[0].split('#', 1)[0].rsplit('/', 1)[-1]
    ext = os.path.splitext(last_segment)[1].lstrip('.')
    download_info.append(dict(fname=title_cell.get_text().strip(), url=href, ext=ext))

# --- Download every collected entry into save_dir ---

# Characters that are path separators or are rejected by common file systems.
unsafe_chars = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})
for download in download_info:
    # Collapse internal whitespace/newlines and neutralise unsafe characters
    # so the table title is always a valid single file name.
    safe_title = ' '.join(download['fname'].split()).translate(unsafe_chars)
    file_name = safe_title + ('.' + download['ext'] if download['ext'] else '')
    out_path = os.path.join(save_dir, file_name)
    url = download['url']
    try:
        download_file(url, out_path)
    except (requests.RequestException, OSError) as err:
        # Report the failure and keep going: one bad link should not abort
        # the remaining downloads.
        print('Failed to download {}: {}'.format(url, err))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment