Created
August 4, 2022 06:14
-
-
Save ashutoshbsathe/aaafb80f62d098b1ae9aba6eecae0c21 to your computer and use it in GitHub Desktop.
Download the videos linked from an HTML table into a directory. Uses `bs4` for parsing HTML, `requests` for downloading, and `tqdm` for the progress bar.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Author: Ashutosh Sathe (2019)
import os
from urllib.parse import urlsplit

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
# ---- Configuration -------------------------------------------------------
fname = 'table.html'  # Saved HTML page that contains the table of videos
dl_prefix = 'https://archive.nptel.ac.in'  # Prepended to hrefs that start with '/'
title_idx = -2  # Second last cell text should be title of the downloaded video
dl_link_idx = -1  # Last cell contains the href to the video
save_dir = './videos/'  # Directory the downloaded videos are written to
# exist_ok=True lets the script be re-run without failing on an existing directory.
os.makedirs(save_dir, exist_ok=True)
def proc_href(href: str, prefix=None):
    """Return an absolute download URL for ``href``.

    Args:
        href: Value of a link's ``href`` attribute.
        prefix: Site origin to put in front of site-relative links. When
            omitted, the module-level ``dl_prefix`` is used.

    Returns:
        ``href`` unchanged unless it starts with ``/``. A single leading
        slash yields ``prefix + href``; a double leading slash
        (protocol-relative link) only receives the scheme of ``prefix``.
    """
    base = dl_prefix if prefix is None else prefix
    if href.startswith('//'):
        # A protocol-relative link already names its host; gluing the whole
        # prefix in front of it would produce a malformed URL.
        scheme = base.partition('://')[0] if '://' in base else 'https'
        return scheme + ':' + href
    if href.startswith('/'):
        return base + href
    return href
def download_file(url: str, fname: str, chunk_size: int = 1024):
    """Stream ``url`` into the local file ``fname`` with a progress bar.

    Adapted from
    https://gist.github.com/yanqd0/c13ed29e29432e3cf3e7c38467f42f51

    Args:
        url: Address fetched with an HTTP GET request.
        fname: Path of the file to write; also used as the progress-bar label.
        chunk_size: Number of bytes requested from the response per iteration.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    # The response is used as a context manager so the streamed connection
    # is released even when writing fails part-way through.
    with requests.get(url, stream=True) as response:
        # Check the status before the output file is created, so an error
        # page is never saved under the video's name.
        response.raise_for_status()
        # Falls back to 0 when the server sends no Content-Length header.
        total_size = int(response.headers.get('content-length', 0))
        with open(fname, 'wb') as f, tqdm(
            desc=fname,
            total=total_size,
            unit='iB',
            unit_scale=True,
            # Fixed binary divisor: the displayed KiB/MiB must not depend on
            # how large the read chunks happen to be.
            unit_divisor=1024,
        ) as bar:
            for data in response.iter_content(chunk_size=chunk_size):
                size = f.write(data)
                bar.update(size)
# ---- Script body: parse the saved table, then download every linked video ----

# Read the page as bytes so BeautifulSoup can work out the encoding itself,
# and close the file handle as soon as the content is in memory.
with open(fname, 'rb') as html_file:
    html = html_file.read()
soup = BeautifulSoup(html, 'html.parser')

# Search the whole document when the page has no <tbody> instead of failing
# on ``None``.
table_body = soup.find('tbody')
rows = (soup if table_body is None else table_body).find_all('tr')

download_info = []
for row in rows:
    cells = row.find_all('td')
    try:
        title_cell, link_cell = cells[title_idx], cells[dl_link_idx]
    except IndexError:
        continue  # rows with too few <td> cells (e.g. header rows) hold no video
    anchor = link_cell.a
    if anchor is None or not anchor.get('href'):
        continue  # no link in this row, nothing to download
    href = proc_href(anchor['href'])
    # Take the extension from the URL path only, so a query string or a dot
    # in the host name is never mistaken for it.
    ext = os.path.splitext(urlsplit(href).path)[1].lstrip('.')
    download_info.append(dict(fname=title_cell.get_text().strip(), url=href, ext=ext))

for download in download_info:
    # Path separators inside a title would otherwise be read as directories.
    file_name = download['fname'].replace('/', '_').replace('\\', '_')
    if download['ext']:
        file_name += '.' + download['ext']
    # A separate name keeps the module-level ``fname`` (the input HTML path)
    # from being overwritten by the loop.
    target_path = os.path.join(save_dir, file_name)
    url = download['url']
    try:
        download_file(url, target_path)
    except requests.RequestException as err:
        # One unreachable video should not abort the rest of the batch.
        print(f'Failed to download {url}: {err}')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment