"""Script to download files, resume downloads and validate downloads.

Everything is wrapped with a beautiful progress bar.
"""
# Source: GitHub gist tobiasraabe/58adee67de619ce621464c1a6511d7d9
import click
import hashlib
import requests
from pathlib import Path
from tqdm import tqdm
CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help'])
"""str: Base url for PatentsView. Just to reduce url length."""
URLS = [
"""List: Contains urls of files which need to be downloaded.
Make sure that you add a hash in the same position in ``HASHES`` so that the
integrity of the file can be verified. The hash has to be a lowercase sha256.
The hash can be computed in PowerShell with ``Get-FileHash <file>``. Notice
that PowerShell returns uppercase letters and Python lowercase."""
"""List: Contains sha256 hashes calculated for the files in ``URLS``."""
"""pathlib.Path: Points to the target directory of downloads."""
def downloader(position: int, resume_byte_pos: int = None):
    """Download url in ``URLS[position]`` to disk with possible resumption.

    The file is stored in ``DOWNLOAD_FOLDER`` under the last path component
    of the url. If ``resume_byte_pos`` is given, a ``Range`` header asks the
    server for the remaining bytes only, which are appended to the file that
    already exists on disk.

    Parameters
    ----------
    position: int
        Position of url.
    resume_byte_pos: int
        Position of byte from where to resume the download. ``None`` or ``0``
        starts a fresh download.

    Raises
    ------
    requests.HTTPError
        If the server answers the download request with an error status.

    """
    url = URLS[position]

    # Get size of file
    r = requests.head(url)
    file_size = int(r.headers.get('content-length', 0))

    # Append information to resume download at specific byte position
    # to header
    resume_header = ({'Range': f'bytes={resume_byte_pos}-'}
                     if resume_byte_pos else None)

    block_size = 1024
    file = DOWNLOAD_FOLDER / url.split('/')[-1]

    # Establish connection; the context manager releases it even if writing
    # to disk fails half-way.
    with requests.get(url, stream=True, headers=resume_header) as r:
        # Never write an error page (404, 416, ...) into the data file.
        r.raise_for_status()

        # Only append if the server really honoured the range request
        # (206 Partial Content). A plain 200 delivers the whole file again,
        # so appending it to the partial file would corrupt the download.
        resumed = bool(resume_byte_pos) and r.status_code == 206
        initial_pos = resume_byte_pos if resumed else 0
        mode = 'ab' if resumed else 'wb'

        # NOTE(review): SOURCE lost one tqdm argument between
        # ``unit_divisor`` and ``initial``; labelling the bar with the file
        # name is the assumed intent - confirm.
        with open(file, mode) as f, tqdm(
                total=file_size, unit='B',
                unit_scale=True, unit_divisor=1024,
                desc=file.name, initial=initial_pos,
                ascii=True, miniters=1) as pbar:
            for chunk in r.iter_content(32 * block_size):
                f.write(chunk)
                pbar.update(len(chunk))
def download_file(position: int) -> None:
    """Execute the correct download operation.

    Depending on the size of the file online and offline, resume the
    download if the file offline is smaller than online. A file whose size
    already matches is skipped, a file that does not exist yet is downloaded
    from the beginning.

    Parameters
    ----------
    position: int
        Position of url.

    """
    # Establish connection to header of file
    url = URLS[position]
    r = requests.head(url)

    # Get filesize of online and offline file
    file_size_online = int(r.headers.get('content-length', 0))
    file = DOWNLOAD_FOLDER / url.split('/')[-1]

    if file.exists():
        file_size_offline = file.stat().st_size

        if file_size_online != file_size_offline:
            click.echo(f'File {file} is incomplete. Resume download.')
            downloader(position, file_size_offline)
        else:
            click.echo(f'File {file} is complete. Skip download.')
    else:
        click.echo(f'File {file} does not exist. Start download.')
        downloader(position)
def validate_file(position: int) -> None:
    """Validate a given file with its hash.

    The downloaded file is hashed with sha256 and compared to a
    pre-registered hash value to validate the download procedure. The
    outcome is reported on the console; nothing is returned.

    Parameters
    ----------
    position: int
        Position of url and hash.

    """
    file_name = URLS[position].split('/')[-1]
    file = DOWNLOAD_FOLDER / file_name

    try:
        expected_hash = HASHES[position]
    except IndexError:
        # NOTE(review): the expression inside the braces was lost in SOURCE;
        # the bare file name is the assumed intent - confirm.
        click.echo(f'File {file_name} has no hash.')
        return

    sha = hashlib.sha256()
    with open(file, 'rb') as f:
        # Read 1MB at a time so that memory is not exhausted
        for chunk in iter(lambda: f.read(1000 * 1000), b''):
            sha.update(chunk)

    # Plain comparison instead of ``assert``: asserts are stripped under
    # ``python -O`` and every file would then be reported as valid.
    # ``lower()`` additionally accepts the uppercase output of PowerShell.
    if sha.hexdigest() == expected_hash.lower():
        click.echo(f'File {file} is validated.')
    else:
        click.echo(f'File {file_name} is corrupt. '
                   'Delete it manually and restart the program.')


@click.group(context_settings=CONTEXT_SETTINGS, chain=True)
def cli():
    """Program for downloading and validating files.

    It is possible to run both operations consecutively with

    .. code-block:: shell

        $ python download validate

    To download a file, add the link to ``URLS`` and its hash to ``HASHES`` if
    you want to validate downloaded files.

    """
# NOTE(review): the command registration was lost in SOURCE; it is restored
# from the ``cli`` help text ("python download validate") - confirm.
@cli.command()
def download():
    """Download files specified in ``URLS``."""
    click.echo('\n### Start downloading required files.\n')
    for position in range(len(URLS)):
        download_file(position)
    click.echo('\n### End\n')
# NOTE(review): the command registration was lost in SOURCE; it is restored
# from the ``cli`` help text ("python download validate") - confirm.
@cli.command()
def validate():
    """Validate downloads with hashes in ``HASHES``."""
    click.echo('### Start validating required files.\n')
    for position in range(len(URLS)):
        validate_file(position)
    click.echo('\n### End\n')
if __name__ == '__main__':
    # Hand control to the click command group when run as a script.
    cli()