Skip to content

Instantly share code, notes, and snippets.

@verginer
Last active April 8, 2023 06:01
Show Gist options
  • Save verginer/808b7c271049bd2c5d360c7bf5276665 to your computer and use it in GitHub Desktop.
Save verginer/808b7c271049bd2c5d360c7bf5276665 to your computer and use it in GitHub Desktop.
script to download all files with a given extension from a given website
import os
import click
import requests
from requests.compat import urljoin
from lxml.html import etree
def extract_links_with_ext(url, extension):
    """Return the absolute URLs of all links on a page that end with ``extension``.

    Args:
        url: address of the HTML page whose ``<a href="...">`` links are scanned.
        extension: suffix an href must end with to be kept (e.g. ``'.zip'``).

    Returns:
        A set of URLs; each matching href is joined onto ``response.url`` so
        relative links become absolute. Empty when nothing matches.

    Raises:
        requests.HTTPError: if the page request returns an error status.
        requests.RequestException: on connection failure or timeout.
    """
    # Without a timeout requests waits indefinitely on an unresponsive host.
    response = requests.get(url, timeout=30)
    # Fail loudly rather than scraping links out of an error page.
    response.raise_for_status()

    doc_tree = etree.HTML(response.content)
    if doc_tree is None:
        # NOTE(review): lxml appears to return None when the body contains no
        # parseable markup; treat that as "no links" instead of crashing.
        return set()

    hrefs = doc_tree.xpath('//a/@href')
    return {
        urljoin(response.url, ref)
        for ref in hrefs
        if ref.endswith(extension)
    }
def download_files_from_url(url, extension, output_path):
    """Download all files with the given extension from url into output_path.

    Links are collected with ``extract_links_with_ext`` and fetched in sorted
    order. A file whose target path already exists is skipped, and a link that
    does not answer with HTTP 200 is reported and skipped.

    Args:
        url: page to scan for download links.
        extension: suffix a link must end with to be downloaded.
        output_path: directory the files are written to; each file keeps the
            last path segment of its link as its name.

    Raises:
        Any exception raised while fetching or writing a file is re-raised
        after the partially written file has been removed.
    """
    file_links = sorted(extract_links_with_ext(url, extension))
    num_links = len(file_links)

    # Count from 1 so the progress reads "1/N" ... "N/N".
    for i, file_link in enumerate(file_links, start=1):
        file_name = file_link.split('/')[-1]
        file_path = os.path.join(output_path, file_name)
        print(f'Downloading file {i}/{num_links}: {file_link}')

        if os.path.exists(file_path):
            print(f'{file_path} already downloaded, skipping')
            continue

        try:
            # The context manager releases the streamed connection even when
            # the download is skipped or fails part-way.
            with requests.get(file_link, stream=True, timeout=30) as res:
                if res.status_code != 200:
                    err_code = res.status_code
                    print(f'could not download {file_link}, error {err_code}')
                    # Do not save the error body under the file's name.
                    continue
                with open(file_path, 'wb') as f:
                    for chunk in res.iter_content(chunk_size=1024):
                        if chunk:  # filter out keep-alive new chunks
                            f.write(chunk)
        except BaseException:
            # Also covers Ctrl-C: a partial file left behind would be skipped
            # as "already downloaded" on the next run.
            try:
                os.remove(file_path)
            except FileNotFoundError:
                # Nothing was written yet; keep the original error visible.
                pass
            raise
@click.command('download files with ext')
@click.argument('url', type=click.STRING)
@click.argument('out_path', type=click.Path(file_okay=False, exists=True))
@click.option('-e', '--ext', default='zip', type=click.STRING,
              help='the extension of the file to download (default `zip`)')
def main(url, out_path, ext):
    """Download every file linked from URL whose link ends with EXT into OUT_PATH.

    OUT_PATH must be an existing directory.
    """
    # Note the argument order: the helper takes the extension before the path.
    download_files_from_url(url, ext, out_path)


# Run the command line interface only when executed as a script.
if __name__ == '__main__':
    main()
@verginer
Copy link
Author

verginer commented Oct 31, 2017

For example, to download all *.7z files from https://archive.org/download/stackexchange/, you can run:

python3 download_files_with_ext.py --ext '.7z' https://archive.org/download/stackexchange/ ./data/

which will download all matching files into the directory `./data/` (the directory must already exist).

requires:

  • Python >= 3.6 (because f-strings are used); the script can be adapted to older versions by replacing the f-strings with the string formatting available in that version (e.g. `str.format`)
  • requests
  • lxml
  • click

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment