Skip to content

Instantly share code, notes, and snippets.

@rickerp
Created February 28, 2021 20:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rickerp/a5674c9af97ac2d62b8b8489edfbea28 to your computer and use it in GitHub Desktop.
Save rickerp/a5674c9af97ac2d62b8b8489edfbea28 to your computer and use it in GitHub Desktop.
Python script to recursively download files from an HTML file server
#!/usr/bin/python
import re
import requests
import os
import argparse
# Command-line interface: two positional arguments (the listing URL and the
# regex that extracts entry names from its HTML), plus optional flags for the
# local target directory and HTTP basic-auth credentials.
parser = argparse.ArgumentParser()
parser.add_argument('URL', help='URL to download files')
parser.add_argument('regex', help='Regex to detect the subdirectories in html')
parser.add_argument(
    '-d', '--directory',
    default=os.getcwd() + os.sep,
    help="Directory",
)
parser.add_argument(
    '-a', '--auth',
    nargs=2,
    help="Authentication credentials if needed for request",
)
# Downstream code indexes the options as a plain dict.
args = vars(parser.parse_args())
def download_file(url, path=None, request_params=None):
    """Stream the file at *url* to disk and return the local filename.

    Args:
        url: Direct URL of the file to download.
        path: Local destination path; defaults to the last URL path segment.
        request_params: Optional dict of extra keyword arguments forwarded to
            ``requests.get`` (e.g. ``{'auth': (user, password)}``).

    Returns:
        The path the content was written to.

    Raises:
        requests.HTTPError: If the server responds with an error status.
    """
    local_filename = path if path else url.split('/')[-1]
    # None sentinel instead of a mutable `{}` default, which would be shared
    # across calls (classic mutable-default-argument pitfall).
    extra = request_params if request_params is not None else {}
    # stream=True fetches the body in chunks instead of loading it all into
    # memory; the `with` block ensures the connection is released.
    with requests.get(url, stream=True, **extra) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    return local_filename
def getfiles(url, reg, folder=os.getcwd() + '/', auth=None):
    """Recursively mirror the files listed on an HTML index page.

    Args:
        url: Directory-listing URL (expected to end with ``/``).
        reg: Regex whose matches in the page HTML are the entry names;
            matches ending in ``/`` are treated as subdirectories.
        folder: Local directory to mirror into (expected to end with a
            path separator).
        auth: Optional 2-item (user, password) sequence for HTTP basic auth.
    """
    print(f"Requesting {url}")
    # Original code called tuple(auth) unconditionally, which raised
    # TypeError whenever -a/--auth was omitted (auth is None by default).
    credentials = tuple(auth) if auth else None
    # [1:] skips the first match, assumed to be the parent-directory link.
    entries = re.findall(reg, requests.get(url, auth=credentials).text)[1:]
    for entry in entries:
        if entry[-1] == '/':
            # Subdirectory: create it locally (idempotent, so re-running the
            # script over an existing tree no longer raises FileExistsError)
            # and recurse into it.
            os.makedirs(folder + entry, exist_ok=True)
            getfiles(url + entry, reg, folder + entry, auth)
        else:
            print(f"Requesting {url + entry}")
            download_file(url + entry, folder + entry, {'auth': credentials})
# Kick off the recursive mirror using the parsed command-line arguments.
# NOTE(review): args['auth'] is None when -a/--auth is omitted — confirm
# getfiles handles that before relying on unauthenticated runs.
getfiles(args['URL'], args['regex'], args['directory'], args['auth'])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment