@luqmansen
Last active June 4, 2021 00:49

import logging
import os
import pickle
from concurrent.futures import ThreadPoolExecutor
from os import path
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

logging.basicConfig(format='%(asctime)s [%(levelname)s] %(message)s', level=logging.DEBUG, datefmt='%Y-%m-%d %H:%M:%S')
log = logging.getLogger('')
log.setLevel(logging.INFO)

class MultiDownload:

    def __init__(self, link, worker=20, max_depth=5):
        self.error_link_list = []
        self.is_start = True
        self.worker = worker
        self.max_depth = max_depth

        # cache the crawled link list on disk so a re-run can skip the traversal
        file_name = self.get_file_name(link) + "-" + "link_all.txt"
        if not path.exists(file_name):
            self.link_all = list(set(self.parse_link(link, [])))
            with open(file_name, 'wb') as fp:
                pickle.dump(self.link_all, fp)
        else:
            with open(file_name, 'rb') as fp:
                self.link_all = pickle.load(fp)
    def start(self):
        while len(self.error_link_list) != 0 or self.is_start is True:
            # first pass: submit a download task for every crawled link
            with ThreadPoolExecutor(max_workers=self.worker) as executor:
                for link in self.link_all:
                    try:
                        future = self.submit_task(executor, link)
                    except Exception as e:
                        log.error(f"Error processing link | error: {str(e)}")
                        continue

            with open('processed.txt', 'wb') as fp:
                pickle.dump(self.link_all, fp)

            log.info(f"Result first attempt {future.result()}")
            log.info(f"Processing {len(self.error_link_list)} leftover error task(s)")

            # retry the links that failed on the first pass
            if len(self.error_link_list) == 0:
                log.info("All tasks done")
                return

            # take a snapshot and clear the shared list; the download worker
            # re-appends a link if it fails again, so only persistent failures remain
            retry_links = list(self.error_link_list)
            self.error_link_list.clear()
            with ThreadPoolExecutor(max_workers=self.worker) as executor:
                for link in retry_links:
                    try:
                        future = self.submit_task(executor, link)
                    except Exception as e:
                        log.error(f"Error processing link | error: {str(e)}")
                        self.error_link_list.append(link)
                        continue

            with open('error_link_list.txt', 'wb') as fp:
                pickle.dump(self.error_link_list, fp)

            self.is_start = False
            log.info(future.result())
    def submit_task(self, executor, link):
        # save into the current working directory, named after the last path segment
        file_path = os.getcwd() + "/" + urlparse(link).path.split("/")[-1]
        future = executor.submit(self.download, self.error_link_list, link, file_path)
        return future
    @staticmethod
    def download(error_list, link, filelocation):
        try:
            log.info(f"Downloading {link}")
            r = requests.get(link, stream=True)
            # write the file chunk by chunk, since we don't want to
            # load the whole response into memory at once
            with open(filelocation, 'wb') as f:
                log.info(f"Processing chunk for {link}")
                for chunk in r.iter_content(1024):
                    if chunk:
                        f.write(chunk)
            log.info(f"{link} DONE")
        except FileExistsError as e:
            log.error(f"Error processing {link} | error {str(e)}")
        except Exception as e:
            # remember the link so start() can retry it later
            log.error(f"Error processing {link} | error {str(e)}")
            error_list.append(link)
    def parse_link(self, open_directory_url, link_list, depth=0):
        """Traverse the links of an open directory, collecting file URLs into link_list"""
        try:
            if depth == self.max_depth:
                return []
            log.info(f"Traversing level {depth}")
            page = requests.get(open_directory_url)
            data = page.text
            soup = BeautifulSoup(data, features="html.parser")
            if soup is None:
                return []
            # extension = ("jpg", "wav", "mp4", "wmv")
            # extension = ("jpg", "wav", "mp4", "wmv", "doc", "txt", "rtf", "DOC", "ppt", "pdf", "xls")
            for link in soup.find_all('a'):
                # content MIME type from the response header (currently unused)
                page.headers.get("Content-type")
                try:
                    url = open_directory_url + link.get('href')
                except Exception:
                    continue

                if url is None:
                    continue
                elif "?C=" in url:
                    # skip directory-listing sort links (e.g. ?C=N;O=D)
                    continue
                # elif url.endswith(extension):
                elif len(url.split(".")[-1]) <= 5:
                    # last dot-separated segment looks like a file extension;
                    # anything longer than 5 chars is less likely to be one
                    link_list.append(url)
                    continue
                else:
                    log.info(f"Traversing {url}")
                    # parse_link appends into link_list in place and returns the same
                    # list, so extending with the result would only duplicate entries
                    self.parse_link(url, link_list, depth=depth + 1)
            log.debug(link_list)
            return link_list
        except Exception as e:
            log.error(f"Error parsing | error: {str(e)}")
            return []
    @staticmethod
    def get_file_name(url):
        path = urlparse(url).path
        path = path.split('/')[-2]
        return path
@luqmansen (Author):

Use case: traverse deep into an exposed file server and download every single file concurrently.
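
A minimal driver sketch for running the class above; the open-directory URL is a placeholder and the keyword arguments just spell out the gist's defaults:

# minimal usage sketch, assuming the MultiDownload class above is in scope;
# the URL below is a placeholder, not a real open directory
if __name__ == '__main__':
    downloader = MultiDownload("http://example.com/pub/", worker=20, max_depth=5)
    downloader.start()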
