import logging
import os
import pickle
from concurrent.futures import ThreadPoolExecutor
from os import path
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

logging.basicConfig(format='%(asctime)s [%(levelname)s] %(message)s', level=logging.DEBUG, datefmt='%Y-%m-%d %H:%M:%S')
log = logging.getLogger('')
log.setLevel(logging.INFO)
class MultiDownload:
    def __init__(self, link, worker=20, max_depth=5):
        self.error_link_list = []
        self.is_start = True
        self.worker = worker
        self.max_depth = max_depth

        # Cache the crawled link list on disk so a rerun can skip the traversal
        file_name = self.get_file_name(link) + "-" + "link_all.txt"
        if not path.exists(file_name):
            self.link_all = list(set(self.parse_link(link, [])))
            with open(file_name, 'wb') as fp:
                pickle.dump(self.link_all, fp)
        else:
            with open(file_name, 'rb') as fp:
                self.link_all = pickle.load(fp)
    def start(self):
        while len(self.error_link_list) != 0 or self.is_start is True:
            # First pass: submit a download task for every discovered link
            with ThreadPoolExecutor(max_workers=self.worker) as executor:
                for link in self.link_all:
                    try:
                        future = self.submit_task(executor, link)
                    except Exception as e:
                        log.error(f"Error processing link | error: {str(e)}")
                        continue
            with open('processed.txt', 'wb') as fp:
                pickle.dump(self.link_all, fp)
            log.info(f"Result first attempt {future.result()}")

            log.info(f"Processing leftover error {len(self.error_link_list)} task(s)")
            # Second pass: retry the links that failed on the first attempt
            if len(self.error_link_list) == 0:
                log.info("All tasks done")
                return
            with ThreadPoolExecutor(max_workers=self.worker) as executor:
                # Iterate over a snapshot so removing entries does not skip items
                for link in list(self.error_link_list):
                    try:
                        future = self.submit_task(executor, link)
                    except Exception as e:
                        log.error(f"Error processing link | error: {str(e)}")
                        continue
                    self.error_link_list.remove(link)
            with open('error_link_list.txt', 'wb') as fp:
                pickle.dump(self.error_link_list, fp)
            self.is_start = False
            log.info(future.result())
    def submit_task(self, executor, link):
        # Save each file under its original name in the current working directory
        file_path = os.getcwd() + "/" + urlparse(link).path.split("/")[-1]
        future = executor.submit(self.download, self.error_link_list, link, file_path)
        return future
    @staticmethod
    def download(error_list, link, filelocation):
        try:
            log.info(f"Downloading {link}")
            r = requests.get(link, stream=True)
            # Write the file chunk by chunk so the whole download never has to
            # be held in memory at once
            with open(filelocation, 'wb') as f:
                log.info(f"Processing chunk for {link}")
                for chunk in r.iter_content(1024):
                    if chunk:
                        f.write(chunk)
            log.info(f"{link} DONE")
        except FileExistsError as e:
            log.error(f"Error processing {link} | error {str(e)}")
        except Exception as e:
            # Remember the link so the retry pass in start() can pick it up
            log.error(f"Error processing {link} | error {str(e)}")
            error_list.append(link)
    def parse_link(self, open_directory_url, link_list, depth=0):
        """Recursively traverse an open directory listing and collect file links."""
        try:
            if depth == self.max_depth:
                return []
            log.info(f"Traversing level {depth}")
            page = requests.get(open_directory_url)
            data = page.text
            soup = BeautifulSoup(data, features="html.parser")
            if soup is None:
                return []
            # Alternative filters: check the Content-Type response header, or
            # whitelist known extensions, e.g.
            # extension = ("jpg", "wav", "mp4", "wmv", "doc", "txt", "rtf", "ppt", "pdf", "xls")
            # and use url.endswith(extension) instead of the length heuristic below
            for link in soup.find_all('a'):
                try:
                    url = open_directory_url + link.get('href')
                except Exception:
                    # href can be missing, which makes the concatenation fail
                    continue
                if url is None:
                    continue
                elif "?C=" in url:
                    # Skip the sort-order links on Apache-style index pages
                    continue
                elif len(url.split(".")[-1]) <= 5:
                    # A short suffix after the last dot is most likely a file
                    # extension, so treat this link as a downloadable file
                    link_list.append(url)
                    continue
                else:
                    # Otherwise assume a subdirectory and recurse into it;
                    # the recursive call appends to link_list in place
                    log.info(f"Traversing {url}")
                    self.parse_link(url, link_list, depth=depth + 1)
            log.debug(link_list)
            return link_list
        except Exception as e:
            log.error(f"Error parsing | error: {str(e)}")
            return []
    @staticmethod
    def get_file_name(url):
        # Use the last directory component of the URL path as the cache file prefix
        path = urlparse(url).path
        path = path.split('/')[-2]
        return path
Use case: traverse deep into an exposed file server (open directory listing) and download every file concurrently.
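
A minimal usage sketch, assuming the class above is importable as shown; the URL is a placeholder open-directory address, not taken from the original gist, while worker=20 and max_depth=5 are just the defaults restated:

if __name__ == "__main__":
    # Crawl the placeholder listing up to 5 levels deep and download with 20 worker threads
    downloader = MultiDownload("http://example.com/files/", worker=20, max_depth=5)
    downloader.start()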