import logging
import os
import pickle
from concurrent.futures import ThreadPoolExecutor
from os import path
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

logging.basicConfig(format='%(asctime)s [%(levelname)s] %(message)s', level=logging.DEBUG, datefmt='%Y-%m-%d %H:%M:%S')
log = logging.getLogger('')
log.setLevel(logging.INFO)
class MultiDownload:
    def __init__(self, link, worker=20, max_depth=5):
        self.error_link_list = []
        self.is_start = True
        self.worker = worker
        self.max_depth = max_depth

        # Cache the crawled link list on disk so a rerun can skip the traversal
        file_name = self.get_file_name(link) + "-" + "link_all.txt"
        if not path.exists(file_name):
            self.link_all = list(set(self.parse_link(link, [])))
            with open(file_name, 'wb') as fp:
                pickle.dump(self.link_all, fp)
        else:
            with open(file_name, 'rb') as fp:
                self.link_all = pickle.load(fp)
    def start(self):
        while len(self.error_link_list) != 0 or self.is_start is True:
            # First pass: submit a download task for every discovered link
            with ThreadPoolExecutor(max_workers=self.worker) as executor:
                for link in self.link_all:
                    try:
                        future = self.submit_task(executor, link)
                    except Exception as e:
                        log.error(f"Error processing link | error: {str(e)}")
                        continue
            with open('processed.txt', 'wb') as fp:
                pickle.dump(self.link_all, fp)
            log.info(f"Result first attempt {future.result()}")

            log.info(f"Processing leftover error {len(self.error_link_list)} task(s)")
            # Second pass: retry the links that failed on the first attempt
            if len(self.error_link_list) == 0:
                log.info("All tasks done")
                return
            with ThreadPoolExecutor(max_workers=self.worker) as executor:
                # Iterate over a snapshot so removing entries does not skip items
                for link in list(self.error_link_list):
                    try:
                        future = self.submit_task(executor, link)
                    except Exception as e:
                        log.error(f"Error processing link | error: {str(e)}")
                        continue
                    self.error_link_list.remove(link)
            with open('error_link_list.txt', 'wb') as fp:
                pickle.dump(self.error_link_list, fp)
            self.is_start = False
            log.info(future.result())
    def submit_task(self, executor, link):
        # Save each file under its original name in the current working directory
        file_path = os.getcwd() + "/" + urlparse(link).path.split("/")[-1]
        future = executor.submit(self.download, self.error_link_list, link, file_path)
        return future
    @staticmethod
    def download(error_list, link, filelocation):
        try:
            log.info(f"Downloading {link}")
            r = requests.get(link, stream=True)
            # Write the file chunk by chunk so the whole download never has to
            # be held in memory at once
            with open(filelocation, 'wb') as f:
                log.info(f"Processing chunk for {link}")
                for chunk in r.iter_content(1024):
                    if chunk:
                        f.write(chunk)
            log.info(f"{link} DONE")
        except FileExistsError as e:
            log.error(f"Error processing {link} | error {str(e)}")
        except Exception as e:
            # Remember the link so the retry pass in start() can pick it up
            log.error(f"Error processing {link} | error {str(e)}")
            error_list.append(link)
    def parse_link(self, open_directory_url, link_list, depth=0):
        """Recursively traverse an open directory listing and collect file links."""
        try:
            if depth == self.max_depth:
                return []
            log.info(f"Traversing level {depth}")
            page = requests.get(open_directory_url)
            data = page.text
            soup = BeautifulSoup(data, features="html.parser")
            if soup is None:
                return []
            # Alternative filters: check the Content-Type response header, or
            # whitelist known extensions, e.g.
            # extension = ("jpg", "wav", "mp4", "wmv", "doc", "txt", "rtf", "ppt", "pdf", "xls")
            # and use url.endswith(extension) instead of the length heuristic below
            for link in soup.find_all('a'):
                try:
                    url = open_directory_url + link.get('href')
                except Exception:
                    # href can be missing, which makes the concatenation fail
                    continue
                if url is None:
                    continue
                elif "?C=" in url:
                    # Skip the sort-order links on Apache-style index pages
                    continue
                elif len(url.split(".")[-1]) <= 5:
                    # A short suffix after the last dot is most likely a file
                    # extension, so treat this link as a downloadable file
                    link_list.append(url)
                    continue
                else:
                    # Otherwise assume a subdirectory and recurse into it;
                    # the recursive call appends to link_list in place
                    log.info(f"Traversing {url}")
                    self.parse_link(url, link_list, depth=depth + 1)
            log.debug(link_list)
            return link_list
        except Exception as e:
            log.error(f"Error parsing | error: {str(e)}")
            return []
    @staticmethod
    def get_file_name(url):
        # Use the last directory component of the URL path as the cache file prefix
        path = urlparse(url).path
        path = path.split('/')[-2]
        return path
Use case: traverse deep into an exposed file server (open directory listing) and download every file concurrently.
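
A minimal usage sketch, assuming the class above is importable as shown; the URL is a placeholder open-directory address, not taken from the original gist, while worker=20 and max_depth=5 are just the defaults restated:

if __name__ == "__main__":
    # Crawl the placeholder listing up to 5 levels deep and download with 20 worker threads
    downloader = MultiDownload("http://example.com/files/", worker=20, max_depth=5)
    downloader.start()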