@mzmmoazam · Created October 28, 2017 14:16
Generates a sitemap of a site and can also list the links at any specific depth (layer).
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup

all_urls = set()  # global registry of every url seen, so no url is visited twice
class url_tree():
    def __init__(self, base_url, page_no=1, children=None):
        self.url = base_url
        self.page_no = page_no  # depth of this node; the root is at depth 1
        self.children = []
        if children is not None:
            for child in children:
                if child not in all_urls:
                    all_urls.add(child)
                    self.add_child(url_tree(child, self.page_no + 1))
        if to_level >= self.page_no:  # to_level is set at module level (see __main__)
            self.get_children()
    def add_child(self, node):
        '''Adds a child node to this parent node.'''
        assert isinstance(node, url_tree)
        self.children.append(node)
    def get_layer(self, no):
        '''Yields the urls at a given depth below the root url.'''
        if self.page_no != no:
            for child in self.children:
                yield from child.get_layer(no)  # flatten the nested generators
        else:
            yield self.url
    def get_children(self):
        '''Fetches this node's page and adds every new link on it as a child.'''
        try:
            page = requests.get(self.url)
        except Exception as e:
            print(e)  # the url could not be fetched
            return
        print('scraped -> ', self.url)
        soup = BeautifulSoup(page.content, 'lxml')
        for link in soup.find_all('a', href=True):
            llink = self.true_url(self.url, link['href'])
            if llink and llink not in all_urls:
                all_urls.add(llink)
                self.add_child(url_tree(llink, page_no=self.page_no + 1))
    def true_url(self, url, x, extensive_check=False):
        '''Returns an absolute url if it passes some basic validation, else False.'''
        try:
            result = urlparse(x)
            if all([result.scheme, result.netloc, result.path]):
                return x  # already a complete absolute url
            elif any([result.scheme, result.netloc, result.path]):
                if extensive_check:
                    # optionally confirm that the joined url actually resolves
                    _ = requests.get(urljoin(url, x))
                    return urljoin(url, x) if _.status_code == 200 else False
                else:
                    return urljoin(url, x)
            else:
                return False
        except Exception:
            return False
    def print_all_urls(self, send=False):
        '''Prints every url collected so far; returns the set when send is True.'''
        for url in all_urls:
            print(url)
        if send:
            return all_urls
    def sitemap(self, depth):
        '''Prints the parent -> child edges of the tree down to the given depth.'''
        if self.page_no <= depth:
            for child in self.children:
                print(self.url, '->', child)
            for child in self.children:
                child.sitemap(depth)
        return ""

    def __str__(self):
        return self.url

    def __repr__(self):
        return self.url
if __name__ == '__main__':
    url_ = "https://github.com/Cartman720/PySitemap"  # the base url to crawl
    to_level = 1  # how many layers deep to scrape; nodes exist down to depth to_level + 1, so depth 2 is reachable here
    tree = url_tree(url_)  # build the url tree from the base url
    print(list(tree.get_layer(2)))  # the links at depth 2
    print(tree.sitemap(depth=2))  # sitemap down to depth 2
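For reference, a minimal usage sketch, assuming the script above is saved as url_tree.py so it can be imported as a module; the target url and depths here are illustrative placeholders, not part of the original gist:

# a minimal sketch, assuming the class above lives in url_tree.py on the import path
import url_tree as ut

ut.to_level = 2                    # crawl the root page and each page it links to
tree = ut.url_tree("https://example.com")   # placeholder url; substitute your own site

print(sorted(tree.get_layer(3)))   # links found two hops below the root
tree.sitemap(depth=3)              # print parent -> child edges down to depth 3
print(len(ut.all_urls), "urls collected in total")

Because __init__ reads the module-level to_level before crawling, it must be assigned on the module before the first url_tree is constructed, as done above.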