@mzmmoazam · Created October 28, 2017 14:16
Generates a sitemap of a site and can also list the links at any specific depth (layer).
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup

all_urls = set()  # global registry of every url seen, so no url is visited twice
class url_tree():
    def __init__(self, base_url, page_no=1, children=None):
        self.url = base_url
        self.page_no = page_no  # depth of this node; the root is at depth 1
        self.children = []
        if children is not None:
            for child in children:
                if child not in all_urls:
                    all_urls.add(child)
                    self.add_child(url_tree(child, self.page_no + 1))
        if to_level >= self.page_no:  # to_level is set at module level (see __main__)
            self.get_children()
    def add_child(self, node):
        '''Adds a child node to this parent node.'''
        assert isinstance(node, url_tree)
        self.children.append(node)
    def get_layer(self, no):
        '''Yields the urls at a given depth below the root url.'''
        if self.page_no != no:
            for child in self.children:
                yield from child.get_layer(no)  # flatten the nested generators
        else:
            yield self.url
    def get_children(self):
        '''Fetches this node's page and adds every new link on it as a child.'''
        try:
            page = requests.get(self.url)
        except Exception as e:
            print(e)  # the url could not be fetched
            return
        print('scraped -> ', self.url)
        soup = BeautifulSoup(page.content, 'lxml')
        for link in soup.find_all('a', href=True):
            llink = self.true_url(self.url, link['href'])
            if llink and llink not in all_urls:
                all_urls.add(llink)
                self.add_child(url_tree(llink, page_no=self.page_no + 1))
    def true_url(self, url, x, extensive_check=False):
        '''Returns an absolute url if it passes some basic validation, else False.'''
        try:
            result = urlparse(x)
            if all([result.scheme, result.netloc, result.path]):
                return x  # already a complete absolute url
            elif any([result.scheme, result.netloc, result.path]):
                if extensive_check:
                    # optionally confirm that the joined url actually resolves
                    _ = requests.get(urljoin(url, x))
                    return urljoin(url, x) if _.status_code == 200 else False
                else:
                    return urljoin(url, x)
            else:
                return False
        except Exception:
            return False
    def print_all_urls(self, send=False):
        '''Prints every url collected so far; returns the set when send is True.'''
        for url in all_urls:
            print(url)
        if send:
            return all_urls
    def sitemap(self, depth):
        '''Prints the parent -> child edges of the tree down to the given depth.'''
        if self.page_no <= depth:
            for child in self.children:
                print(self.url, '->', child)
            for child in self.children:
                child.sitemap(depth)
        return ""

    def __str__(self):
        return self.url

    def __repr__(self):
        return self.url
if __name__ == '__main__':
    url_ = "https://github.com/Cartman720/PySitemap"  # the base url to crawl
    to_level = 1  # how many layers deep to scrape; nodes exist down to depth to_level + 1, so depth 2 is reachable here
    tree = url_tree(url_)  # build the url tree from the base url
    print(list(tree.get_layer(2)))  # the links at depth 2
    print(tree.sitemap(depth=2))  # sitemap down to depth 2
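For reference, a minimal usage sketch, assuming the script above is saved as url_tree.py so it can be imported as a module; the target url and depths here are illustrative placeholders, not part of the original gist:

# a minimal sketch, assuming the class above lives in url_tree.py on the import path
import url_tree as ut

ut.to_level = 2                    # crawl the root page and each page it links to
tree = ut.url_tree("https://example.com")   # placeholder url; substitute your own site

print(sorted(tree.get_layer(3)))   # links found two hops below the root
tree.sitemap(depth=3)              # print parent -> child edges down to depth 3
print(len(ut.all_urls), "urls collected in total")

Because __init__ reads the module-level to_level before crawling, it must be assigned on the module before the first url_tree is constructed, as done above.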