@sriganesh
Created April 19, 2019 04:21
scrape site python
from bs4 import BeautifulSoup
import requests
import requests.exceptions
from urllib.parse import urlsplit
from collections import deque
url = "https://scrapethissite.com"

# a queue of urls to be crawled
new_urls = deque([url])
# a set of urls that have already been processed
processed_urls = set()
# a set of urls inside the target website
local_urls = set()
# a set of urls outside the target website
foreign_urls = set()
# a set of broken urls
broken_urls = set()
# process urls one by one until we exhaust the queue
while len(new_urls):
    # move the next url from the queue to the set of processed urls
    url = new_urls.popleft()
    processed_urls.add(url)

    # fetch the url's content
    print("Processing %s" % url)
    try:
        response = requests.get(url)
    except (requests.exceptions.MissingSchema, requests.exceptions.ConnectionError,
            requests.exceptions.InvalidURL, requests.exceptions.InvalidSchema):
        # add broken urls to their own set, then continue
        broken_urls.add(url)
        continue

    # extract the base url to resolve relative links
    parts = urlsplit(url)
    base = "{0.netloc}".format(parts)
    strip_base = base.replace("www.", "")
    base_url = "{0.scheme}://{0.netloc}".format(parts)
    path = url[:url.rfind('/') + 1] if '/' in parts.path else url

    # create a beautiful soup object for the html document
    soup = BeautifulSoup(response.text, "lxml")

    for link in soup.find_all('a'):
        # extract the link target from the anchor tag
        anchor = link.attrs["href"] if "href" in link.attrs else ''
        if anchor.startswith('/'):
            # relative link from the site root
            local_link = base_url + anchor
            local_urls.add(local_link)
        elif strip_base in anchor:
            # absolute link back to the target site
            local_urls.add(anchor)
        elif not anchor.startswith('http'):
            # relative link from the current path
            local_link = path + anchor
            local_urls.add(local_link)
        else:
            # absolute link to another site
            foreign_urls.add(anchor)

    # queue any local urls that haven't been crawled yet
    for i in local_urls:
        if i not in new_urls and i not in processed_urls:
            new_urls.append(i)

print(processed_urls)
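
As a possible follow-up (not part of the original gist), the collected sets could be written to disk once the crawl finishes. A minimal sketch, assuming the variables above are still in scope; the helper name and output filenames are arbitrary choices:

# hypothetical helper: persist each url set to its own text file
def save_urls(filename, urls):
    with open(filename, "w") as f:
        for u in sorted(urls):
            f.write(u + "\n")

save_urls("local_urls.txt", local_urls)      # links inside the target site
save_urls("foreign_urls.txt", foreign_urls)  # links pointing to other sites
save_urls("broken_urls.txt", broken_urls)    # urls that failed to load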