@rizumu
Created October 17, 2017 02:02
A simplistic website crawler that recursively extracts same-site links using breadth-first traversal
#!/usr/bin/env python
# Assumes the BeautifulSoup4, lxml, and Requests libraries are installed:
#   pip install bs4 lxml requests
import requests
from bs4 import BeautifulSoup
from requests.compat import urljoin

SITEURL = 'https://google.com'
CRAWL_QUEUE = set()
CRAWLED = []


def get_links(url):
    """Return absolute URLs for the same-site links found on ``url``."""
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'lxml')
    # Keep only relative hrefs; external (http...) links are ignored.
    links = set(a['href'] for a in soup.find_all('a')
                if a.get('href') and not a['href'].startswith('http'))
    for link in links.copy():
        # Discard the bare root path, protocol-relative URLs, and any href
        # that is not an absolute path (fragments, mailto:, etc.).
        if link == '/' or link.startswith('//') or not link.startswith('/'):
            links.remove(link)
    return [urljoin(SITEURL, link) for link in links]


def webcrawler(url):
    """Crawl ``url``, queue its uncrawled links, then recurse on the queue."""
    CRAWLED.append(url)
    CRAWL_QUEUE.update(l for l in get_links(url) if l not in CRAWLED)
    try:
        link = CRAWL_QUEUE.pop()
    except KeyError:  # queue exhausted, crawl is complete
        return
    webcrawler(link)


if __name__ == '__main__':
    webcrawler(SITEURL)
    for link in CRAWLED:
        print(link)
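
The recursive version above pops arbitrary elements from a set, so the visit order is not strictly breadth-first, and a large site can exhaust Python's recursion limit. A minimal iterative sketch of the same idea with an explicit FIFO queue is shown below; the webcrawler_bfs name and the max_pages guard are additions for illustration, not part of the original gist, and it reuses the get_links() helper defined above.

from collections import deque


def webcrawler_bfs(start_url, max_pages=1000):
    """Iterative breadth-first variant: visit pages in FIFO order."""
    queue = deque([start_url])
    crawled = []
    while queue and len(crawled) < max_pages:
        url = queue.popleft()
        if url in crawled:
            continue
        crawled.append(url)
        # Enqueue links we have not seen yet.
        for link in get_links(url):
            if link not in crawled and link not in queue:
                queue.append(link)
    return crawled

Calling webcrawler_bfs(SITEURL) returns the list of crawled URLs instead of mutating module-level globals, which makes the traversal order explicit and the function easier to test.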