@rizumu
Created October 17, 2017 02:02
A simplistic website crawler that recursively extracts same-site links using breadth-first traversal
#!/usr/bin/env python
# Assumes the BeautifulSoup4, lxml, and Requests libraries are installed:
#   pip install bs4 lxml requests
import requests
from bs4 import BeautifulSoup
from requests.compat import urljoin

SITEURL = 'https://google.com'
CRAWL_QUEUE = set()
CRAWLED = []


def get_links(url):
    """Return absolute URLs for the same-site links found on ``url``."""
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'lxml')
    # Keep only relative hrefs; external (http...) links are ignored.
    links = set(a['href'] for a in soup.find_all('a')
                if a.get('href') and not a['href'].startswith('http'))
    for link in links.copy():
        # Discard the bare root path, protocol-relative URLs, and any href
        # that is not an absolute path (fragments, mailto:, etc.).
        if link == '/' or link.startswith('//') or not link.startswith('/'):
            links.remove(link)
    return [urljoin(SITEURL, link) for link in links]


def webcrawler(url):
    """Crawl ``url``, queue its uncrawled links, then recurse on the queue."""
    CRAWLED.append(url)
    CRAWL_QUEUE.update(l for l in get_links(url) if l not in CRAWLED)
    try:
        link = CRAWL_QUEUE.pop()
    except KeyError:  # queue exhausted, crawl is complete
        return
    webcrawler(link)


if __name__ == '__main__':
    webcrawler(SITEURL)
    for link in CRAWLED:
        print(link)
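
The recursive version above pops arbitrary elements from a set, so the visit order is not strictly breadth-first, and a large site can exhaust Python's recursion limit. A minimal iterative sketch of the same idea with an explicit FIFO queue is shown below; the webcrawler_bfs name and the max_pages guard are additions for illustration, not part of the original gist, and it reuses the get_links() helper defined above.

from collections import deque


def webcrawler_bfs(start_url, max_pages=1000):
    """Iterative breadth-first variant: visit pages in FIFO order."""
    queue = deque([start_url])
    crawled = []
    while queue and len(crawled) < max_pages:
        url = queue.popleft()
        if url in crawled:
            continue
        crawled.append(url)
        # Enqueue links we have not seen yet.
        for link in get_links(url):
            if link not in crawled and link not in queue:
                queue.append(link)
    return crawled

Calling webcrawler_bfs(SITEURL) returns the list of crawled URLs instead of mutating module-level globals, which makes the traversal order explicit and the function easier to test.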