Wiki Crawler Answers
8a. 94.6%
8b. Distribution of successful path lengths (length: count): {9: 78, 10: 52, 12: 50, 11: 45, 13: 42, 17: 35, 16: 34, 15: 28, 18: 28, 14: 26, 8: 21, 19: 14, 20: 10, 6: 3, 21: 3, 22: 2, 23: 2}
8c. To reduce the number of HTTP requests, we can check whether the current page has already been visited in a previous path. Following the first link from a given page always produces the same result, so that part of the search never needs to be repeated. When a previously seen page turns up in a new path, we can simply add its known remaining distance to Philosophy to the number of steps taken so far.
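A minimal sketch of that idea, assuming a module-level cache keyed by '/wiki/...' URL; the names distance_to_philosophy, record_path, and cached_length are illustrative and not part of the crawler below:

# Hypothetical memoization layer for 8c: remember how many hops each page
# needs to reach Philosophy so that a repeated page costs no extra requests.
distance_to_philosophy = {}

def record_path(path):
    # 'path' is a successful list of '/wiki/...' URLs ending at Philosophy;
    # the last entry is 0 hops away, the one before it 1 hop, and so on.
    for hops, url_end in enumerate(reversed(path)):
        distance_to_philosophy.setdefault(url_end, hops)

def cached_length(path, url_end):
    # If the crawler lands on a page seen in an earlier successful run,
    # the final length is the pages visited so far plus the cached distance.
    if url_end in distance_to_philosophy:
        return len(path) + distance_to_philosophy[url_end]
    return None  # not cached yet; keep crawling
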
from bs4 import BeautifulSoup as bs
from collections import Counter
import re
import requests

URL = 'http://www.wikipedia.org'
WIKI = '/wiki/'

def wiki_crawler():
    # Crawl from a random Wikipedia page, following the first link in the
    # article body until reaching Philosophy or giving up after 50 steps.
    start_search = "Special:Random"
    url_end = WIKI + start_search
    path = [url_end]
    while url_end[6:] != 'Philosophy':
        if len(path) == 50:
            break
        print url_end
        r = requests.get(URL + url_end)
        soup = bs(r.content, "html.parser")
        content = soup.find('div', {'id': 'mw-content-text'})
        # Remove tags that can contain an unwanted link (references,
        # pronunciations, italicised notes, infoboxes).
        for tag in content.find_all(['span', 'small', 'sup', 'i', 'table']):
            tag.replace_with("")
        # Use a regular expression to strip everything inside parentheses,
        # then search the paragraphs for the first wiki link.
        links = re.sub(r' \(.*?\)', '', str(content.find_all('p')))
        paragraphs = bs(links, 'html.parser')
        href = paragraphs.find(href=re.compile('^/wiki/'))
        if href is None:
            # Some pages have no usable paragraph link; fall back to the
            # first list in the article body.
            if content.ul is not None:
                lists = re.sub(r' \(.*?\)', '', str(content.ul))
                href = bs(lists, 'html.parser').find(href=re.compile('^/wiki/'))
            if href is None:
                # No links in any usable paragraph or list: return an
                # over-length path so it is discarded in main.
                print("No links in paragraphs.")
                return path * 50
        # Save the link and add it to the path.
        url_end = href.get('href')
        path.append(url_end)
    return path

def main():
    success_paths = []
    for i in range(500):
        print i
        path = wiki_crawler()
        if len(path) < 50:
            success_paths.append(path)
    # Counter giving the distribution of successful path lengths.
    path_lengths = [len(path) for path in success_paths]
    dist = Counter(path_lengths)
    print "Answer to 8a:"
    print str(len(success_paths) / 500.0 * 100) + '%'
    print "Answer to 8b:"
    print dist


if __name__ == '__main__':
    main()