Wiki Crawler Answers
8a. 94.6%

8b. Distribution of successful path lengths (length: count):
    {9: 78, 10: 52, 12: 50, 11: 45, 13: 42, 17: 35, 16: 34, 15: 28, 18: 28,
     14: 26, 8: 21, 19: 14, 20: 10, 6: 3, 21: 3, 22: 2, 23: 2}
8c. To reduce the number of HTTP requests, we can check whether the current page
    has already been visited in a previous crawl. Because the first-link rule is
    deterministic, the path from any given page to Philosophy is always the same
    and never needs to be recomputed. Once a crawl reaches a previously seen page,
    we can simply add that page's recorded remaining distance to the length of the
    current path so far.
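
A minimal sketch of that caching idea. It assumes a hypothetical first_link(page)
helper that fetches a page and returns the /wiki/ suffix of its first link; the
distance_to_philosophy cache and cached_crawl name are illustrative, not part of
the script below:

# Maps a page's /wiki/ suffix to its known number of hops to Philosophy.
distance_to_philosophy = {'Philosophy': 0}

def cached_crawl(start, max_hops=50):
    # Follow first links from `start`, reusing distances from earlier crawls.
    visited = []
    page = start
    while page not in distance_to_philosophy:
        if len(visited) == max_hops:
            return None  # probable loop or dead end; discard this crawl
        visited.append(page)
        page = first_link(page)  # hypothetical helper: one HTTP request per new page
    # Record the remaining distance for every page seen on this crawl.
    remaining = distance_to_philosophy[page]
    for hops_back, seen in enumerate(reversed(visited), start=1):
        distance_to_philosophy[seen] = remaining + hops_back
    return len(visited) + remaining

With this cache, a later crawl can stop issuing requests as soon as it touches any
page that an earlier crawl already resolved.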
from bs4 import BeautifulSoup as bs
from collections import Counter
import re
import requests

URL = 'https://en.wikipedia.org'
WIKI = '/wiki/'


def wiki_crawler():
    # Start each crawl from a random Wikipedia page.
    url_end = WIKI + 'Special:Random'
    path = [url_end]
    while url_end[len(WIKI):] != 'Philosophy':
        # Treat very long paths as loops and give up.
        if len(path) == 50:
            break
        print(url_end)
        r = requests.get(URL + url_end)
        soup = bs(r.content, 'html.parser')
        content = soup.find('div', {'id': 'mw-content-text'})
        # Remove tags that can contain an unwanted link (pronunciation
        # spans, citation superscripts, italic notes, infobox tables).
        for tag in content.find_all(['span', 'small', 'sup', 'i', 'table']):
            tag.replace_with('')
        links = content.find_all('p')
        # Use a regular expression to remove everything within parentheses,
        # so parenthetical links are never chosen as the first link.
        links = re.sub(r' \(.*?\)', '', str(links))
        # Search the cleaned paragraphs for the first article link.
        paragraphs = bs(links, 'html.parser')
        href = paragraphs.find(href=re.compile('^/wiki/'))
        if href is None and content.ul is not None:
            # Exception: fall back to the first list on the page.
            href = content.ul.find(href=re.compile('^/wiki/'))
        if href is None:
            # No links in any usable paragraph or list.
            print("No links in paragraphs.")
            # Pad the path to length >= 50 so it is discarded in main.
            return path * 50
        # Save the link and add it to the path.
        url_end = href.get('href')
        path.append(url_end)
    return path


def main():
    success_paths = []
    for i in range(500):
        print(i)
        path = wiki_crawler()
        if len(path) < 50:
            success_paths.append(path)
    # Create the counter for the distribution of path lengths.
    path_lengths = [len(path) for path in success_paths]
    dist = Counter(path_lengths)
    print("Answer to 8a:")
    print(str(len(success_paths) / 500.0 * 100) + '%')
    print("Answer to 8b:")
    print(dist)


if __name__ == '__main__':
    main()
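
A quick way to sanity-check a single crawl before committing to all 500
iterations (assuming the script is saved as wiki_crawler.py; the gist does not
show its filename):

from wiki_crawler import wiki_crawler

path = wiki_crawler()
if len(path) < 50:
    print(' -> '.join(path))
else:
    print('Crawl discarded: loop or dead end.')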