Wiki Crawler Answers
8a. 94.6%
8b. Distribution of successful path lengths (length: count): {9: 78, 10: 52, 12: 50, 11: 45, 13: 42, 17: 35, 16: 34, 15: 28, 18: 28, 14: 26, 8: 21, 19: 14, 20: 10, 6: 3, 21: 3, 22: 2, 23: 2}
8c. To reduce the number of HTTP requests, we can check whether the current page has already been visited in a previous path. Following the first link from a given page always produces the same result, so that part of the search never needs to be repeated. When a previously seen page turns up in a new path, we can simply add its known remaining distance to Philosophy to the number of steps taken so far.
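A minimal sketch of that idea, assuming a module-level cache keyed by '/wiki/...' URL; the names distance_to_philosophy, record_path, and cached_length are illustrative and not part of the crawler below:

# Hypothetical memoization layer for 8c: remember how many hops each page
# needs to reach Philosophy so that a repeated page costs no extra requests.
distance_to_philosophy = {}

def record_path(path):
    # 'path' is a successful list of '/wiki/...' URLs ending at Philosophy;
    # the last entry is 0 hops away, the one before it 1 hop, and so on.
    for hops, url_end in enumerate(reversed(path)):
        distance_to_philosophy.setdefault(url_end, hops)

def cached_length(path, url_end):
    # If the crawler lands on a page seen in an earlier successful run,
    # the final length is the pages visited so far plus the cached distance.
    if url_end in distance_to_philosophy:
        return len(path) + distance_to_philosophy[url_end]
    return None  # not cached yet; keep crawling
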
from bs4 import BeautifulSoup as bs
from collections import Counter
import re
import requests

URL = 'http://www.wikipedia.org'
WIKI = '/wiki/'

def wiki_crawler():
    # Crawl from a random Wikipedia page, following the first link in the
    # article body until reaching Philosophy or giving up after 50 steps.
    start_search = "Special:Random"
    url_end = WIKI + start_search
    path = [url_end]
    while url_end[6:] != 'Philosophy':
        if len(path) == 50:
            break
        print url_end
        r = requests.get(URL + url_end)
        soup = bs(r.content, "html.parser")
        content = soup.find('div', {'id': 'mw-content-text'})
        # Remove tags that can contain an unwanted link (references,
        # pronunciations, italicised notes, infoboxes).
        for tag in content.find_all(['span', 'small', 'sup', 'i', 'table']):
            tag.replace_with("")
        # Use a regular expression to strip everything inside parentheses,
        # then search the paragraphs for the first wiki link.
        links = re.sub(r' \(.*?\)', '', str(content.find_all('p')))
        paragraphs = bs(links, 'html.parser')
        href = paragraphs.find(href=re.compile('^/wiki/'))
        if href is None:
            # Some pages have no usable paragraph link; fall back to the
            # first list in the article body.
            if content.ul is not None:
                lists = re.sub(r' \(.*?\)', '', str(content.ul))
                href = bs(lists, 'html.parser').find(href=re.compile('^/wiki/'))
            if href is None:
                # No links in any usable paragraph or list: return an
                # over-length path so it is discarded in main.
                print("No links in paragraphs.")
                return path * 50
        # Save the link and add it to the path.
        url_end = href.get('href')
        path.append(url_end)
    return path

def main():
    success_paths = []
    for i in range(500):
        print i
        path = wiki_crawler()
        if len(path) < 50:
            success_paths.append(path)
    # Counter giving the distribution of successful path lengths.
    path_lengths = [len(path) for path in success_paths]
    dist = Counter(path_lengths)
    print "Answer to 8a:"
    print str(len(success_paths) / 500.0 * 100) + '%'
    print "Answer to 8b:"
    print dist


if __name__ == '__main__':
    main()