@phi10s
Created April 19, 2019 14:41
This simple Python script charts a path from any Wikipedia entry to the philosophy page. Like Six Degrees of Kevin Bacon, but for philosophy nerds.
#!/usr/bin/python
__author__ = "phi10s"
'''It is hypothesized that if you recursively click the first link
in the main body of any Wikipedia entry, you will eventually reach
the Philosophy entry. This script maps the path from an arbitrary
entry to the Philosophy entry, and notes the number of hops required.
It's like Six Degrees of Kevin Bacon, but for philosophy nerds.
-phi10s
'''
import requests
from bs4 import BeautifulSoup
import sys
query = sys.argv[1]
wiki_base_url = "https://en.wikipedia.org"
initial_url = wiki_base_url + "/wiki/" + query.strip()
loglist = []
phil_dist = 0
removes = 0
print(initial_url)
# Recursive, because it's more philosophically interesting than iteration
def crawl(url, linknum):
    global phil_dist
    global removes
    # Only count a hop when we arrive at a page fresh (not retrying a link).
    if linknum == 0:
        phil_dist += 1
    response = requests.get(url)
    loglist.append(url)
    soup = BeautifulSoup(response.content, "lxml")
    page_title = soup.select('#firstHeading')[0].text
    if page_title == "Philosophy":
        print("Philosophy!")
        print("\n[*] Distance from %s to philosophy is %i hops!\n"
              % (query, phil_dist - 1))
        sys.exit(0)
    text = soup.select('#mw-content-text')
    paragraphs = text[0].select('p')
    # Skip short stub paragraphs (empty nodes, coordinates, etc.) until we
    # reach one substantial enough to be the real lead paragraph.
    paragraph = paragraphs[0]
    index = 0
    while len(paragraph.text) < 200:
        index += 1
        paragraph = paragraphs[index]
    atags = paragraph.select('a')
    hrefs = [atag.get('href') for atag in atags]
    # The clunky and inelegant bit of code below filters out links in the
    # etymology note (Greek/Latin/English roots), since the hypothesis is
    # about the relation of concepts in the main body. Wikipedia's HTML does
    # not lend itself to elegant parsing here; there is probably a better way.
    links = [href for href in hrefs if href is not None and "/wiki/" in href
             and ":" not in href and "Greek" not in href and "Latin" not in href
             and "English" not in href and "Literal_translation" not in href]
    new_page_href = links[linknum]
    new_page_url = wiki_base_url + new_page_href
    print(new_page_url)
    # If we have already visited this page, retry the current page with the
    # next link in its lead paragraph instead of looping forever.
    if new_page_url in loglist:
        print("[-] Oh no, an infinite loop! Moving to next link.")
        removes += 1
        crawl(url, linknum + 1)
        return
    crawl(new_page_url, 0)

crawl(initial_url, 0)
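For reference, a quick usage sketch. The filename wiki_philosophy.py is just an illustration (save the gist under any name), the argument must be a valid English Wikipedia page title, and the intermediate hops and final count are elided here because the actual path depends on the current state of Wikipedia:

$ python wiki_philosophy.py Bacon
https://en.wikipedia.org/wiki/Bacon
...
Philosophy!

[*] Distance from Bacon to philosophy is N hops!

Note that each hop issues a live HTTP request and a new level of recursion, so very long paths are bounded by Python's default recursion limit (1000), which is far more than the handful of hops typical pages need.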