@phi10s
Created April 19, 2019 14:41
This simple Python script charts a path from any Wikipedia entry to the philosophy page. Like Six Degrees of Kevin Bacon, but for philosophy nerds.
#!/usr/bin/python
__author__ = "phi10s"
'''It is hypothesized that if you recursively click the first link
in the main body of any Wikipedia entry, you will eventually reach
the Philosophy entry. This script maps the path from an arbitrary
entry to the Philosophy entry, and notes the number of hops required.
It's like Six Degrees of Kevin Bacon, but for philosophy nerds.
-phi10s
'''
import requests
from bs4 import BeautifulSoup
import sys
query = sys.argv[1]
wiki_base_url = "https://en.wikipedia.org"
initial_url = wiki_base_url + "/wiki/" + query.strip()
loglist = []
phil_dist = 0
removes = 0
print(initial_url)
# Recursive, because it's more philosophically interesting than iteration
def crawl(url, linknum):
    global phil_dist
    global removes
    # Only count a hop when we arrive at a page fresh (not retrying a link).
    if linknum == 0:
        phil_dist += 1
    response = requests.get(url)
    loglist.append(url)
    soup = BeautifulSoup(response.content, "lxml")
    page_title = soup.select('#firstHeading')[0].text
    if page_title == "Philosophy":
        print("Philosophy!")
        print("\n[*] Distance from %s to philosophy is %i hops!\n"
              % (query, phil_dist - 1))
        sys.exit(0)
    text = soup.select('#mw-content-text')
    paragraphs = text[0].select('p')
    # Skip short stub paragraphs (empty nodes, coordinates, etc.) until we
    # reach one substantial enough to be the real lead paragraph.
    paragraph = paragraphs[0]
    index = 0
    while len(paragraph.text) < 200:
        index += 1
        paragraph = paragraphs[index]
    atags = paragraph.select('a')
    hrefs = [atag.get('href') for atag in atags]
    # The clunky and inelegant bit of code below filters out links in the
    # etymology note (Greek/Latin/English roots), since the hypothesis is
    # about the relation of concepts in the main body. Wikipedia's HTML does
    # not lend itself to elegant parsing here; there is probably a better way.
    links = [href for href in hrefs if href is not None and "/wiki/" in href
             and ":" not in href and "Greek" not in href and "Latin" not in href
             and "English" not in href and "Literal_translation" not in href]
    new_page_href = links[linknum]
    new_page_url = wiki_base_url + new_page_href
    print(new_page_url)
    # If we have already visited this page, retry the current page with the
    # next link in its lead paragraph instead of looping forever.
    if new_page_url in loglist:
        print("[-] Oh no, an infinite loop! Moving to next link.")
        removes += 1
        crawl(url, linknum + 1)
        return
    crawl(new_page_url, 0)

crawl(initial_url, 0)
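For reference, a quick usage sketch. The filename wiki_philosophy.py is just an illustration (save the gist under any name), the argument must be a valid English Wikipedia page title, and the intermediate hops and final count are elided here because the actual path depends on the current state of Wikipedia:

$ python wiki_philosophy.py Bacon
https://en.wikipedia.org/wiki/Bacon
...
Philosophy!

[*] Distance from Bacon to philosophy is N hops!

Note that each hop issues a live HTTP request and a new level of recursion, so very long paths are bounded by Python's default recursion limit (1000), which is far more than the handful of hops typical pages need.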