@dmpayton
Created July 27, 2016 03:11
FresnoPython web crawler demo
1. Create a Python 3 virtualenv: `mkvirtualenv philosophy -p /usr/bin/python3.5`
2. Install dependencies: `pip install requests beautifulsoup4`
3a. Run `python crawler.py` to start from a random article
3b. Run `python crawler.py <article_slug>` to start from a specific article
4. Profit^wPhilosophy
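For example, with a hypothetical slug, `python crawler.py Banana` starts the chain at https://en.wikipedia.org/wiki/Banana; with no argument the crawl starts from `Special:Random`. Each hop prints a numbered article title until the crawl reaches Philosophy, detects a loop, or runs out of links.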
#!/usr/bin/env python

# https://xkcd.com/903/ (2011-05-25)
# https://en.wikipedia.org/wiki/Wikipedia:Getting_to_Philosophy

import re
import sys
from itertools import chain
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup
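
# Pages we've already visited; crawl() appends to this to detect loops.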
crawled = []


def extract_paragraphs(soup):
    # Remove italic text, hatnotes, and tables
    invalid = chain(
        soup.find_all('i'),
        soup.find_all('div', class_='hatnote'),
        soup.find_all('table'),
    )
    for item in invalid:
        item.extract()

    # Loop through paragraphs and list items, removing text in ()'s
    # and yielding cleaned content.
    for paragraph in chain(soup.find_all('p'), soup.find_all('li')):
        # Track how many ()'s we're in
        paren_count = 0
        # A flag to skip ()'s inside tags (e.g., a[href])
        skip = False
        # Keep track of cleaned content
        cleaned = ''
        for char in str(paragraph):
            # Keep track of when we enter and exit tags
            if char == '<':
                skip = True
            elif char == '>':
                skip = False
            if skip is False:
                # Track how deeply nested in ()'s we are
                if char == '(':
                    paren_count += 1
                elif char == ')':
                    paren_count -= 1
                    continue
            # If we're not inside ()'s, the character is clean
            if paren_count == 0:
                cleaned += char
        yield BeautifulSoup(cleaned, 'html.parser')
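
# A quick illustration of the cleaning above (hypothetical snippet):
#
#   soup = BeautifulSoup('<p>Kant (1724-1804) wrote <i>a lot</i>.</p>',
#                        'html.parser')
#   list(extract_paragraphs(soup))  # -> [<p>Kant  wrote .</p>]
#
# The <i> element is extracted up front, '(1724-1804)' is dropped by the
# paren counter, and markup outside parentheses survives.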


def crawl(page, n=0):
    if page == quote('/wiki/Philosophy'):
        # We've arrived
        print('{0}. !! Philosophy !!'.format(n))
        return

    # Track what pages we've crawled so we can detect infinite loops, e.g.
    # /wiki/Net_register_tonnage -> /wiki/Gross_register_tonnage
    if page in crawled:
        print('{0}. !! Infinite loop detected !!'.format(n))
        print(page)
        return
    else:
        crawled.append(page)

    # Get the page content
    url = 'https://en.wikipedia.org{0}'.format(page)
    response = requests.get(url)
    html = response.content

    # Parse the HTML
    soup = BeautifulSoup(html, 'html.parser')

    # Print the title of the page
    title = soup.find('h1', id='firstHeading')
    article = soup.find(id='mw-content-text')
    print('{0}. {1}'.format(n, title.text))

    # Iterate over the paragraphs until we find one where the first
    # link is another Wikipedia page, then crawl that page.
    anchor = None
    for para in extract_paragraphs(article):
        anchor = para.find('a', href=re.compile(r'^/wiki/[^\:]+$'))
        if anchor is not None:
            next = dict(anchor.attrs)['href']
            return crawl(next, n + 1)

    if anchor is None:
        print('The trail went cold. :(')
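
# Note: each hop is a recursive crawl() call. Chains to Philosophy are
# usually short, well under CPython's default recursion limit of 1000.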


if __name__ == '__main__':
    try:
        page = '/wiki/{0}'.format(sys.argv[1])
    except IndexError:
        page = '/wiki/Special:Random'
    crawl(page)
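
The script can also be driven from a Python shell; a minimal sketch, assuming the file is saved as `crawler.py` and using a hypothetical starting slug:

```python
>>> from crawler import crawl
>>> crawl('/wiki/Banana')  # prints one numbered title per hop
```

Note that `crawled` is module-level state, so a second `crawl()` call in the same session can report a false infinite loop.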
@MrCsabaToth

Interesting! French, Spanish, and Hungarian lead to a lot of infinite loops, so apparently only English articles are well formed enough to avoid them. Thanks to urllib, the script supports languages whose translation of "philosophy" contains accents, like filozófia. Pretty cool.

@MrCsabaToth

Question: does this script use any features of Python 3 which wouldn't work with 2.7?

@dmpayton
Copy link
Author

@MrCsabaToth The only change you should need for Python 2 is to change line 9 to `from urllib import quote`.
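
For a single script that runs under both 2.7 and 3.x, a guarded import is a common pattern (a sketch, not part of the gist):

```python
try:
    from urllib.parse import quote  # Python 3
except ImportError:
    from urllib import quote  # Python 2
```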
