@dmpayton
Created July 27, 2016 03:11
FresnoPython web crawler demo
1. Create a Python 3 virtualenv: `mkvirtualenv philosophy -p /usr/bin/python3.5`
2. Install dependencies: `pip install requests beautifulsoup4`
3a. Run `python crawler.py` to start from a random article, or
3b. Run `python crawler.py <article_slug>` to start from a specific article (a sample session is sketched below).
4. Profit^wPhilosophy
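A run prints one numbered line per hop and ends when it reaches Philosophy, detects a loop, or runs out of links. A hypothetical session (the chain shown is a placeholder, not real output):

    $ python crawler.py Banana
    0. Banana
    1. ...
    7. !! Philosophy !!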
#!/usr/bin/env python
# https://xkcd.com/903/ (2011-05-25)
# https://en.wikipedia.org/wiki/Wikipedia:Getting_to_Philosophy

import re
import sys
from itertools import chain
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup

crawled = []


def extract_paragraphs(soup):
    # Remove italic text, hatnote divs, and tables
    invalid = chain(
        soup.find_all('i'),
        soup.find_all('div', class_='hatnote'),
        soup.find_all('table'),
    )
    for item in invalid:
        item.extract()

    # Loop through paragraphs and list items, removing text in ()'s
    # and yielding cleaned content.
    for paragraph in chain(soup.find_all('p'), soup.find_all('li')):
        # Track how many ()'s we're in
        paren_count = 0
        # A flag to skip ()'s inside tags (e.g., a[href])
        skip = False
        # Keep track of cleaned content
        cleaned = ''
        for char in str(paragraph):
            # Keep track of when we enter and exit tags
            if char == '<':
                skip = True
            elif char == '>':
                skip = False
            if skip is False:
                # Track how deeply nested in ()'s we are
                if char == '(':
                    paren_count += 1
                elif char == ')':
                    paren_count -= 1
                    continue
            # If we're not inside ()'s, the character is clean
            if paren_count == 0:
                cleaned += char
        yield BeautifulSoup(cleaned, 'html.parser')


def crawl(page, n=0):
    if page == quote('/wiki/Philosophy'):
        # We've arrived
        print('{0}. !! Philosophy !!'.format(n))
        return

    # Track what pages we've crawled so we can detect infinite loops, e.g.
    # /wiki/Net_register_tonnage -> /wiki/Gross_register_tonnage
    if page in crawled:
        print('{0}. !! Infinite loop detected !!'.format(n))
        print(page)
        return
    else:
        crawled.append(page)

    # Get the page content
    url = 'https://en.wikipedia.org{0}'.format(page)
    response = requests.get(url)
    html = response.content

    # Parse the html
    soup = BeautifulSoup(html, 'html.parser')

    # Print the title of the page
    title = soup.find('h1', id='firstHeading')
    article = soup.find(id='mw-content-text')
    print('{0}. {1}'.format(n, title.text))

    # Iterate over the paragraphs until we find one whose first
    # link is another Wikipedia article, then crawl that page.
    anchor = None
    for para in extract_paragraphs(article):
        anchor = para.find('a', href=re.compile(r'^/wiki/[^:]+$'))
        if anchor is not None:
            next_page = anchor['href']
            return crawl(next_page, n + 1)
    if anchor is None:
        print('The trail went cold. :(')


if __name__ == '__main__':
    try:
        page = '/wiki/{0}'.format(sys.argv[1])
    except IndexError:
        page = '/wiki/Special:Random'
    crawl(page)
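To see what `extract_paragraphs` does in isolation, here is a quick sanity check (the HTML fragment and its output are illustrative, and it assumes crawler.py is importable as a module named `crawler`):

    from bs4 import BeautifulSoup
    from crawler import extract_paragraphs

    html = ('<p>A dog (<i>Canis familiaris</i>) is a '
            '<a href="/wiki/Mammal">mammal</a>.</p>')
    for para in extract_paragraphs(BeautifulSoup(html, 'html.parser')):
        print(para.get_text())
    # Prints 'A dog  is a mammal.' -- the italic binomial name and the
    # parentheses around it are stripped, leaving a doubled space behind.

The cleaned paragraphs are then scanned with the pattern `^/wiki/[^:]+$`, which matches only plain article links; namespaced pages such as `/wiki/File:...` contain a colon and are skipped.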
@dmpayton (Author)

@MrCsabaToth The only change you should need for Python 2 is to replace `from urllib.parse import quote` with `from urllib import quote`.
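If you want a single script that runs under both Python 2 and 3, the usual guarded import works here (a minimal sketch, not part of the original gist):

    try:
        from urllib.parse import quote  # Python 3
    except ImportError:
        from urllib import quote  # Python 2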
