@tiffon
Created April 4, 2015 08:39
Beautiful Soup 4 code that scrapes a few questions off stackoverflow.com
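The script works in two passes: it first fetches the tag listing page and parses each .question-summary block into a dict, then fetches each question's own page to collect poster, comment, and answer activity. Requests are throttled with a short sleep between fetches. It targets Python 2 (urllib2, print statements) and needs the beautifulsoup4 package installed.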
#! /usr/bin/env python
import sys
import time
import urllib2
import urlparse

from bs4 import BeautifulSoup

HOMEPAGE_URL = 'http://stackoverflow.com/'
TAG_PAGE_URL = 'http://stackoverflow.com/questions/tagged/web-scraping?sort=newest&pageSize=10'


def get_page(url):
    """Fetch a URL and return its HTML, or None on an HTTP error."""
    # throttle requests so we don't hammer the server
    time.sleep(1.5)
    print 'url:', url
    try:
        return urllib2.urlopen(url).read()
    except urllib2.HTTPError as err:
        print 'urllib2.HTTPError: \n\t url: %s \n\t err: %s' % (url, err)

def get_questions():
    """Scrape the question summaries from the tag listing page."""
    html = get_page(TAG_PAGE_URL)
    if html is None:
        return []
    soup = BeautifulSoup(html, 'html.parser')
    questions = []
    for question in soup.select('.question-summary'):
        q = {}
        # activity metrics
        q['attn'] = {
            'votes': question.find(class_='votes').get_text(),
            'answers': question.find(class_='status').get_text(),
            'views': question.find(class_='views').get_text()
        }
        # question info
        summary = question.find(class_='summary')
        post_details = summary.h3.a
        q['post_url'] = post_details['href']
        q['title'] = post_details.get_text()
        q['excerpt'] = summary.find(class_='excerpt').get_text()
        q['tags'] = [
            {
                'url': a['href'],
                'name': a.get_text()
            }
            for a in summary.select('.tags a')
        ]
        questions.append(q)
    return questions
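
# For reference, each dict that get_questions() appends has roughly this
# shape (the values below are illustrative placeholders, not scraped data):
#
#   {
#       'attn': {'votes': '3', 'answers': '1', 'views': '42 views'},
#       'post_url': '/questions/12345/example-title',
#       'title': 'Example title',
#       'excerpt': 'First few lines of the question body...',
#       'tags': [{'url': '/questions/tagged/web-scraping',
#                 'name': 'web-scraping'}]
#   }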

def get_poster_details(elm):
    """Pull the author and timestamp out of a post signature element."""
    user_info = elm.find(class_='user-details').a
    return {
        'user_login': None if not user_info else user_info.get_text(),
        'user_url': None if not user_info else user_info['href'],
        'date': elm.find(class_='user-action-time').span['title']
    }


def get_comment_activity(elm):
    """Pull the commenter and timestamp out of a comment body element."""
    user_link = elm.find(class_='comment-user')
    return {
        'user_login': user_link.get_text(),
        'user_url': user_link['href'],
        'date': elm.find(class_='comment-date').span['title']
    }

def get_question_activity(url):
    """Scrape poster, comment, and answer activity from a question page."""
    rv = {}
    html = get_page(url)
    if html is None:
        return rv
    soup = BeautifulSoup(html, 'html.parser')
    # question poster and comments
    question = soup.find(class_='question')
    poster = question.select('.postcell .post-signature.owner')[0]
    rv['question'] = {
        'asked_by': get_poster_details(poster),
        'comments': [
            get_comment_activity(elm)
            for elm in question.select('.comments .comment-body')
        ]
    }
    # answer posters and comments
    answers = []
    for answer in soup.select('#answers .answer'):
        poster = answer.select('.answercell .user-info')[-1]
        answers.append({
            'answered_by': get_poster_details(poster),
            'comments': [
                get_comment_activity(elm)
                for elm in answer.select('.comments .comment-body')
            ]
        })
    rv['answers'] = answers
    return rv
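
# The structure returned by get_question_activity() mirrors the page layout
# (again, the values here are placeholders):
#
#   {
#       'question': {'asked_by': {...}, 'comments': [{...}, ...]},
#       'answers': [{'answered_by': {...}, 'comments': [...]}, ...]
#   }
#
# where the {...} entries are the dicts built by get_poster_details() and
# get_comment_activity() above.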

def get_data():
    """Scrape the question list, then fetch the activity for each question."""
    questions = get_questions()
    if not questions:
        return
    for q in questions:
        # post_url is root-relative (e.g. '/questions/...'), so join it onto
        # the homepage URL rather than naively concatenating the two strings.
        q['activity'] = get_question_activity(
            urlparse.urljoin(HOMEPAGE_URL, q['post_url']))
    return questions

# misc code to show the downloaded data, should be the same between both samples
if __name__ == '__main__':
    data = get_data()
    if not data:
        print 'Problem downloading Stack Overflow page: %r' % data
    else:
        try:
            import simplejson as json
        except ImportError:
            import json
        if len(sys.argv) > 1:
            with open(sys.argv[-1], 'w') as f:
                json.dump(data, f, indent=4)
        else:
            print json.dumps(data, indent=4)
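To try it, save the script (any filename works; scrape_so.py below is just a placeholder) and run it with Python 2. With no arguments it pretty-prints the scraped data as JSON to stdout; pass a filename as the last argument to write the JSON to that file instead:

    python scrape_so.py
    python scrape_so.py questions.json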