Created April 4, 2015 08:39
Beautiful Soup 4 code that scrapes a few questions off stackoverflow.com
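As written, the script targets Python 2 (it uses print statements and urllib2) and needs the beautifulsoup4 package, which provides the bs4 module. Run it with no arguments to print the scraped questions as JSON, or pass an output filename as the last argument to write the JSON to a file.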
#! /usr/bin/env python

import sys
import time
import urllib2

from bs4 import BeautifulSoup

HOMEPAGE_URL = 'http://stackoverflow.com/'
TAG_PAGE_URL = 'http://stackoverflow.com/questions/tagged/web-scraping?sort=newest&pageSize=10'

def get_page(url):
    # Pause briefly between requests to be polite to the server.
    time.sleep(1.5)
    print 'url:', url
    try:
        return urllib2.urlopen(url).read()
    except urllib2.HTTPError as err:
        # Log the failure and fall through, returning None.
        print 'urllib2.HTTPError: \n\t url: %s \n\t err: %s' % (url, err)

def get_questions():
    html = get_page(TAG_PAGE_URL)
    soup = BeautifulSoup(html)
    questions = []
    for question in soup.select('.question-summary'):
        q = {}
        # activity metrics
        q['attn'] = {
            'votes': question.find(class_='votes').get_text(),
            'answers': question.find(class_='status').get_text(),
            'views': question.find(class_='views').get_text()
        }
        # question info
        summary = question.find(class_='summary')
        post_details = summary.h3.a
        q['post_url'] = post_details['href']
        q['title'] = post_details.get_text()
        q['excerpt'] = summary.find(class_='excerpt').get_text()
        q['tags'] = [
            {
                'url': a['href'],
                'name': a.get_text()
            }
            for a in summary.select('.tags a')
        ]
        questions.append(q)
    return questions

def get_poster_details(elm):
    user_info = elm.find(class_='user-details').a
    return {
        'user_login': None if not user_info else user_info.get_text(),
        'user_url': None if not user_info else user_info['href'],
        'date': elm.find(class_='user-action-time').span['title']
    }

def get_comment_activity(elm):
    user_link = elm.find(class_='comment-user')
    return {
        'user_login': user_link.get_text(),
        'user_url': user_link['href'],
        'date': elm.find(class_='comment-date').span['title']
    }

def get_question_activity(url):
    rv = {}
    html = get_page(url)
    soup = BeautifulSoup(html)
    # question poster and comments
    question = soup.find(class_='question')
    poster = question.select('.postcell .post-signature.owner')[0]
    rv['question'] = {
        'asked_by': get_poster_details(poster),
        'comments': [
            get_comment_activity(elm)
            for elm in question.select('.comments .comment-body')
        ]
    }
    answers = []
    for answer in soup.select('#answers .answer'):
        poster = answer.select('.answercell .user-info')[-1]
        answers.append({
            'answered_by': get_poster_details(poster),
            'comments': [
                get_comment_activity(elm)
                for elm in answer.select('.comments .comment-body')
            ]
        })
    rv['answers'] = answers
    return rv

def get_data():
    questions = get_questions()
    if not questions:
        return
    for q in questions:
        q['activity'] = get_question_activity(HOMEPAGE_URL + q['post_url'])
    return questions

# misc code to show the downloaded data, should be the same between both samples
if __name__ == '__main__':
    data = get_data()
    if not data:
        print 'Problem downloading Stack Overflow page: %r' % data
    else:
        try:
            import simplejson as json
        except ImportError:
            import json
        if len(sys.argv) > 1:
            with open(sys.argv[-1], 'w') as f:
                json.dump(data, f, indent=4)
        else:
            print json.dumps(data, indent=4)
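
The snippet below is a minimal sketch of reusing the scraper as a module rather than running it as a script; the module name scrape_so is an assumption, since the gist does not specify a file name.

# Sketch: import the gist as a module and print a short summary of each
# question. Assumes the gist is saved as scrape_so.py (hypothetical name).
import scrape_so

questions = scrape_so.get_data()
for q in questions or []:
    print q['title']
    print '  tags:', ', '.join(t['name'] for t in q['tags'])
    print '  answers scraped:', len(q['activity']['answers'])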