Skip to content

Instantly share code, notes, and snippets.

@dbalan
Created July 14, 2016 13:37
Show Gist options
  • Save dbalan/ad726479227af10805e758e6244d341d to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
from gevent import monkey
monkey.patch_all()
import sys
from gevent.pool import Pool
from bs4 import BeautifulSoup
import requests
from math import ceil
import json
# Listing URL: {} is the 1-based page number. pagesize must stay in sync
# with the per-page count assumed by fetch_question_list (15).
# NOTE: a later debug assignment used to overwrite this with the un-paginated
# listing URL, which made URL.format(page_num) a no-op (no "{}" placeholder)
# so every page fetch returned the same first page. Keep only this one.
URL = "https://stackoverflow.com/questions/tagged/python?page={}&sort=newest&pagesize=15"
BASE_URL = "https://stackoverflow.com/"
def _concat(a, b):
tmp = []
tmp.extend(a)
tmp.extend(b)
return tmp
def q_url(question):
    """Build the absolute URL for a question-summary node.

    Args:
        question: a BeautifulSoup tag whose first <a> carries the
            site-relative question href (starts with '/').

    Returns:
        The absolute question URL. The leading slash of the href is
        stripped so joining with BASE_URL (which ends in '/') does not
        produce 'stackoverflow.com//questions/...'.
    """
    return BASE_URL + question.a['href'].lstrip('/')
def fetch_list(page_num):
    """Fetch one listing page and return the question URLs found on it.

    Args:
        page_num: 1-based page index interpolated into URL.

    Returns:
        A list of absolute question URLs (a concrete list, not a lazy
        map object, so callers can slice and re-iterate it on py3 too).

    Raises:
        requests.HTTPError: if the listing page request fails, instead of
            silently parsing an error page into an empty result.
    """
    resp = requests.get(URL.format(page_num))
    resp.raise_for_status()
    parsed = BeautifulSoup(resp.text, 'html.parser')
    summaries = parsed.find_all('div', 'question-summary')
    return [q_url(q) for q in summaries]
def fetch_question_list(number=20):
    """Collect the URLs of the *number* newest questions.

    Fetches ceil(number / 15) listing pages (15 questions per page, per
    the pagesize in URL) and truncates the combined result to exactly
    *number* URLs.

    Args:
        number: how many question URLs to return (default 20).

    Returns:
        A list of at most *number* question URLs; empty when number <= 0
        (the old reduce() version raised TypeError on an empty page list).
    """
    pages = int(ceil(number / 15.0))
    # Accumulate explicitly rather than via the py2-only builtin reduce(),
    # which also needed _concat and built a fresh list per page.
    urls = []
    for page in range(1, pages + 1):
        urls.extend(fetch_list(page))
    return urls[:number]
def get_question_body(url):
    """Download *url* and return a (url, html_text) pair.

    The URL is echoed back alongside the body so the caller can keep
    track of which page each response belongs to.
    """
    response = requests.get(url)
    return url, response.text
def extract_question(url, url_text):
    """Parse a StackOverflow question page into a dict of its details.

    Args:
        url: the question's URL (echoed back into the result).
        url_text: the page's raw HTML.

    Returns:
        dict with keys: title, url, tags (list of str), vote_count,
        star_count, body, answer_count.
    """
    soup = BeautifulSoup(url_text, 'html.parser')
    qbody = soup.find(id="question")
    title = soup.find(id="question-header").text.strip('\n')
    # Concrete list (not a py3 map object) so the result stays serializable.
    tags = [t.text for t in qbody.find_all("a", "post-tag")]
    body = qbody.find('div', 'post-text').text
    vote_body = qbody.find("div", "vote")
    vote_count = int(vote_body.span.text)
    # The favourite-count div may be missing entirely for unstarred
    # questions; the old code dereferenced .text before checking and
    # raised AttributeError in that case.
    fav_div = vote_body.div
    fav_count = int(fav_div.text) if fav_div is not None and fav_div.text else 0
    try:
        header = soup.find(id="answers-header").h2.text
        ans = int(header.strip().split()[0])
    except (AttributeError, IndexError, ValueError):
        # No answers section (find -> None), or a header that does not
        # start with a number: report zero answers. Narrow exceptions
        # replace the old bare except that also hid real bugs.
        ans = 0
    return dict(
        title=title,
        url=url,
        tags=tags,
        vote_count=vote_count,
        star_count=fav_count,
        body=body,
        answer_count=ans,
    )
if __name__ == "__main__":
    # Parse the single CLI argument: how many question URLs to scrape.
    try:
        no_urls = int(sys.argv[1])
    except (IndexError, ValueError):
        # Missing or non-integer argument. print() as a function works on
        # both py2 (single parenthesised argument) and py3, unlike the old
        # py2-only print statements used here before.
        print("Usage: scriptname no_of_urls")
        sys.exit(-1)
    url_list = fetch_question_list(no_urls)
    # Fetch the question pages concurrently; monkey.patch_all() at the top
    # of the file makes requests cooperative under gevent.
    pool = Pool(20)
    url_data = pool.map(get_question_body, url_list)
    details = [extract_question(page_url, html) for page_url, html in url_data]
    # Emit structured JSON (the json import existed but was never used;
    # the old code printed the raw Python repr of the list).
    print(json.dumps(details, indent=2))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment