Skip to content

Instantly share code, notes, and snippets.

@yydai
Created July 9, 2017 08:43
Show Gist options
  • Save yydai/bb82754da20ff26b5b1716ed7a64ec0d to your computer and use it in GitHub Desktop.
Save yydai/bb82754da20ff26b5b1716ed7a64ec0d to your computer and use it in GitHub Desktop.
#coding:utf-8
import re
import requests
from bs4 import BeautifulSoup
def join_str(contents):
    """Return the string items of ``contents`` in their original order.

    Non-string items are dropped. Despite the name, nothing is joined
    here; callers join the returned list themselves.

    Args:
        contents: iterable of arbitrary objects.

    Returns:
        A list holding only the items that are text strings.
    """
    try:
        text_types = basestring  # Python 2: covers both str and unicode
    except NameError:
        text_types = str  # Python 3: basestring no longer exists
    return [item for item in contents if isinstance(item, text_types)]
# fields
"""
Question:
title
asktime
ask_name
Answer:
username
post_time
content
"""
# Title of Baidu Zhidao's "information notice" page (the literal reads
# "Baidu Zhidao - information notice" in Chinese). A page with this title is
# treated as not containing a question.
_ERROR_PAGE_TITLE = u'\u767e\u5ea6\u77e5\u9053 - \u4fe1\u606f\u63d0\u793a'

# Element ids of a single answer and of the body of a single answer.
_ANSWER_ID_RE = re.compile(r'answer-\d+')
_ANSWER_CONTENT_ID_RE = re.compile(r'answer-content-\d+')


def _last_string(tag):
    """Return the last direct string child of ``tag`` with leading/trailing
    newlines removed, or None when the tag has no string children."""
    strings = join_str(tag.contents)
    if not strings:
        return None
    return strings[-1].strip('\n')


def _parse_question(soup, title):
    """Build the question dict (title, asktime, ask_name) from the page."""
    question = {'title': title}

    # 'asktime' is only set when an ask-time span with text exists; if there
    # are several such spans the last one wins.
    for span in soup.find_all('span', attrs={"class": "grid-r ask-time"}):
        asked = _last_string(span)
        if asked is not None:
            question['asktime'] = asked

    # Default to 'anonymous' unless a user-name link is tagged as the asker.
    question['ask_name'] = 'anonymous'
    for link in soup.find_all('a', class_='user-name'):
        if link.attrs.get('alog-action') == 'qb-ask-uname':
            question['ask_name'] = link.string
            break
    return question


def _parse_answers(soup):
    """Collect answers: dicts for regular answers, plain strings for the
    quality / best / recommended answer sections."""
    answers = []

    container = soup.find('div', id='wgt-answers')
    if container is not None:
        for block in container.find_all('div', id=_ANSWER_ID_RE):
            # A fresh dict per answer; sharing one dict across iterations
            # would make every list entry alias the last answer.
            entry = {}
            author = block.find('a', class_='user-name')
            author_name = author.string if author is not None else None
            entry['username'] = author_name or "Enthusiastic user"

            time_span = block.find('span')
            entry['post_time'] = (
                _last_string(time_span) if time_span is not None else None
            )

            body = block.find('div', id=_ANSWER_CONTENT_ID_RE)
            if body is not None:
                # The text lives in the first of span / p / pre that exists;
                # 'content' is left unset when none of them is present.
                holder = body.span or body.p or body.pre
                if holder is not None:
                    entry['content'] = '\n'.join(join_str(holder.contents))
            answers.append(entry)

    quality = soup.find('div', id='wgt-quality')
    if quality is not None:
        detail = quality.find('div', class_='quality-content-detail content')
        if detail is not None:
            quality_text = '\n'.join(join_str(detail.contents))
            if quality_text:  # an empty quality answer is not recorded
                answers.append(quality_text)

    for content_type in ('best-content', 'recommend-content'):
        block = soup.find('pre', id=re.compile(content_type))
        if block is not None:
            answers.append('\n'.join(join_str(block.contents)))

    return answers


def get_content(id, timeout=10):
    """Fetch one Baidu Zhidao question page and scrape it.

    Args:
        id: question id substituted into the page URL. (The name shadows
            the builtin ``id``; it is kept so existing callers still work.)
        timeout: seconds passed to ``requests.get`` so a stalled server
            cannot block the caller forever.

    Returns:
        None when the HTTP status is not 200 or the page is the
        "information notice" page. Otherwise a dict
        ``{'question': {...}, 'answer': [...]}`` where ``question`` has the
        keys ``title``, ``ask_name`` and (when found) ``asktime``, and
        ``answer`` is a list mixing dicts (``username``, ``post_time``,
        optional ``content``) with plain strings for the quality, best and
        recommended answers.

    Raises:
        Whatever ``requests.get`` raises (connection errors, timeouts) is
        propagated unchanged.
    """
    url = 'http://zhidao.baidu.com/question/{}.html'.format(id)
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return None

    # Decode the body as GBK rather than whatever requests guessed.
    response.encoding = 'gbk'
    soup = BeautifulSoup(response.text, 'lxml')

    # A page without <title> yields title None instead of an AttributeError.
    title = soup.title.string if soup.title is not None else None
    if title == _ERROR_PAGE_TITLE:
        return None

    return {
        'question': _parse_question(soup, title),
        'answer': _parse_answers(soup),
    }
# Demo crawl: fetch question ids 10000-10099 and print what was scraped.
# NOTE: this loop sits at module level, so it also runs on import.
for i in range(10000, 10100):
    result = get_content(i)
    # Single-argument print(...) and str.format produce the same output on
    # Python 2 and Python 3, unlike the print statement and xrange.
    print(result)
    if not result:
        continue
    print('{} {}'.format(i, result.get('question')))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment