Skip to content

Instantly share code, notes, and snippets.

@junjiah
Last active December 22, 2015 06:39
Show Gist options
  • Save junjiah/6432502 to your computer and use it in GitHub Desktop.
Save junjiah/6432502 to your computer and use it in GitHub Desktop.
Utilize the daily.zhihu API for info extraction and peewee for database management.
#coding:utf-8
import urllib2
import json
import peewee
from datetime import date, timedelta
from BeautifulSoup import BeautifulSoup
today = date.today()
one_day = timedelta(days=1)
date_i = date(2013,5,20)
daily_zhihu_before = 'http://news.at.zhihu.com/api/1.2/news/before/'
headers = {'Referer':'http://www.google.com', 'User-Agent':'Opera/9.99',}
myDB = peewee.MySQLDatabase("zhihu", host="127.0.0.1", user="edward")
class DailyZhihu(peewee.Model):
authors = peewee.CharField(max_length=100)
q_title = peewee.CharField(null=True)
a_title = peewee.CharField(max_length=100)
a_id = peewee.IntegerField()
img_url = peewee.CharField()
content = peewee.TextField()
class Meta:
database = myDB
DailyZhihu.create_table()
while date_i < today:
url = daily_zhihu_before + date_i.strftime("%Y%m%d")
req = urllib2.Request(url, None, headers)
res = json.load(urllib2.urlopen(req))
for i in res[u'news']:
article_title = i[u'title']
article_img = i[u'image']
article_id = i[u'id']
article_req = urllib2.Request(i[u'url'], None, headers)
article = json.load(urllib2.urlopen(article_req))
soup = BeautifulSoup(article[u'body'])
article_authors = " ".join([a.text.replace(u',', '') for a in
soup.findAll(attrs={'class':'author'})])
question = " ".join([a.text for a in
soup.findAll(attrs={'class':'question-title'})])
article_content = "\n".join([a.text for a in
soup.findAll(attrs={'class':'content'})])
store_article = DailyZhihu(authors=article_authors,
q_title=question,
a_title=article_title,
a_id=article_id,
img_url=article_img,
content=article_content)
store_article.save()
date_i += one_day
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment