Skip to content

Instantly share code, notes, and snippets.

@laispace
Last active October 22, 2015 10:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save laispace/31d60843f60614026743 to your computer and use it in GitHub Desktop.
Save laispace/31d60843f60614026743 to your computer and use it in GitHub Desktop.
统计 alloyteam.com 每月博文阅读数量
__author__ = 'alloyteam.com'
# -*- coding: UTF-8 -*-
import urllib.request
import re
class Spider:
    """Fetch alloyteam.com blog index pages and extract per-post metadata.

    Pages ``pageStartNum`` through ``pageEndNum`` (both inclusive) are
    downloaded from ``baseUrl + <page number>``.

    ``year``, ``monthStart``, ``monthEnd`` and ``sortOutput`` are stored on
    the instance but are not read by any method of this class.
    """

    # Compiled once for the whole class instead of on every getPost() call.
    # The six capture groups are, in order: post URL, post title, date text,
    # author URL, author name, view-count text. re.S lets '.' span newlines
    # because one post's markup stretches over several lines of HTML.
    _postPattern = re.compile(
        r'<a href="(.*?)".*?class="blogTitle".*?>(.*?)</a>'
        r'.*?<div class="blogPs">.*?on (.*?) by <a href="(.*?)".*?rel="author">(.*?)</a>.*?view: (.*?) </div>',
        re.S,
    )

    def __init__(self, pageStartNum, pageEndNum, year, monthStart, monthEnd, sortOutput):
        """Store the crawl range and the reporting options."""
        self.baseUrl = 'http://alloyteam.com/page/'
        self.pageStartNum = pageStartNum
        self.pageEndNum = pageEndNum
        self.year = year
        self.monthStart = monthStart
        self.monthEnd = monthEnd
        self.sortOutput = sortOutput

    def getPage(self, pageNum):
        """Download index page ``pageNum`` and return its HTML as a str.

        Prints the requested URL and the HTTP status code as a progress
        trace. Errors raised by urllib or by the UTF-8 decode propagate to
        the caller.
        """
        url = self.baseUrl + str(pageNum)
        req = urllib.request.Request(url)
        # The context manager closes the connection even if read/decode fails.
        with urllib.request.urlopen(req) as res:
            page = res.read().decode('utf8')
            code = res.getcode()
        print(url, code)
        return page

    def getPost(self, pageNum):
        """Return the posts found on index page ``pageNum``.

        Each post is a 6-tuple of strings:
        ``(postUrl, title, dateText, authorUrl, authorName, viewCountText)``.
        An empty list is returned when nothing on the page matches.
        """
        page = self.getPage(pageNum)
        # With six groups, findall already yields a list of 6-tuples.
        return self._postPattern.findall(page)

    def getPosts(self):
        """Return the posts of every page in the configured range, in page order."""
        posts = []
        for pageNum in range(self.pageStartNum, self.pageEndNum + 1):
            posts.extend(self.getPost(pageNum))
        return posts
# Crawl and report configuration.
pageStartNum = 1    # first index page to fetch (inclusive)
pageEndNum = 10     # last index page to fetch (inclusive)
year = 2015         # year whose posts are reported
monthStart = 6      # first month reported
monthEnd = 11       # loop bound; range() below stops before this month
sortOutput = True   # when true, each month's posts are ordered by view count

spider = Spider(pageStartNum, pageEndNum, year, monthStart, monthEnd, sortOutput)
posts = spider.getPosts()

# Column layout shared by the header row and every post row.
formatStr = '{:<5} {:<10} {:<15} {:<15} {:<50}'

# NOTE(review): monthEnd is exclusive here while pageEndNum is inclusive in
# Spider.getPosts -- confirm whether month `monthEnd` should be reported too.
for month in range(monthStart, monthEnd):
    # Zero-padded label such as '2015年06月'; a post belongs to the month
    # when this label occurs in its date text (post[2]).
    yearMonthStr = '{}年{:02d}月'.format(year, month)
    monthPosts = [post for post in posts if yearMonthStr in post[2]]

    if sortOutput:
        # View counts are text with thousands separators ('1,090'), so the
        # commas are stripped before the numeric comparison.
        monthPosts.sort(key=lambda post: int(post[5].replace(',', '')), reverse=True)

    print(yearMonthStr + '写的文章有:')
    print(formatStr.format('排名', '阅读量', '时间', '作者', '标题'))
    for rank, post in enumerate(monthPosts, start=1):
        print(formatStr.format(rank, post[5], post[2], post[4], post[1]))
@laispace
Copy link
Author

执行:

$ python3 count-alloy-posts-per-month.py

输出:

2015年10月写的文章有:
排名    阅读量        时间              作者              标题
1     1,090      2015年10月12日     TAT.heyli       【AlloyTeam优化系列】Node直出让你的网页秒开
2     745        2015年10月12日     TAT.heyli       【AlloyTeam优化系列】纯前端优化首屏时间
3     629        2015年10月12日     TAT.heyli       【AlloyTeam优化系列】构建篇
4     612        2015年10月13日     TAT.mandyluo    angular应用如何实现按需加载
5     291        2015年10月19日     TAT.ronnie      【转向Javascript系列】从setTimeout说事件循环模型

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment