Created
March 18, 2009 10:20
-
-
Save jjgod/81040 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# coding: utf-8 | |
import codecs
import ConfigParser
import os
import subprocess
import time
import urllib
# Persistent settings (board list and per-board "lastcheck" values) are read
# from ias.cfg in the current working directory at import time.
config = ConfigParser.ConfigParser()
config.read("ias.cfg")

# URL templates: board index (latest page), a specific index page, and a
# single post identified by numeric board id and post id.
board_url = "http://www.newsmth.net/bbsdoc.php?board=%s"
board_page_url = board_url + "&&page=%d"
post_url = "http://www.newsmth.net/bbscon.php?bid=%d&id=%d"

# Stream-reader factory used to decode the fetched pages as GBK text.
GBKReader = codecs.getreader("gbk")

# Post URLs already processed during this run (shared by all Post objects).
checked_urls = []

# Markers searched for in post pages, and the attachment download template
# filled with (size class, board id, post id, third attach() argument,
# file extension).
attach_writer_str = "attWriter("
attach_start_str = ");attach('"
attachment_url = "http://att.newsmth.net/att.php?%s.%d.%d.%d%s"
class Post:
    """A single board post whose attachments can be downloaded.

    Instances are plain records: the caller assigns the attributes below and
    then calls check(), which fetches the post page, looks for attach(...)
    calls in it and downloads each referenced file with wget into a local
    directory named after the board.
    """

    id = 0             # numeric post id (used in the post and attachment URLs)
    title = None       # post title
    board_name = None  # board name; also the local download directory
    author = None      # post author
    board_id = 0       # numeric board id

    @staticmethod
    def _parse_attach_args(args_text):
        """Parse the text between ``attach('`` and ``);``.

        Returns a ``(file_name, size, position)`` tuple, or None when the
        text does not hold a file name followed by two integers.
        """
        cols = args_text.split(",")
        if len(cols) < 3:
            return None
        # The name comes from a remote page: keep only its final path
        # component so it cannot escape the board directory.
        fname = os.path.basename(cols[0].rstrip("'"))
        if not fname:
            return None
        try:
            return fname, int(cols[1]), int(cols[2])
        except ValueError:
            return None

    def _download(self, fname, size, position):
        """Download one attachment unless a local copy already exists."""
        # Attachments above 51200 bytes use the "p" URL prefix, others "s".
        if size > 51200:
            size_class = "p"
        else:
            size_class = "s"
        ext = os.path.splitext(fname)[1]
        url = attachment_url % (size_class, self.board_id, self.id,
                                position, ext)
        local_path = "%s/%s" % (self.board_name, fname)
        if os.path.isfile(local_path):
            return

        # Pass an argument list (no shell) so odd characters in the remote
        # file name cannot be interpreted as shell syntax.  Arguments are
        # UTF-8 encoded because names decoded from the page may be non-ASCII.
        argv = [arg.encode("utf-8")
                for arg in ("wget", "-nv", "-O", local_path, url)]
        status = subprocess.call(argv)
        # wget -O creates the output file before transferring anything; drop
        # the stub after a failure so a later run retries the download.
        if status != 0 and os.path.isfile(local_path):
            os.remove(local_path)

    def check(self):
        """Fetch this post's page and download every attachment found.

        Each post URL is processed at most once per run; repeated calls for
        the same post return immediately.
        """
        url = post_url % (self.board_id, self.id)
        if url in checked_urls:
            return
        checked_urls.append(url)

        post_file = urllib.urlopen(url)
        try:
            for line in GBKReader(post_file):
                # Lines containing the attWriter(...) call are skipped.
                if attach_writer_str in line:
                    continue
                start = line.find(attach_start_str)
                if start == -1:
                    continue
                start += len(attach_start_str)
                end = line.find(");", start)
                if end == -1:
                    continue
                parsed = self._parse_attach_args(line[start:end])
                if parsed is None:
                    # Malformed attach(...) call: ignore it, keep scanning.
                    continue
                fname, size, position = parsed
                self._download(fname, size, position)
        finally:
            # Close the connection even if parsing or a download raises.
            post_file.close()

    def __str__(self):
        return "[%d] [%s] %s (%d)" % (self.board_id, self.author, self.title, self.id)
def check_page(board, page = None):
    """Scan one index page of a board and check its flagged posts.

    board -- board name used in the index URL and as Post.board_name.
    page  -- page number to fetch; when omitted (or falsy) the board's
             default index page is fetched.

    Returns ``[first_time, last_time, page_num]``: the publish times of the
    first and last non-sticky posts listed on the page and the page number
    read from the page header.

    Raises ValueError when the page contains no header line or no
    non-sticky post, instead of failing on an unassigned variable.
    """
    if page:
        url = board_page_url % (board, page)
    else:
        url = board_url % board

    board_id = None
    page_num = None
    first_time = None
    last_time = None

    board_file = urllib.urlopen(url)
    try:
        for line in GBKReader(board_file):
            line = line.strip()
            if line.startswith("var c"):
                # Header line: board id and page number are taken from its
                # comma-separated fields.
                cols = line.split(",")
                page_num = int(cols[5])
                board_id = int(cols[1])
            elif line.startswith("c.o("):
                # One post entry; fields follow the opening "c.o(".
                cols = line[4:].split(",")
                # Ignore sticky (pinned-to-bottom) posts.
                if 'd' in cols[3]:
                    continue
                pub_time = int(cols[4])
                if first_time is None:
                    first_time = pub_time
                # Only posts flagged '@' are scanned for attachments
                # (presumably the attachment marker -- confirm).
                if '@' in cols[3]:
                    post = Post()
                    post.board_name = board
                    post.board_id = board_id
                    post.title = cols[5].strip('\'').strip()
                    post.id = int(cols[0])
                    post.author = cols[2].strip('\'')
                    post.check()
                last_time = pub_time
    finally:
        # Close the connection even if parsing or a post check raises.
        board_file.close()

    if page_num is None or first_time is None:
        raise ValueError("no board header or posts found at %s" % url)
    return [first_time, last_time, page_num]
def check_board(board):
    """Download new attachments for one board and record the progress.

    Reads the board's "lastcheck" value from the configuration, scans the
    default index page, then keeps scanning earlier pages while the first
    post of the page just scanned is newer than "lastcheck".  Finally the
    in-memory "lastcheck" value is set to the time of the last post on the
    default index page (the caller is responsible for writing the config).

    Prints a message and returns without doing anything when the board has
    no usable "lastcheck" entry.
    """
    try:
        lastCheck = config.getint(board, "lastcheck")
    except (ConfigParser.Error, ValueError):
        # Missing section/option or a non-integer value: report it below
        # instead of aborting the whole run with a traceback.
        lastCheck = 0
    if not lastCheck:
        print("Failed to find board info")
        return

    if not os.path.isdir(board):
        os.mkdir(board)

    firstTime, lastTime, pageNum = check_page(board)
    updatedLastCheck = lastTime

    # Count pages down locally rather than trusting the page number echoed
    # by every response, so the loop is guaranteed to terminate.
    page = pageNum - 1
    while firstTime > lastCheck and page >= 1:
        firstTime, lastTime, pageNum = check_page(board, page)
        page -= 1

    print("[%s] Last Post Checked: %s" % (board, time.asctime(time.localtime(updatedLastCheck))))
    config.set(board, "lastcheck", "%d" % updatedLastCheck)
def main():
    """Check every configured board, then save the updated configuration.

    The comma-separated board list is read from the "boards" option of the
    "basic" section.  When the list is missing or empty a message is
    printed and nothing else is done.
    """
    try:
        boards = config.get("basic", "boards")
    except ConfigParser.Error:
        boards = None
    if not boards:
        print("Failed to parse boards")
        return

    # Drop empty entries (e.g. from a trailing comma) so they are not
    # treated as board names.
    boards = [name.strip() for name in boards.split(",") if name.strip()]
    if not boards:
        print("Failed to split boards")
        return

    for board in boards:
        check_board(board)

    configfile = open('ias.cfg', 'wb')
    try:
        config.write(configfile)
    finally:
        # Always release the file handle, even if the write fails.
        configfile.close()
# Run the board scan only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment