Skip to content

Instantly share code, notes, and snippets.

@jjgod
Created March 18, 2009 10:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jjgod/81040 to your computer and use it in GitHub Desktop.
Save jjgod/81040 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# coding: utf-8
import ConfigParser, urllib, codecs, os, time
# Runtime configuration, loaded from ias.cfg in the current directory.
# main() reads the "boards" option of the [basic] section from it, and
# check_board() reads/updates a per-board "lastcheck" option.
config = ConfigParser.ConfigParser()
config.read('ias.cfg')
# URL templates: first listing page of a board (board name), a specific
# listing page (board name, page number), and a single post (board id, post id).
board_url = "http://www.newsmth.net/bbsdoc.php?board=%s"
# NOTE(review): the doubled "&&" before "page" looks like a typo - confirm the
# server tolerates it before changing the string.
board_page_url = "http://www.newsmth.net/bbsdoc.php?board=%s&&page=%d"
post_url = "http://www.newsmth.net/bbscon.php?bid=%d&id=%d"
# Stream-reader factory used to decode fetched pages as GBK.
GBKReader = codecs.getreader("gbk")
# Post URLs already fetched during this run; Post.check() consults and
# appends to it so the same post is not fetched twice.
checked_urls = [ ]
# Markers that Post.check() searches for in each line of a post page.
attach_writer_str = "attWriter("
attach_start_str = ");attach('"
# Attachment URL template: prefix ("p" or "s", chosen by size in Post.check()),
# board id, post id, third field of the attach(...) call, file extension.
attachment_url = "http://att.newsmth.net/att.php?%s.%d.%d.%d%s"
class Post:
    """A single board post whose attachments can be downloaded.

    Instances are created empty and populated by the caller; the class-level
    values below are only defaults.
    """

    id = 0              # post id, used in the post and attachment URLs
    title = None        # post title
    board_name = None   # board name; also the local directory for downloads
    author = None       # author of the post
    board_id = 0        # numeric board id, used in the post and attachment URLs

    def check(self):
        """Fetch this post's page and download the attachments it references.

        The post URL is recorded in the module-level ``checked_urls`` list and
        a post that was already fetched during this run is skipped.  Every
        line of the GBK-decoded page is searched for an ``attach('...')``
        call; for each one found, the attachment is downloaded with ``wget``
        into ``<board_name>/<file name>`` unless that file already exists.

        Returns None.  Errors from opening the URL or from malformed
        ``attach`` fields propagate to the caller.
        """
        # Imported locally: the rest of the file does not import these.
        import subprocess
        import sys

        post_address = post_url % (self.board_id, self.id)
        if post_address in checked_urls:
            return
        checked_urls.append(post_address)

        post_file = urllib.urlopen(post_address)
        try:
            for line in GBKReader(post_file):
                # Lines carrying the attWriter( marker are never scanned for
                # attachments.
                if attach_writer_str in line:
                    continue

                start = line.find(attach_start_str)
                if start < 0:
                    continue
                start += len(attach_start_str)
                end = line.find(");", start)
                if end < 0:
                    continue

                cols = line[start:end].split(",")
                # The name comes from the remote page: keep only its final
                # path component so it cannot escape the board directory.
                fname = os.path.basename(cols[0].rstrip("'"))
                if not fname:
                    continue
                ext = os.path.splitext(fname)[1]
                size = int(cols[1])
                # Attachments above 51200 bytes use the "p" URL prefix,
                # everything else uses "s".
                if size > 51200:
                    prefix = "p"
                else:
                    prefix = "s"
                download_url = attachment_url % (
                    prefix, self.board_id, self.id, int(cols[2]), ext)
                local_path = "%s/%s" % (self.board_name, fname)
                if os.path.isfile(local_path):
                    continue

                # Pass an argument list instead of a shell string so that
                # spaces or shell metacharacters in the file name are harmless.
                argv = [arg.encode("utf-8") for arg in
                        ("wget", "-nv", "-O", local_path, download_url)]
                try:
                    status = subprocess.call(argv)
                except OSError:
                    # wget is missing or not executable; stay best-effort as
                    # the shell-based version did, but say so.
                    sys.stderr.write("wget could not be executed\n")
                    status = -1
                if status != 0 and os.path.isfile(local_path):
                    # wget -O leaves the output file behind on failure, which
                    # would make the isfile() test above skip it forever.
                    os.remove(local_path)
        finally:
            post_file.close()

    def __str__(self):
        """Return "[board_id] [author] title (id)"."""
        return "[%d] [%s] %s (%d)" % (self.board_id, self.author, self.title, self.id)
def check_page(board, page = None):
    """Scan one listing page of a board and check its flagged posts.

    Args:
        board: board name, substituted into the listing URL.
        page: page number to fetch; when falsy the board's default listing
            page is fetched instead.

    Returns:
        A list ``[first_time, last_time, page_num]`` where ``first_time`` and
        ``last_time`` are the integer time fields of the first and last
        non-pinned ``c.o(`` rows on the page, and ``page_num`` is field 5 of
        the ``var c`` line (check_board() treats it as the current page
        number).

    Raises:
        ValueError: if the page has no ``var c`` line or no non-pinned post
            row, so there is nothing meaningful to return.
    """
    if page:
        url = board_page_url % (board, page)
    else:
        url = board_url % board

    first_time = last_time = page_num = board_id = None
    board_file = urllib.urlopen(url)
    try:
        for line in GBKReader(board_file):
            line = line.strip()
            if line.startswith("var c"):
                header = line.split(",")
                page_num = int(header[5])
                board_id = int(header[1])
            if not line.startswith("c.o("):
                continue

            cols = line[4:].split(",")
            flags = cols[3]
            # Skip pinned (sticky) posts.
            if 'd' in flags:
                continue

            pub_time = int(cols[4])
            if first_time is None:
                first_time = pub_time

            # Rows flagged with '@' are the ones whose post page gets fetched
            # (presumably the attachment marker - TODO confirm).
            if '@' in flags:
                if board_id is None:
                    raise ValueError(
                        "post row seen before the board header at %s" % url)
                post = Post()
                post.board_name = board
                post.board_id = board_id
                post.title = cols[5].strip('\'').strip()
                post.id = int(cols[0])
                post.author = cols[2].strip('\'')
                post.check()

            last_time = pub_time
    finally:
        # Close the connection even when parsing or a download fails.
        board_file.close()

    if first_time is None or page_num is None:
        raise ValueError("no usable post rows found at %s" % url)
    return [first_time, last_time, page_num]
def check_board(board):
    """Check a board's listing pages back to its stored "lastcheck" time.

    Reads the integer ``lastcheck`` option of the config section named after
    the board, makes sure a local directory named after the board exists,
    then scans the default listing page and keeps stepping to the previous
    page while the first post of the scanned page is newer than ``lastcheck``
    and the page number is above 1.  Finally stores the last post time of the
    first scanned page back into ``lastcheck`` (in memory only; main() writes
    the file).

    Args:
        board: board name; also the config section and local directory name.

    Returns None.  Prints a message and returns early when the board has no
    usable ``lastcheck`` value.
    """
    try:
        last_check = config.getint(board, "lastcheck")
    except (ConfigParser.Error, ValueError):
        # Missing section/option or a non-integer value: treat it like an
        # unset value so the message below is printed instead of a traceback.
        last_check = 0
    if not last_check:
        # Single-argument parenthesised print behaves the same as the
        # statement form.
        print("Failed to find board info")
        return

    if not os.path.isdir(board):
        os.mkdir(board)

    first_time, newest_time, page_num = check_page(board)
    while first_time > last_check and page_num > 1:
        # Only the first page's last post time is recorded, so the value
        # returned for earlier pages is not needed.
        first_time, _ignored, page_num = check_page(board, page_num - 1)

    print("[%s] Last Post Checked: %s" % (board, time.asctime(time.localtime(newest_time))))
    config.set(board, "lastcheck", "%d" % newest_time)
def main():
    """Check every board listed in the config, then save the config.

    The comma-separated ``boards`` option of the ``[basic]`` section names the
    boards.  Each one is passed to check_board(); afterwards the (possibly
    updated) configuration is written back to ias.cfg.  The file is written
    even if a board check raises, so ``lastcheck`` values of boards that were
    already completed are not lost.
    """
    try:
        raw_boards = config.get("basic", "boards")
    except ConfigParser.Error:
        # Missing [basic] section or "boards" option.
        raw_boards = ""
    if not raw_boards:
        print("Failed to parse boards")
        return

    # Drop empty entries (e.g. from a trailing comma) so they are never used
    # as a config section or directory name.
    boards = [name.strip() for name in raw_boards.split(",")]
    boards = [name for name in boards if name]
    if not boards:
        print("Failed to split boards")
        return

    try:
        for board in boards:
            check_board(board)
    finally:
        configfile = open('ias.cfg', 'wb')
        try:
            config.write(configfile)
        finally:
            configfile.close()


# Run the checker only when this file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment