Skip to content

Instantly share code, notes, and snippets.

@romichi
Last active December 30, 2015 14:09
Show Gist options
  • Save romichi/7840088 to your computer and use it in GitHub Desktop.
Save romichi/7840088 to your computer and use it in GitHub Desktop.
A crawler for 2ch-style (Japanese BBS) boards: fetches each board's thread list and downloads updated thread dat files.
# -*- coding:utf-8 -*-
import json
import codecs
from collections import defaultdict
import urllib.request
class Crawler:
    """Fetch thread metadata and raw dat files from a 2ch-style BBS board.

    A board is identified by its base URL (e.g. "http://host/board");
    the board's thread index lives at ``<url>/subject.txt`` and each
    thread's raw log at ``<url>/dat/<name>.dat``.
    """

    def makeSubjectDict(self, url):
        """Build ``{dat name: (thread title, post count)}`` from subject.txt.

        Each line of subject.txt has the form ``"<dat><>Title (count)"``
        and is cp932-encoded; undecodable bytes are replaced rather than
        raising.

        :param url: board base URL, without a trailing slash.
        :return: dict mapping dat file name to ``(title, count)`` where
            ``count`` is kept as a string (as parsed from the index).
        """
        subjectDict = {}
        for bytesLine in urllib.request.urlopen(url + "/subject.txt"):
            line = bytesLine.decode("cp932", "replace")
            # "1234567890.dat<>Thread title (123)" -> dat / "Thread title (123)"
            dat, titleAndNum = line.lstrip().split("<>", 1)
            # Split the post count off the right so titles may contain spaces.
            title, num = titleAndNum.rsplit(" ", 1)
            num = num.strip("\n()")
            subjectDict[dat] = (title, num)
        return subjectDict

    def loadDatFile(self, url, dat):
        """Return the open HTTP response for thread ``dat`` on board ``url``.

        The caller is responsible for reading and closing the response.
        """
        return urllib.request.urlopen(url + "/dat/" + dat)
class Log:
    """Track crawl progress as ``{board name: {dat name: post count}}``.

    The state round-trips through a UTF-8 JSON file so a later run can
    skip threads whose post count has not changed.
    """

    def __init__(self, fileName=None):
        # Crawl-state mapping: {board: {dat: post count}, ...}.
        # A plain dict — the original used defaultdict() with no factory,
        # which behaves like a dict but misleads readers.
        self.logDic = {}
        if fileName:
            self.loadLog(fileName)

    def loadLog(self, fileName):
        """Load crawl state from ``fileName`` (UTF-8 JSON).

        A missing file is not an error: on the very first run no log
        exists yet, so the state simply stays empty.
        """
        try:
            with codecs.open(fileName, "r", "utf-8") as f:
                self.logDic = json.load(f)
        except FileNotFoundError:
            self.logDic = {}

    def saveLog(self, fileName):
        """Persist the current state to ``fileName`` as UTF-8 JSON."""
        with codecs.open(fileName, "w", "utf-8") as f:
            json.dump(self.logDic, f, ensure_ascii=False)

    def updateLog(self, thread, dat, num):
        """Record that ``dat`` on board ``thread`` now has ``num`` posts."""
        self.logDic.setdefault(thread, {})[dat] = num

    def hasDiff(self, thread, dat, num):
        """Return True if ``dat`` is new or its recorded count differs from ``num``."""
        return self.logDic.get(thread, {}).get(dat) != num
if __name__ == "__main__":
    import os
    import time

    datDir = "dat"
    logFile = "crawlLog.log"
    crawler = Crawler()
    # Log() tolerates a missing file on the first run (state starts empty).
    log = Log(logFile)
    # (board label used as a directory name, board base URL)
    urlList = [("エロゲネタ", "http://pele.bbspink.com/erog"),
               ("エロゲー", "http://kilauea.bbspink.com/hgame"),
               ("エロゲー作品別", "http://kilauea.bbspink.com/hgame2")]
    for ita, url in urlList:
        # makedirs also creates the top-level "dat" directory if absent;
        # the original os.mkdir failed when the parent was missing.
        os.makedirs(os.path.join(datDir, ita), exist_ok=True)
        subjectDict = crawler.makeSubjectDict(url)
        for dat, (title, num) in subjectDict.items():
            print(ita, title, dat, num, end=' ')
            if log.hasDiff(ita, dat, num):
                # Context manager closes the HTTP response (the original leaked it).
                with crawler.loadDatFile(url, dat) as datFile:
                    with open(os.path.join(datDir, ita, dat), "wb") as f:
                        f.write(datFile.read())
                log.updateLog(ita, dat, num)
                # Save after every thread so progress survives an interruption.
                log.saveLog(logFile)
                print("update")
                time.sleep(3)  # throttle requests to be polite to the server
            else:
                print("not diff")
    print("finish")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment