Last active
December 30, 2015 14:09
-
-
Save romichi/7840088 to your computer and use it in GitHub Desktop.
2chのクローラ
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding:utf-8 -*- | |
import json | |
import codecs | |
from collections import defaultdict | |
import urllib.request | |
class Crawler():
    u"""Fetches the thread index and dat files of a 2ch-style bulletin board."""

    def __init__(self):
        pass

    def makeSubjectDict(self, url):
        u"""
        Build a {dat name: (thread title, post count)} dict from the board's subject.txt.

        Each subject.txt line looks like ``1234.dat<>Title (123)``; the post
        count is the trailing parenthesized number. ``url`` is the board base
        URL without a trailing slash.
        """
        subjectDict = {}
        # Close the HTTP response when done (the original leaked the connection).
        with urllib.request.urlopen(url + "/subject.txt") as response:
            for bytesLine in response:
                # 2ch serves Shift_JIS/cp932; replace undecodable bytes rather than crash.
                line = bytesLine.decode("cp932", "replace")
                dat, titleAndNum = line.lstrip().split("<>", 1)
                # Title may itself contain spaces, so split from the right once.
                title, num = titleAndNum.rsplit(" ", 1)
                num = num.strip("\n()")
                subjectDict[dat] = (title, num)
        return subjectDict

    def loadDatFile(self, url, dat):
        u"""
        Open the dat file for a thread and return the HTTP response object.

        The caller is responsible for reading and closing the response.
        """
        return urllib.request.urlopen(url + "/dat/" + dat)
class Log():
    u"""Tracks per-board crawl state: {board name: {dat name: post count}}."""

    def __init__(self, fileName = None):
        # Plain dict: the original used defaultdict() with no factory,
        # which behaves exactly like a dict and just obscured the intent.
        self.logDic = {}
        if fileName:
            self.loadLog(fileName)

    def loadLog(self, fileName):
        u"""
        Load crawl state from a JSON file.

        A missing file is treated as an empty log so the crawler can
        bootstrap on its first run (the original crashed with
        FileNotFoundError here).
        """
        try:
            with codecs.open(fileName, "r", "utf-8") as f:
                self.logDic = json.load(f)
        except FileNotFoundError:
            self.logDic = {}

    def saveLog(self, fileName):
        u"""
        Persist logDic as JSON (non-ASCII kept readable via ensure_ascii=False).
        """
        with codecs.open(fileName, "w", "utf-8") as f:
            json.dump(self.logDic, f, ensure_ascii=False)

    def updateLog(self, thread, dat, num):
        u"""
        Record that ``dat`` on board ``thread`` now has ``num`` posts.
        """
        self.logDic.setdefault(thread, {})[dat] = num

    def hasDiff(self, thread, dat, num):
        u"""
        Return False when ``dat`` was already logged with the same post
        count; True otherwise (new board, new thread, or count changed).
        """
        if thread in self.logDic and dat in self.logDic[thread]:
            if self.logDic[thread][dat] == num:
                return False
        return True
if __name__ == "__main__":
    import os
    import time

    datDir = "dat"
    crawler = Crawler()
    log = Log("crawlLog.log")
    # (board display name, board base URL)
    urlList = [("エロゲネタ", "http://pele.bbspink.com/erog"),
               ("エロゲー", "http://kilauea.bbspink.com/hgame"),
               ("エロゲー作品別", "http://kilauea.bbspink.com/hgame2")]
    for ita, url in urlList:
        boardDir = os.path.join(datDir, ita)
        # makedirs also creates the "dat" parent; the original os.mkdir
        # failed with FileNotFoundError when datDir did not exist yet.
        os.makedirs(boardDir, exist_ok=True)
        subjectDict = crawler.makeSubjectDict(url)
        for dat, (title, num) in subjectDict.items():
            print(ita, title, dat, num, end=' ')
            if log.hasDiff(ita, dat, num):
                # Close the dat response after copying it to disk
                # (the original never closed it).
                with crawler.loadDatFile(url, dat) as datFile, \
                        open(os.path.join(boardDir, dat), "bw") as f:
                    f.write(datFile.read())
                log.updateLog(ita, dat, num)
                # Save after every fetch so an interrupted run loses nothing.
                log.saveLog("crawlLog.log")
                print("update")
                time.sleep(3)  # be polite to the server between fetches
            else:
                print("not diff")
    print("finish")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment