2ch crawler prototype
# -*- coding: utf-8 -*-
# Main crawler script: fetches the board menu, the per-board thread lists,
# and the thread bodies, then hands everything to the datastore module.
import re
import time
import concurrent.futures

import requests
from pyquery import PyQuery as pq

import parser
import datastore

url_re = re.compile(r".*/(\d+)/.*", re.M)

MENU_URL = "http://menu.2ch.net/bbsmenu.html"


def get_board_list():
    res = requests.get(MENU_URL)
    if res.status_code == 200:
        data = res.content
        # 2ch pages are Shift_JIS (cp932) encoded
        u = data.decode("cp932", "ignore")
        return parser.parse_menu(u)
        # return parser.parse_menu(data)


def get_thread_list(board_nm, board_url):
    url = board_url + "subback.html"
    # print(url)
    res = requests.get(url)
    if res.status_code == 200:
        data = res.content
        u = data.decode("cp932", "ignore")
        return parser.parse_thread_list(board_url, u)


def get_thread_data(board_url, url, nm):
    m = url_re.search(url)
    bg20 = 'http://bg20.2ch.net/test/r.so/'
    if m:
        index = m.group(1)
        # fetch the raw dat through the bg20 read gateway
        bgurl = "%s%s%s/" % (bg20, board_url[7:], index)
        res = requests.get(bgurl)
        if res.status_code == 200:
            data = res.content
            u = data.decode("cp932", "ignore")
            # skip threads for which the gateway reports an error
            if u.find("ERROR = 5656") == -1:
                return parser.parse_thread(board_url, url, nm, u)


def crawle_thread(board_url, name, url, rescount):
    # print(name)
    try:
        dats = get_thread_data(board_url, url, name)
        if dats:
            old_count = datastore.insert_thread(board_url, name, url, rescount)
            if old_count >= 0:
                # store only the responses added since the last crawl
                dats = dats[old_count:]
                datastore.insert_dat(dats)
            else:
                print("no new posts: %s" % name)
        else:
            print("bg20 is dead. %s" % name)
    except Exception:
        import traceback
        print(board_url)
        print(traceback.format_exc())


def exec_crawle(tls):
    with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executer:
        l = [executer.submit(crawle_thread, board_url, name, url, rescount)
             for board_url, name, url, rescount in tls]
        results = concurrent.futures.wait(l)
        for result in results.done:
            # propagate any exception raised in a worker
            result.result()


def crawle(board_nm, board_url):
    print(board_url)
    tls = get_thread_list(board_nm, board_url)
    if tls:
        exec_crawle(tls)


def run():
    bl = get_board_list()
    datastore.insert_boards(bl)
    bl = datastore.get_boards()
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executer:
        l = [executer.submit(crawle, d["name"], d["url"]) for d in bl]
        results = concurrent.futures.wait(l)
        for result in results.done:
            result.result()


# crawl every board, then sleep 15 minutes and repeat
while True:
    run()
    time.sleep(60 * 15)
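
The crawler never calls datastore.setup(), so the MongoDB indexes defined below have to be created once by hand before the first run. A minimal bootstrap sketch, assuming the three snippets are saved as crawler.py, datastore.py, and parser.py (the module names implied by the imports above):

# one-off bootstrap (file names are an assumption, not part of the gist)
import datastore
datastore.setup()        # create the MongoDB indexes once
# then start the 15-minute polling loop:
#   $ python crawler.py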
# -*- coding: utf-8 -*-
# datastore.py: MongoDB persistence layer (imported above as `datastore`)
import hashlib
import re

# Connection is the old pymongo API (removed in pymongo 3.x)
from pymongo import Connection, ASCENDING

# database / collection names
MAIN = 'main'
DAT = 'dat'
BOARD_LIST = 'board_list'
THREAD_LIST = 'thread_list'
THREAD = 'thread'


def get_connection(host='localhost', port=27017):
    return Connection(host, port)


def get_board_collection(conn=None):
    if not conn:
        conn = get_connection()
    db = conn[MAIN]
    return db[BOARD_LIST]


def get_boards():
    conn = get_connection()
    db = conn[MAIN]
    lst = db[BOARD_LIST]
    # materialize the cursor before the connection is closed
    boards = list(lst.find())
    conn.close()
    return boards


def get_thread_list_collection(conn=None):
    if not conn:
        conn = get_connection()
    db = conn[MAIN]
    return db[THREAD_LIST]


def get_thread_collection(conn=None):
    if not conn:
        conn = get_connection()
    db = conn[DAT]
    tl = db[THREAD]
    return tl


def insert_boards(bl):
    c = get_connection()
    collection = get_board_collection(c)
    for nm, url in bl.items():
        # duplicates are rejected by the unique index on "url" (see setup())
        collection.insert(dict(url=url, name=nm))
    c.close()


def insert_board(nm, url):
    c = get_connection()
    collection = get_board_collection(c)
    collection.insert(dict(url=url, name=nm))
    c.close()


def insert_thread(board_url, name, url, rescount):
    """Upsert thread metadata; return the previous res count
    (0 for a new thread) or -1 when nothing new has arrived."""
    query = {"url": url}
    c = get_connection()
    collection = get_thread_list_collection(c)
    r = collection.find_one(query)
    if not r:
        # new thread
        old_count = 0
    else:
        old_count = r.get("rescount")
    try:
        if rescount > old_count:
            print("%s incoming %s" % (name, (rescount - old_count)))
            d = dict(board_url=board_url, name=name, url=url, rescount=rescount)
            query = {"url": url}
            r = collection.update(query, {"$set": d}, upsert=True)
            return old_count
        else:
            return -1
    finally:
        c.close()


def insert_dat(dats):
    c = get_connection()
    collection = get_thread_collection(c)
    for dat in dats:
        r = collection.insert(dat)
        print("%s OK" % dat.get("thread_nm"))
    c.close()


def setup():
    # create the indexes used by the crawler; not called anywhere above,
    # so run it once by hand before the first crawl
    c = get_connection()
    db = c[MAIN]
    bl = db[BOARD_LIST]
    bl.create_index("url", unique=True)
    tl = db[THREAD_LIST]
    r = tl.index_information()
    if not r:
        tl.create_index("board_url")
        tl.create_index("url")
        tl.create_index("name")
    db = c[DAT]
    tl = db[THREAD]
    r = tl.index_information()
    if not r:
        tl.create_index("board_url")
        tl.create_index("url")
        tl.create_index("no")
        tl.create_index("comment")
    c.close()


def thread_nm_find(nm):
    c = get_connection()
    db = c[MAIN]
    tl = db[THREAD_LIST]
    # regex search on the thread title
    res = list(tl.find({"name": re.compile(nm)}))
    c.close()
    return res
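
For a quick look at what ends up in MongoDB, the stored thread documents can be searched by title. A minimal sketch, assuming the crawler has completed at least one pass; the field names are the ones written by insert_thread above:

# hypothetical interactive check against the thread_list collection
import datastore

for t in datastore.thread_nm_find("Python"):
    print(t["name"], t["rescount"], t["url"])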
# -*- coding: utf-8 -*-
# parser.py: HTML / dat parsing helpers (imported above as `parser`)
import codecs
import re
from urllib.parse import urljoin

from pyquery import PyQuery as pq

url_re = re.compile(r"(.*read.cgi/\w+/\d+/).*", re.M)
res_re = re.compile(r".*\((\d+)\).*", re.M)
# subback.html anchor text looks like "123: thread title (456)"
title_re = re.compile(r"\d+:\s+(.*)\((\d+)\)", re.M)


def parse_menu(data):
    """Return a dict of board name -> board URL parsed from bbsmenu.html."""
    result = dict()
    q = pq(data)
    for anchor in q("a"):
        a = pq(anchor)
        v = a.text()
        href = a.attr.href
        if not href.startswith('http'):
            continue
        if href.endswith("bbsmenu"):
            continue
        # skip links that are clearly not board top pages
        if not href.endswith("php") and not href.endswith(".net/") and not href.endswith(".jp/"):
            result[v] = href
    return result


def parse_thread_list(board_url, data):
    """Parse subback.html into (board_url, title, thread_url, res_count) tuples."""
    result = []
    base_url = None
    q = pq(data)
    for base in q("base"):
        base = pq(base)
        base_url = base.attr.href
    if not base_url:
        return
    for anchor in q("a"):
        a = pq(anchor)
        v = a.text()
        href = a.attr.href
        url = urljoin(base_url, href)
        # thread links on subback.html end with "l50"
        if url.endswith("50"):
            # print(v, url)
            match = title_re.search(v)
            if match:
                title = match.group(1)
                res_cnt = match.group(2)
                # strip the trailing "l50" to get the bare thread URL
                result.append((board_url, title.strip(), url[:-3], int(res_cnt)))
            # else:
            #     print("***** " + url)
    return result


def parse_thread(board_url, url, thread_nm, data):
    """Split a dat body into one dict per response."""
    data = data.split("\n")
    i = 1
    result = []
    for res in data:
        # each response line is "<>"-delimited: handle, mail, date, body, extra
        cols = res.split("<>")
        if len(cols) > 4:
            d = dict()
            d["board_url"] = board_url
            d["url"] = url
            d["thread_nm"] = thread_nm
            d["hndl"] = cols[0]
            d["mailto"] = cols[1]
            d["date"] = cols[2]
            d["comment"] = cols[3]
            d["other"] = cols[4]
            d["no"] = i
            result.append(d)
            i += 1
    return result
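
parse_thread can be exercised in isolation with a fabricated dat line. The "<>"-delimited field order below (handle, mail, date, body, extra) is inferred from the assignments above; real gateway output may differ:

# made-up sample input, purely for illustration
sample = "Anonymous<>sage<>2013/04/26 12:00:00<>hello world<>sample thread\n"
for res in parse_thread("http://example.2ch.net/board/",
                        "http://example.2ch.net/test/read.cgi/board/1234567890/",
                        "sample thread", sample):
    print(res["no"], res["hndl"], res["comment"])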