2ch crawler prototype
# -*- coding: utf-8 -*-
# Main crawler script: fetches the board menu, the per-board thread lists,
# and the thread bodies, then hands everything to the datastore module.
import re
import time
import concurrent.futures

import requests
from pyquery import PyQuery as pq

import parser
import datastore

url_re = re.compile(r".*/(\d+)/.*", re.M)

MENU_URL = "http://menu.2ch.net/bbsmenu.html"


def get_board_list():
    res = requests.get(MENU_URL)
    if res.status_code == 200:
        data = res.content
        # 2ch pages are Shift_JIS (cp932) encoded
        u = data.decode("cp932", "ignore")
        return parser.parse_menu(u)
        # return parser.parse_menu(data)


def get_thread_list(board_nm, board_url):
    url = board_url + "subback.html"
    # print(url)
    res = requests.get(url)
    if res.status_code == 200:
        data = res.content
        u = data.decode("cp932", "ignore")
        return parser.parse_thread_list(board_url, u)


def get_thread_data(board_url, url, nm):
    m = url_re.search(url)
    bg20 = 'http://bg20.2ch.net/test/r.so/'
    if m:
        index = m.group(1)
        # fetch the raw dat through the bg20 read gateway
        bgurl = "%s%s%s/" % (bg20, board_url[7:], index)
        res = requests.get(bgurl)
        if res.status_code == 200:
            data = res.content
            u = data.decode("cp932", "ignore")
            # skip threads for which the gateway reports an error
            if u.find("ERROR = 5656") == -1:
                return parser.parse_thread(board_url, url, nm, u)


def crawle_thread(board_url, name, url, rescount):
    # print(name)
    try:
        dats = get_thread_data(board_url, url, name)
        if dats:
            old_count = datastore.insert_thread(board_url, name, url, rescount)
            if old_count >= 0:
                # store only the responses added since the last crawl
                dats = dats[old_count:]
                datastore.insert_dat(dats)
            else:
                print("no new posts: %s" % name)
        else:
            print("bg20 is dead. %s" % name)
    except Exception:
        import traceback
        print(board_url)
        print(traceback.format_exc())


def exec_crawle(tls):
    with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executer:
        l = [executer.submit(crawle_thread, board_url, name, url, rescount)
             for board_url, name, url, rescount in tls]
        results = concurrent.futures.wait(l)
        for result in results.done:
            # propagate any exception raised in a worker
            result.result()


def crawle(board_nm, board_url):
    print(board_url)
    tls = get_thread_list(board_nm, board_url)
    if tls:
        exec_crawle(tls)


def run():
    bl = get_board_list()
    datastore.insert_boards(bl)
    bl = datastore.get_boards()
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executer:
        l = [executer.submit(crawle, d["name"], d["url"]) for d in bl]
        results = concurrent.futures.wait(l)
        for result in results.done:
            result.result()


# crawl every board, then sleep 15 minutes and repeat
while True:
    run()
    time.sleep(60 * 15)
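
The crawler never calls datastore.setup(), so the MongoDB indexes defined below have to be created once by hand before the first run. A minimal bootstrap sketch, assuming the three snippets are saved as crawler.py, datastore.py, and parser.py (the module names implied by the imports above):

# one-off bootstrap (file names are an assumption, not part of the gist)
import datastore
datastore.setup()        # create the MongoDB indexes once
# then start the 15-minute polling loop:
#   $ python crawler.py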
# -*- coding: utf-8 -*-
# datastore.py: MongoDB persistence layer (imported above as `datastore`)
import hashlib
import re

# Connection is the old pymongo API (removed in pymongo 3.x)
from pymongo import Connection, ASCENDING

# database / collection names
MAIN = 'main'
DAT = 'dat'
BOARD_LIST = 'board_list'
THREAD_LIST = 'thread_list'
THREAD = 'thread'


def get_connection(host='localhost', port=27017):
    return Connection(host, port)


def get_board_collection(conn=None):
    if not conn:
        conn = get_connection()
    db = conn[MAIN]
    return db[BOARD_LIST]


def get_boards():
    conn = get_connection()
    db = conn[MAIN]
    lst = db[BOARD_LIST]
    # materialize the cursor before the connection is closed
    boards = list(lst.find())
    conn.close()
    return boards


def get_thread_list_collection(conn=None):
    if not conn:
        conn = get_connection()
    db = conn[MAIN]
    return db[THREAD_LIST]


def get_thread_collection(conn=None):
    if not conn:
        conn = get_connection()
    db = conn[DAT]
    tl = db[THREAD]
    return tl


def insert_boards(bl):
    c = get_connection()
    collection = get_board_collection(c)
    for nm, url in bl.items():
        # duplicates are rejected by the unique index on "url" (see setup())
        collection.insert(dict(url=url, name=nm))
    c.close()


def insert_board(nm, url):
    c = get_connection()
    collection = get_board_collection(c)
    collection.insert(dict(url=url, name=nm))
    c.close()


def insert_thread(board_url, name, url, rescount):
    """Upsert thread metadata; return the previous res count
    (0 for a new thread) or -1 when nothing new has arrived."""
    query = {"url": url}
    c = get_connection()
    collection = get_thread_list_collection(c)
    r = collection.find_one(query)
    if not r:
        # new thread
        old_count = 0
    else:
        old_count = r.get("rescount")
    try:
        if rescount > old_count:
            print("%s incoming %s" % (name, (rescount - old_count)))
            d = dict(board_url=board_url, name=name, url=url, rescount=rescount)
            query = {"url": url}
            r = collection.update(query, {"$set": d}, upsert=True)
            return old_count
        else:
            return -1
    finally:
        c.close()


def insert_dat(dats):
    c = get_connection()
    collection = get_thread_collection(c)
    for dat in dats:
        r = collection.insert(dat)
        print("%s OK" % dat.get("thread_nm"))
    c.close()


def setup():
    # create the indexes used by the crawler; not called anywhere above,
    # so run it once by hand before the first crawl
    c = get_connection()
    db = c[MAIN]
    bl = db[BOARD_LIST]
    bl.create_index("url", unique=True)
    tl = db[THREAD_LIST]
    r = tl.index_information()
    if not r:
        tl.create_index("board_url")
        tl.create_index("url")
        tl.create_index("name")
    db = c[DAT]
    tl = db[THREAD]
    r = tl.index_information()
    if not r:
        tl.create_index("board_url")
        tl.create_index("url")
        tl.create_index("no")
        tl.create_index("comment")
    c.close()


def thread_nm_find(nm):
    c = get_connection()
    db = c[MAIN]
    tl = db[THREAD_LIST]
    # regex search on the thread title
    res = list(tl.find({"name": re.compile(nm)}))
    c.close()
    return res
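
For a quick look at what ends up in MongoDB, the stored thread documents can be searched by title. A minimal sketch, assuming the crawler has completed at least one pass; the field names are the ones written by insert_thread above:

# hypothetical interactive check against the thread_list collection
import datastore

for t in datastore.thread_nm_find("Python"):
    print(t["name"], t["rescount"], t["url"])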
# -*- coding: utf-8 -*-
# parser.py: HTML / dat parsing helpers (imported above as `parser`)
import codecs
import re
from urllib.parse import urljoin

from pyquery import PyQuery as pq

url_re = re.compile(r"(.*read.cgi/\w+/\d+/).*", re.M)
res_re = re.compile(r".*\((\d+)\).*", re.M)
# subback.html anchor text looks like "123: thread title (456)"
title_re = re.compile(r"\d+:\s+(.*)\((\d+)\)", re.M)


def parse_menu(data):
    """Return a dict of board name -> board URL parsed from bbsmenu.html."""
    result = dict()
    q = pq(data)
    for anchor in q("a"):
        a = pq(anchor)
        v = a.text()
        href = a.attr.href
        if not href.startswith('http'):
            continue
        if href.endswith("bbsmenu"):
            continue
        # skip links that are clearly not board top pages
        if not href.endswith("php") and not href.endswith(".net/") and not href.endswith(".jp/"):
            result[v] = href
    return result


def parse_thread_list(board_url, data):
    """Parse subback.html into (board_url, title, thread_url, res_count) tuples."""
    result = []
    base_url = None
    q = pq(data)
    for base in q("base"):
        base = pq(base)
        base_url = base.attr.href
    if not base_url:
        return
    for anchor in q("a"):
        a = pq(anchor)
        v = a.text()
        href = a.attr.href
        url = urljoin(base_url, href)
        # thread links on subback.html end with "l50"
        if url.endswith("50"):
            # print(v, url)
            match = title_re.search(v)
            if match:
                title = match.group(1)
                res_cnt = match.group(2)
                # strip the trailing "l50" to get the bare thread URL
                result.append((board_url, title.strip(), url[:-3], int(res_cnt)))
            # else:
            #     print("***** " + url)
    return result


def parse_thread(board_url, url, thread_nm, data):
    """Split a dat body into one dict per response."""
    data = data.split("\n")
    i = 1
    result = []
    for res in data:
        # each response line is "<>"-delimited: handle, mail, date, body, extra
        cols = res.split("<>")
        if len(cols) > 4:
            d = dict()
            d["board_url"] = board_url
            d["url"] = url
            d["thread_nm"] = thread_nm
            d["hndl"] = cols[0]
            d["mailto"] = cols[1]
            d["date"] = cols[2]
            d["comment"] = cols[3]
            d["other"] = cols[4]
            d["no"] = i
            result.append(d)
            i += 1
    return result
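
parse_thread can be exercised in isolation with a fabricated dat line. The "<>"-delimited field order below (handle, mail, date, body, extra) is inferred from the assignments above; real gateway output may differ:

# made-up sample input, purely for illustration
sample = "Anonymous<>sage<>2013/04/26 12:00:00<>hello world<>sample thread\n"
for res in parse_thread("http://example.2ch.net/board/",
                        "http://example.2ch.net/test/read.cgi/board/1234567890/",
                        "sample thread", sample):
    print(res["no"], res["hndl"], res["comment"])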