A low-effort simple crawler
# -*- coding: utf-8 -*-
import os
import re
from encodings.aliases import aliases

import nkf
import tornado.ioloop
from tornado import httpclient, gen

UA = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
      'Chrome/31.0.1650.57 Safari/537.36')

httpclient.AsyncHTTPClient.configure(None, max_clients=1000)

# Every encoding name Python knows, canonical names and aliases alike
all_encodings = set(aliases.values()) | set(aliases.keys())
# charset in the Content-Type header (str) and in a <meta> tag (bytes)
header_encoding_pattern = re.compile(r'charset=([\w\-]+)', re.I)
meta_encoding_pattern = re.compile(rb'<meta [^>]*charset="?([^">\s]+)', re.I)


class SimpleCrawler(object):
    def __init__(self, urls):
        self.urls = urls

    def extract_encoding_by_request(self, headers, body):
        """Pull the charset out of the Content-Type header, falling back
        to the <meta> tag in the body."""
        encoding = None
        content_type = headers.get('Content-Type')
        if content_type:
            m = header_encoding_pattern.search(content_type)
            if m:
                encoding = m.group(1)
        if not encoding:
            m = meta_encoding_pattern.search(body)
            if m:
                # body is bytes, so the match is bytes too
                encoding = m.group(1).decode('ascii', 'replace')
        return encoding

    def normalize_encoding(self, encoding):
        # Python's encoding aliases use underscores, not hyphens
        encoding = encoding.lower().replace('-', '_')
        if encoding in ('windows_31j', 'shift_jis', 'x_sjis', 'sjis'):
            return 'cp932'  # cp932 is a superset of Shift_JIS
        return encoding
    def decode(self, headers, body):
        encoding = self.extract_encoding_by_request(headers, body)
        # ISO-8859-1 is a frequent mislabel, so second-guess it with nkf
        if not encoding or encoding.upper() == 'ISO-8859-1':
            encoding = nkf.guess(body)
        if encoding in ('BINARY', 'ISO-8859-1'):
            encoding = 'utf8'
        encoding = self.normalize_encoding(encoding)
        if encoding not in all_encodings:
            # Unknown label: have nkf transcode to UTF-8 instead
            return nkf.nkf('-w', body).decode('utf8')
        return body.decode(encoding, 'replace')
    @gen.coroutine
    def store_data(self):
        os.makedirs("pages", exist_ok=True)  # output directory
        client = httpclient.AsyncHTTPClient()
        # Fetches run one at a time: each yield waits for the response
        for i, url in enumerate(self.urls):
            if i % 500 == 0:
                print(i, url)  # progress report
            try:
                request = httpclient.HTTPRequest(url, follow_redirects=True, user_agent=UA,
                                                 validate_cert=False, allow_ipv6=False)
                response = yield client.fetch(request)
                # Flatten the URL into a filename under pages/
                url = url.replace("http://", "").replace("https://", "")
                filename = url.replace("/", " ").rstrip()
                filename = os.path.join("pages", filename)
                with open(filename, "w", encoding="utf-8") as fd:
                    body = self.decode(response.headers, response.body)
                    fd.write(body)
            except Exception as e:
                print('%s %s %s' % (type(e), e, url))
                continue
    def run(self):
        tornado.ioloop.IOLoop.current().run_sync(self.store_data)


if __name__ == "__main__":
    import sys
    with open(sys.argv[1]) as fd:
        urls = fd.read().splitlines()
    crawler = SimpleCrawler(urls)
    crawler.run()
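
Usage sketch: the script reads one URL per line from the file named on the command line and writes decoded pages under pages/. Driving it from another module might look like this (the module name simple_crawler is an assumption, not part of the gist):

# Assumption: the gist above is saved as simple_crawler.py
from simple_crawler import SimpleCrawler

urls = [
    'http://example.com/',     # saved as "pages/example.com"
    'http://example.org/a/b',  # saved as "pages/example.org a b"
]
SimpleCrawler(urls).run()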