A low-effort simple crawler
# -*- coding: utf-8 -*-
import os
import re
from encodings.aliases import aliases

import nkf
import tornado.ioloop
from tornado import httpclient, gen

UA = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
      'Chrome/31.0.1650.57 Safari/537.36')

httpclient.AsyncHTTPClient.configure(None, max_clients=1000)

# Every encoding name Python knows, canonical names and aliases alike
all_encodings = set(aliases.values()) | set(aliases.keys())
# charset in the Content-Type header (str) and in a <meta> tag (bytes)
header_encoding_pattern = re.compile(r'charset=([\w\-]+)', re.I)
meta_encoding_pattern = re.compile(rb'<meta [^>]*charset="?([^">\s]+)', re.I)


class SimpleCrawler(object):
    def __init__(self, urls):
        self.urls = urls

    def extract_encoding_by_request(self, headers, body):
        """Pull the charset out of the Content-Type header, falling back
        to the <meta> tag in the body."""
        encoding = None
        content_type = headers.get('Content-Type')
        if content_type:
            m = header_encoding_pattern.search(content_type)
            if m:
                encoding = m.group(1)
        if not encoding:
            m = meta_encoding_pattern.search(body)
            if m:
                # body is bytes, so the match is bytes too
                encoding = m.group(1).decode('ascii', 'replace')
        return encoding

    def normalize_encoding(self, encoding):
        # Python's encoding aliases use underscores, not hyphens
        encoding = encoding.lower().replace('-', '_')
        if encoding in ('windows_31j', 'shift_jis', 'x_sjis', 'sjis'):
            return 'cp932'  # cp932 is a superset of Shift_JIS
        return encoding
    def decode(self, headers, body):
        encoding = self.extract_encoding_by_request(headers, body)
        # ISO-8859-1 is a frequent mislabel, so second-guess it with nkf
        if not encoding or encoding.upper() == 'ISO-8859-1':
            encoding = nkf.guess(body)
        if encoding in ('BINARY', 'ISO-8859-1'):
            encoding = 'utf8'
        encoding = self.normalize_encoding(encoding)
        if encoding not in all_encodings:
            # Unknown label: have nkf transcode to UTF-8 instead
            return nkf.nkf('-w', body).decode('utf8')
        return body.decode(encoding, 'replace')
    @gen.coroutine
    def store_data(self):
        os.makedirs("pages", exist_ok=True)  # output directory
        client = httpclient.AsyncHTTPClient()
        # Fetches run one at a time: each yield waits for the response
        for i, url in enumerate(self.urls):
            if i % 500 == 0:
                print(i, url)  # progress report
            try:
                request = httpclient.HTTPRequest(url, follow_redirects=True, user_agent=UA,
                                                 validate_cert=False, allow_ipv6=False)
                response = yield client.fetch(request)
                # Flatten the URL into a filename under pages/
                url = url.replace("http://", "").replace("https://", "")
                filename = url.replace("/", " ").rstrip()
                filename = os.path.join("pages", filename)
                with open(filename, "w", encoding="utf-8") as fd:
                    body = self.decode(response.headers, response.body)
                    fd.write(body)
            except Exception as e:
                print('%s %s %s' % (type(e), e, url))
                continue
    def run(self):
        tornado.ioloop.IOLoop.current().run_sync(self.store_data)


if __name__ == "__main__":
    import sys
    with open(sys.argv[1]) as fd:
        urls = fd.read().splitlines()
    crawler = SimpleCrawler(urls)
    crawler.run()
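
Usage sketch: the script reads one URL per line from the file named on the command line and writes decoded pages under pages/. Driving it from another module might look like this (the module name simple_crawler is an assumption, not part of the gist):

# Assumption: the gist above is saved as simple_crawler.py
from simple_crawler import SimpleCrawler

urls = [
    'http://example.com/',     # saved as "pages/example.com"
    'http://example.org/a/b',  # saved as "pages/example.org a b"
]
SimpleCrawler(urls).run()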