Created
November 12, 2008 08:40
-
-
Save koyachi/24110 to your computer and use it in GitHub Desktop.
SimpleCrawler.rb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# -*- coding: utf-8 -*- | |
require 'rubygems' | |
require 'open-uri' | |
require 'nokogiri' | |
require 'extlib' | |
require 'net/http' | |
require 'uri' | |
require 'tmpdir' | |
require 'digest/md5' | |
# memo | |
# Crawler | |
# Simple | |
# Fast | |
# Process | |
# Feature | |
# URLPool | |
# Util(=>Nokogiri?) | |
# Storage(ユーザ側で勝手にしてもらうほうがいいかも) | |
# TODO, IDEA | |
# v register_process | |
# v 明示的な継承した時点でregister_processする? require時とか -> inherited | |
# v via xxx(NotifyURLsVIA(Processer)) | |
# v Util#download | |
# v 実行フォルダ/tmp/yyyymmdd_hhmmss/ -> 指定フォルダ | |
# - url cache | |
# - ブラウザから右クリックでxpath通知 | |
# - 標準添付processerを別ファイルに | |
module SimpleCrawler
  # Forward declaration so processer classes below can subclass it;
  # the full implementation is reopened further down in this file.
  class Processer; end

  # Prints url + title + via information for every crawled URL.
  class NotifyURLsVIA < SimpleCrawler::Processer
    # Matches every URL.
    def match(url)
      true
    end

    # Logs the page title, the URL itself and the referring page.
    # +content+ is nil for non-HTML resources (see the crawler's run loop).
    def process(url, content, via)
      # BUG FIX: guard against documents with no <title> element — the old
      # code called inner_html on nil and raised NoMethodError.
      title = ''
      if content
        node = nokogiri(content).xpath('//title')[0]
        title = node.inner_html.strip if node
      end
      # BUG FIX: this heredoc was never terminated (the PRINT line was
      # missing), which made the whole file a syntax error.
      log <<PRINT
title: #{title}
url: #{url}
via: #{via[:url]}
#{via[:info]}
PRINT
      # download url
    end
  end

  # module Downloader
  #   def process(url, content, via)
  #     download url
  #   end
  # end
  # class ImageDownloader
  #   include Downloader
  # end
  # class MusicDownloader
  #   include Downloader
  # end
  # class VideoDownloader
  #   include Downloader
  # end

  module Music
    class MySpace
    end
    class EightTracks
      def match(url)
        # BUG FIX: the host was misspelled "8trakcs.com" (and the dot is
        # now escaped so it matches literally).
        url =~ %r!http://8tracks\.com/.*?/.*!
      end
      def process(url, content, via)
      end
    end
    class MP3Blogs
    end
  end

  module Video
    class Youtube
    end
    class Niconico
    end
  end

  module FileHost
    class ZShare
    end
    class Senduit
    end
  end
end
module SimpleCrawler
  class << self
    # Working directory for downloaded files; created at load time if missing.
    cattr_accessor :work_dir
    self.work_dir = Dir.tmpdir + '/simple_crawler'
    Dir.mkdir(self.work_dir) unless File.directory? self.work_dir
  end

  # Mixed into registered processers so they can feed newly discovered
  # URLs back into the crawler's queue.
  module URLPool
    def set_crawler(c)
      @crawler = c
    end

    def push_url(*url)
      @crawler.url_queue.push(*url)
    end
  end

  # Helper methods mixed into every registered processer.
  module Util
    # Downloads +url+ into +savedir+ (default: SimpleCrawler.work_dir).
    # The local filename is the MD5 hex digest of the URL; the extension
    # comes from +type+ unless it is 'blob', in which case the URL's own
    # extension is used. Returns [filename, content].
    def download(url, savedir='', type='blob')
      # BUG FIX: the old `... or ''` clause was dead code (`or` binds
      # looser than `=`); File.extname already returns '' when absent.
      ext = (type == 'blob') ? File.extname(url) : ".#{type}"
      # BUG FIX: the directory variable was the undefined local `dir`;
      # it must be the `savedir` argument.
      basedir = (savedir == '') ? SimpleCrawler.work_dir : savedir
      filename = "#{basedir}/#{Digest::MD5.hexdigest(url)}#{ext}"
      # BUG FIX: the destination was a garbled literal instead of the
      # interpolated filename.
      print "download #{url} to #{filename}\n"
      content = open(url).read
      # 'wb' so binary payloads (images, mp3s) are not newline-mangled.
      File.open(filename, "wb") {|f|
        f.write content
      }
      [filename, content]
    end

    def nokogiri(content)
      Nokogiri::HTML(content)
    end
  end

  # Base class for all processers. Subclassing automatically registers
  # the subclass with the crawler via the inherited hook.
  class Processer
    def initialize
    end

    # Per-processer setup hook; overridden by subclasses as needed.
    def setup
    end

    # Return true when this processer wants to handle +url+.
    def match(url)
      false
    end

    # Handle a crawled page. +content+ is the HTML body, or nil for
    # non-HTML resources; +via+ describes where the URL came from.
    def process(url, content, via)
    end

    def self.inherited(subclass)
      SimpleCrawler.register_process subclass
    end

    # No-op by default; output_processer_log installs a printing version.
    def log(msg)
    end
  end

  class << self
    cattr_accessor :processers, :url_queue
    self.processers, self.url_queue = [], []

    # Registers one processer class (or an array of them): mixes in
    # URLPool/Util, wires the crawler into #initialize, and instantiates
    # it. Classes on the skip list (NotifyURLsVIA) are only registered
    # when +force+ is true.
    def register_process(user_class=Processer, force=false)
      # BUG FIX: this used to be a nested `def mixin`, which defined a
      # module method that collided with the different-arity `def mixin`
      # nested in output_processer_log — whichever ran last clobbered the
      # other. A local lambda cannot collide.
      mixin = lambda do |cls, frc|
        # `return` inside a lambda exits only the lambda, not this method.
        return if !frc && %w[NotifyURLsVIA].any? {|c| c == cls.name }
        cls.class_eval <<-MIXIN
          include SimpleCrawler::URLPool
          include SimpleCrawler::Util
          def initialize(crawler)
            setup()
            set_crawler(crawler)
          end
        MIXIN
        self.processers << cls.new(self)
      end
      if user_class.instance_of? Array then
        user_class.each do |cls|
          mixin.call(cls, force)
        end
      else
        mixin.call(user_class, force)
      end
    end

    # Crawls +url+ (a String or an Array of Strings). Each queue entry is
    # a [url, via] pair; +via+ records the referring page.
    def run(url)
      # BUG FIX: a single String used to build the flat queue [url, via],
      # so iteration popped the via hash as if it were a URL. It must be
      # wrapped as one [url, via] entry.
      self.url_queue =
        if url.instance_of? String
          [[url, {:url => 'root', :info => ''}]]
        elsif url.instance_of? Array
          url.map {|u| [u, {:url => 'root', :info => ''}] }
        else
          []
        end
      self.url_queue.each do |u, via|
        print "%-10s %s\n" % ["pop", u]
        self.processers.each do |p|
          next unless p.match(u)
          print "%-10s %s\n" % ["process", p.class.name]
          uri_parts = URI.split(u)
          host, port, path = uri_parts[2], uri_parts[3],
            (uri_parts[5] == '') ? '/index.html' : uri_parts[5]
          begin
            Net::HTTP.start(host, port) {|http|
              # HEAD first so non-HTML resources are not fetched here;
              # processers receive nil content for those.
              response = http.head(path)
              if response['content-type'] =~ %r!text/html! then
                response = http.get(path).body
              else
                response = nil
              end
              p.process(u, response, via)
              # Be polite to the remote host.
              sleep 1
            }
          rescue Timeout::Error
            print "timeout\n"
          end
          # Only the first matching processer handles a URL.
          break
        end
        print "\n"
      end
    end

    # Installs a printing #log into the given processer class(es).
    def output_processer_log(user_class)
      # BUG FIX: lambda instead of a nested `def mixin` — see
      # register_process for the collision this avoids.
      mixin = lambda do |cls|
        cls.class_eval <<-MIXIN
          def log(msg)
            print msg + "\n"
          end
        MIXIN
      end
      if user_class.instance_of? Array then
        user_class.each do |cls|
          mixin.call(cls)
        end
      else
        mixin.call(user_class)
      end
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment