Skip to content

Instantly share code, notes, and snippets.

@koyachi
Created November 12, 2008 08:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save koyachi/24110 to your computer and use it in GitHub Desktop.
Save koyachi/24110 to your computer and use it in GitHub Desktop.
SimpleCrawler.rb
#!/usr/bin/env ruby
# -*- coding: utf-8 -*-
require 'rubygems'
require 'open-uri'
require 'nokogiri'
require 'extlib'
require 'net/http'
require 'uri'
require 'tmpdir'
require 'digest/md5'
# memo
# Crawler
# Simple
# Fast
# Process
# Feature
# URLPool
# Util(=>Nokogiri?)
# Storage (it may be better to leave this entirely to the user)
# TODO, IDEA
# v register_process
# v call register_process at the moment a class explicitly inherits? or at require time? -> inherited
# v via xxx(NotifyURLsVIA(Processer))
# v Util#download
# v <run dir>/tmp/yyyymmdd_hhmmss/ -> user-specified directory
# - url cache
# - report an xpath by right-clicking in the browser
# - move the bundled processers into a separate file
module SimpleCrawler
  # Forward declaration so the bundled processers below can name their
  # superclass. The real behaviour (default hooks, the +inherited+
  # auto-registration) is added when the class is reopened later in the file.
  class Processer; end

  # Displays url + title + via for every URL it is given.
  #
  # Because this class is created while Processer is still the empty forward
  # declaration, the +inherited+ hook defined later does not fire for it; it
  # has to be registered explicitly through SimpleCrawler.register_process.
  # +nokogiri+ comes from SimpleCrawler::Util (mixed in on registration) and
  # +log+ from Processer.
  class NotifyURLsVIA < SimpleCrawler::Processer
    # Accepts every URL.
    def match(url)
      true
    end

    # Logs the page title, the URL and where the URL came from.
    #
    # url     - the URL that was fetched.
    # content - the HTML body, or nil when the resource was not HTML.
    # via     - Hash with :url and :info describing the referrer.
    def process(url, content, via)
      # A page may legitimately have no <title>; xpath(...)[0] is nil then,
      # so guard before calling inner_html instead of raising NoMethodError.
      title_node = content.nil? ? nil : nokogiri(content).xpath('//title')[0]
      title = title_node.nil? ? '' : title_node.inner_html.strip
      log <<-PRINT
title: #{title}
url: #{url}
via: #{via[:url]}
#{via[:info]}
      PRINT
      # download url
    end
  end

  # module Downloader
  # def process(url, content, via)
  # download url
  # end
  # end
  # class ImageDownloader
  # include Downloader
  # end
  # class MusicDownloader
  # include Downloader
  # end
  # class VideoDownloader
  # include Downloader
  # end

  # Placeholders for music-site processers.
  module Music
    class MySpace
    end

    # Stub for 8tracks mixes. It does not inherit from Processer, so nothing
    # in this file registers it.
    class EightTracks
      # Returns the match index (truthy) for URLs of the form
      # http://8tracks.com/<user>/<mix>, nil otherwise.
      def match(url)
        # The host used to be misspelled ("8trakcs") and its dots were
        # unescaped, so no real 8tracks URL could ever match.
        url =~ %r!http://8tracks\.com/.*?/.*!
      end

      # Not implemented yet.
      def process(url, content, via)
      end
    end

    class MP3Blogs
    end
  end

  # Placeholders for video-site processers.
  module Video
    class Youtube
    end

    class Niconico
    end
  end

  # Placeholders for file-hosting processers.
  module FileHost
    class ZShare
    end

    class Senduit
    end
  end
end
module SimpleCrawler
class << self
  # Directory used by Util#download when no target directory is given.
  # cattr_accessor is not defined in this file (presumably supplied by the
  # extlib gem required at the top).
  cattr_accessor :work_dir

  # Default to <system tmpdir>/simple_crawler and create it at load time.
  self.work_dir = "#{Dir.tmpdir}/simple_crawler"
  Dir.mkdir(work_dir) unless File.directory?(work_dir)
end
# Mixin that lets a processer feed more URLs back to the crawler.
module URLPool
  # Remembers the crawler whose queue push_url appends to.
  def set_crawler(c)
    @crawler = c
  end

  # Appends every given entry to the crawler's url_queue and returns
  # whatever the queue's push returns.
  def push_url(*url)
    queue = @crawler.url_queue
    queue.push(*url)
  end
end
# Helper methods mixed into every registered processer.
module Util
  # Fetches +url+ and stores the body on disk.
  #
  # url     - String URL to fetch; also hashed (MD5) to build the file name.
  # savedir - target directory; '' (the default) or nil means
  #           SimpleCrawler.work_dir.
  # type    - file extension without the dot, or 'blob' to reuse the
  #           extension found in +url+ (possibly none).
  #
  # Returns [filename, content].
  def download(url, savedir='', type='blob')
    # File.extname never returns nil, so no fallback is needed here.
    ext = (type != 'blob') ? ".#{type}" : File.extname(url)
    # This used to reference an undefined local `dir`, which raised
    # NameError whenever a save directory was actually passed.
    target_dir = (savedir.nil? || savedir == '') ? SimpleCrawler.work_dir : savedir
    filename = "#{target_dir}/#{Digest::MD5.hexdigest(url)}#{ext}"
    print "download #{url} to #{filename}\n"
    # Block form closes the handle as soon as the body has been read.
    # NOTE(review): this relies on open-uri teaching Kernel#open about URLs;
    # newer Rubies dropped that and expect URI.open -- confirm the target
    # Ruby version before upgrading.
    content = open(url) { |io| io.read }
    # Binary mode: the payload may be an image, audio, etc.
    File.open(filename, 'wb') { |f| f.write content }
    [filename, content]
  end

  # Parses an HTML string into a Nokogiri document.
  def nokogiri(content)
    Nokogiri::HTML(content)
  end
end
# Base class for crawler processers. Every hook is a no-op by default;
# subclasses override the ones they need.
class Processer
  # Every new subclass is handed to SimpleCrawler.register_process as soon
  # as it is created.
  def self.inherited(subclass)
    SimpleCrawler.register_process subclass
  end

  def initialize; end

  # Hook for one-time preparation; does nothing by default.
  def setup; end

  # Should this processer handle +url+? The base class never does.
  def match(url)
    false
  end

  # Handles a URL accepted by #match; does nothing by default.
  def process(url, content, via); end

  # Logging hook; silent by default.
  def log(msg); end
end
class << self
# Crawler state: the registered processer instances and the queue of
# entries still to visit. Both start out empty.
cattr_accessor :processers, :url_queue
self.processers = []
self.url_queue = []
# Registers one processer class, or an Array of them, with the crawler.
#
# Each class gets SimpleCrawler::URLPool and SimpleCrawler::Util mixed in
# and an #initialize(crawler) that calls #setup and #set_crawler; one
# instance is then created and appended to +processers+.
#
# user_class - a Class or an Array of Classes.
# force      - when false, classes whose name is listed in the skip list
#              below are ignored.
#
# Returns the +processers+ list.
def register_process(user_class=Processer, force=false)
  # Array() wraps a single class so both call shapes share one loop. The
  # work is done inline: the previous nested `def mixin` (re)defined a
  # public method on the crawler on every call and clashed with the
  # one-argument `mixin` that output_processer_log defines.
  Array(user_class).each do |cls|
    # NOTE(review): the skip list holds an unqualified name; a class nested
    # in a module reports a qualified name from #name, so confirm this guard
    # matches what it is meant to match.
    next if !force && %w[NotifyURLsVIA].any? { |c| c == cls.name }

    cls.class_eval do
      include SimpleCrawler::URLPool
      include SimpleCrawler::Util

      def initialize(crawler)
        setup
        set_crawler(crawler)
      end
    end
    processers << cls.new(self)
  end
  processers
end
# Crawls starting from +url+.
#
# url - a String URL or an Array of String URLs; anything else results in
#       an empty queue.
#
# Every queue entry is a [url, via] pair, where via is a Hash with :url and
# :info describing where the URL came from (seeds use 'root'). For each
# entry the first processer whose #match accepts the URL is used: a HEAD
# request decides whether the body is fetched (only text/html is), then
# #process is called with the body (or nil) and the via Hash.
#
# Returns the queue.
def run(url)
  root_via = lambda { {:url => 'root', :info => ''} }
  # A single String must still become a list of [url, via] pairs. It used
  # to be stored as the flat pair itself, so the loop below yielded the URL
  # with via == nil and then the via Hash as if it were a URL.
  self.url_queue =
    case url
    when String then [[url, root_via.call]]
    when Array then url.map { |u| [u, root_via.call] }
    else []
    end

  # The queue is walked in place; entries appended while this loop is
  # running (URLPool#push_url pushes onto the same queue) are visited too.
  url_queue.each do |target, via|
    print "%-10s %s\n" % ["pop", target]
    # Only the first matching processer handles an entry.
    processer = processers.find { |candidate| candidate.match(target) }
    if processer
      print "%-10s %s\n" % ["process", processer.class.name]
      parts = URI.split(target)
      host, port = parts[2], parts[3]
      # NOTE(review): an empty path is requested as /index.html rather
      # than / -- kept as is, confirm this is intended.
      path = (parts[5] == '') ? '/index.html' : parts[5]
      # Keep the query string; it used to be dropped from the request.
      path = "#{path}?#{parts[7]}" if parts[7]
      begin
        Net::HTTP.start(host, port) do |http|
          head = http.head(path)
          # Fetch the body only for HTML; other resources go through as nil.
          content = (head['content-type'] =~ %r!text/html!) ? http.get(path).body : nil
          processer.process(target, content, via)
          # Pause between requests.
          sleep 1
        end
      rescue Timeout::Error
        print "timeout\n"
      end
    end
    print "\n"
  end
end
# Turns on log output for one processer class, or an Array of them, by
# (re)defining #log on each class so that it prints the message followed by
# a newline.
#
# user_class - a Class or an Array of Classes.
def output_processer_log(user_class)
  # Done inline instead of through a nested `def mixin`: that helper was
  # (re)defined as a public method on the crawler on every call and, with
  # its different arity, clashed with the `mixin` register_process defines.
  Array(user_class).each do |cls|
    cls.class_eval do
      def log(msg)
        print msg + "\n"
      end
    end
  end
end
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment