Skip to content

Instantly share code, notes, and snippets.

@udonchan
Created May 3, 2010 12:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save udonchan/388033 to your computer and use it in GitHub Desktop.
Save udonchan/388033 to your computer and use it in GitHub Desktop.
#!/usr/bin/env ruby
# -*- coding: utf-8 -*-
require 'rubygems'
require 'net/http'
require 'net/https'
require 'uri'
require 'MeCab'
require 'extractcontent.rb'
# Ruby 1.8-style switch asking the interpreter to treat strings and regexps as
# UTF-8. NOTE(review): newer interpreters presumably ignore this assignment
# (possibly with a warning) -- confirm for the Ruby version actually targeted.
$KCODE='u'
# Selects the "1.2" flavour of the net/http API.
# NOTE(review): believed to be the default (a no-op) on current Rubies -- confirm.
Net::HTTP.version_1_2
# Counts how often each noun occurs in the main text of a web page.
#
# A page is fetched over HTTP(S) (following redirects), its main content is
# pulled out with ExtractContent, and that text is walked node by node with a
# MeCab tagger.
class TF
  # Value sent in the 'user-agent' request header.
  USER_AGENT = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.4) Gecko/20100413 Firefox/3.6.4'.freeze

  # Matches what has to be percent-encoded before a string is given to
  # URI.parse: a "%" that does not start a "%XX" escape, or any character
  # outside the URI reserved/unreserved sets. Existing "%XX" escapes are NOT
  # matched, so an already-encoded URL passes through unchanged.
  UNSAFE_URI_CHARS = /%(?![0-9A-Fa-f]{2})|[^\-_.!~*'()a-zA-Z\d;\/?:@&=+$,\[\]%]/

  # Builds the MeCab tagger and the content extractor used by #tf.
  def initialize
    @tagger = MeCab::Tagger.new('-O wakati')
    @extract_content = ExtractContent::Extractor.new({:decay_factor=>0.75})
  end

  # Fetches a page and counts the nouns in its extracted main content.
  #
  # @param html [String] URL of the page to analyse (despite the name, this is
  #   the address that gets fetched, not markup)
  # @return [Hash] noun surface form => number of occurrences; words that were
  #   never seen are simply absent (no default value is installed)
  # @raise [ArgumentError] when more than the allowed number of redirects occur
  def tf(html)
    counts = {}
    # analyse returns several values; only the first one is tokenised here.
    node = mecab_node(@extract_content.analyse(fetch(html).body).first)
    while node
      if /^名詞/ =~ node.feature
        word = node.surface
        counts[word] = counts.fetch(word, 0) + 1
      end
      node = node.next
    end
    counts
  end

  protected

  # Performs a GET request, following redirects.
  #
  # @param uri_str [String] absolute URL; characters that are not legal in a
  #   URI (e.g. raw Japanese text) are percent-encoded first
  # @param limit [Integer] how many more requests may be issued
  # @return [Net::HTTPSuccess] the final successful response
  # @raise [ArgumentError] when +limit+ reaches 0
  # @raise [Exception] whatever Net::HTTPResponse#error! raises for a
  #   non-success, non-followable response
  def fetch(uri_str, limit = 10)
    raise ArgumentError, 'http redirect too deep' if limit == 0

    uri = URI.parse(escape_uri(uri_str))
    # Honour the port and scheme of the URL instead of always using plain
    # HTTP on the default port.
    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = (uri.scheme == 'https')
    response = http.start do |conn|
      conn.get(uri.request_uri, {'user-agent' => USER_AGENT})
    end

    case response
    when Net::HTTPSuccess then response
    when Net::HTTPRedirection
      location = response['Location']
      # A redirection-class response without a target cannot be followed.
      response.error! unless location
      # merge resolves a relative Location against the URL just requested and
      # returns an absolute Location unchanged.
      fetch(uri.merge(escape_uri(location)).to_s, limit - 1)
    else
      response.error!
    end
  end

  # Hands +context+ to the MeCab tagger and returns the first node of the
  # resulting node list.
  def mecab_node(context)
    @tagger.parseToNode(context)
  end

  private

  # Percent-encodes every character matched by UNSAFE_URI_CHARS, byte by byte.
  # Applying it to its own output changes nothing.
  def escape_uri(str)
    str.gsub(UNSAFE_URI_CHARS) do |ch|
      ch.unpack('C*').map { |byte| format('%%%02X', byte) }.join
    end
  end
end
# Demo run: print every noun found on the page together with its count.
noun_counts = TF.new.tf('http://ja.wikipedia.org/wiki/沢城みゆき')
noun_counts.each_pair { |word, count| puts "#{word} : #{count}" }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment