Last active
August 29, 2015 13:56
-
-
Save piroyoung/8895007 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/ruby | |
#! ruby -EUTF-8 | |
# -*- mode:ruby; coding:utf-8 -*- | |
require 'MeCab' | |
require 'open-uri' | |
require 'nokogiri' | |
# Scraping targets: parallel arrays of page URL and matching CSS selector.
url = []
selector = []
[
  ['https://news.google.co.jp/',          'span.titletext'],
  ['http://hatenablog.com',               'a.entry-title'],
  ['http://news.yahoo.co.jp',             'body'],
  ['http://headline.2ch.net/bbynews/',    'a']
].each do |target_url, css_selector|
  url << target_url
  selector << css_selector
end
# Frequency counter used by AnalyzeHTML#countUpNoun: a bare `count` call
# returns a Hash mapping each element to its number of occurrences.
#
# NOTE(review): this monkey-patches the built-in Array#count, which normally
# takes an optional argument or block. To avoid silently breaking other
# callers, the built-in behavior is preserved whenever an argument or block
# is given; only the bare call returns the frequency hash.
class Array
  def count(*args, &block)
    # Delegate argument/block forms to the stock implementation.
    return super unless args.empty? && block.nil?
    each_with_object(Hash.new(0)) { |item, freq| freq[item] += 1 }
  end
end
# Fetches a web page, extracts the text of nodes matching a CSS selector,
# and runs MeCab morphological analysis over the extracted lines.
class AnalyzeHTML
  # The extracted plain-text lines, one entry per matched node.
  attr_reader :text

  # url      - page to fetch (read via open-uri)
  # selector - CSS selector whose matching nodes supply the text
  def initialize(url, selector)
    # Fetch via open-uri, remembering the charset the server declared.
    # NOTE(review): on Ruby >= 3.0 Kernel#open no longer accepts URLs;
    # use URI.open there — confirm the target Ruby version.
    html = open(url) do |f|
      @charset = f.charset
      f.read
    end
    doc = Nokogiri::HTML.parse(html, nil, @charset)
    # Strip HTML tags from each matched node, keeping plain text only.
    @text = []
    doc.css(selector).each do |txt|
      @text << txt.to_s.gsub(/<("[^"]*"|'[^']*'|[^'">])*>/, '')
    end
    # Morphological analyzer (ChaSen-compatible output format).
    @m = MeCab::Tagger.new('-Ochasen')
  end

  # Prints every extracted line.
  def listUp
    @text.each { |line| puts line }
  end

  # Prints the full morphological analysis of every extracted line.
  def listUpFeatures
    @text.each { |line| puts @m.parse(line) }
  end

  # Prints every noun found in the extracted text.
  def listUpNoun
    each_noun { |noun| puts noun }
  end

  # Prints "noun : count" for every noun occurring at least `border`
  # times, most frequent first.
  def countUpNoun(border)
    # Count occurrences locally instead of relying on the Array#count
    # monkey-patch, so this method works on its own.
    freq = Hash.new(0)
    each_noun { |noun| freq[noun] += 1 }
    freq.sort_by { |_noun, n| -n }.each do |noun, n|
      puts "#{noun} : #{n}" if n >= border
    end
  end

  private

  # Yields the surface form of each noun in @text, skipping numbers,
  # symbols, punctuation, brackets and suffixes (feature categories are
  # MeCab's Japanese part-of-speech labels).
  def each_noun
    @text.each do |line|
      node = @m.parseToNode(line)
      # node.stat == 3 marks the end-of-sentence node.
      until node.stat == 3
        node = node.next
        # Drop spaces, then split the feature CSV; the first element is
        # the part of speech, the second its sub-category.
        feature = node.feature.to_s.gsub(/ /, '').split(",")
        next unless /名詞/u =~ feature[0].force_encoding('utf-8')
        next if /数|記号|読点|括弧|接尾/u =~ feature[1].force_encoding('utf-8')
        yield "#{node.surface}"
      end
    end
  end
end
# One analyzer per target site (only Google News is active by default).
google_news = AnalyzeHTML.new(url[0], selector[0])
# hatena_blog = AnalyzeHTML.new(url[1], selector[1])
# yahoo_news = AnalyzeHTML.new(url[2], selector[2])
# nich_news = AnalyzeHTML.new(url[3], selector[3])

# Run the report: nouns appearing at least 3 times on Google News.
# google_news.listUp
# google_news.listUpFeatures
# google_news.listUpNoun
google_news.countUpNoun(3)
# hatena_blog.countUpNoun(2)
# hatena_blog.listUpNoun
# yahoo_news.countUpNoun(3)
# nich_news.countUpNoun(3)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment