Last active
August 29, 2015 13:56
-
-
Save piroyoung/8895007 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/ruby | |
#! ruby -EUTF-8 | |
# -*- mode:ruby; coding:utf-8 -*- | |
require 'MeCab' | |
require 'open-uri' | |
require 'nokogiri' | |
# Scraping targets: parallel arrays of page URL and matching CSS selector.
url = []
selector = []
[
  ['https://news.google.co.jp/',          'span.titletext'],
  ['http://hatenablog.com',               'a.entry-title'],
  ['http://news.yahoo.co.jp',             'body'],
  ['http://headline.2ch.net/bbynews/',    'a']
].each do |target_url, css_selector|
  url << target_url
  selector << css_selector
end
# Frequency counter used by AnalyzeHTML#countUpNoun: a bare `count` call
# returns a Hash mapping each element to its number of occurrences.
#
# NOTE(review): this monkey-patches the built-in Array#count, which normally
# takes an optional argument or block. To avoid silently breaking other
# callers, the built-in behavior is preserved whenever an argument or block
# is given; only the bare call returns the frequency hash.
class Array
  def count(*args, &block)
    # Delegate argument/block forms to the stock implementation.
    return super unless args.empty? && block.nil?
    each_with_object(Hash.new(0)) { |item, freq| freq[item] += 1 }
  end
end
# Fetches a web page, extracts the text of nodes matching a CSS selector,
# and runs MeCab morphological analysis over the extracted lines.
class AnalyzeHTML
  # The extracted plain-text lines, one entry per matched node.
  attr_reader :text

  # url      - page to fetch (read via open-uri)
  # selector - CSS selector whose matching nodes supply the text
  def initialize(url, selector)
    # Fetch via open-uri, remembering the charset the server declared.
    # NOTE(review): on Ruby >= 3.0 Kernel#open no longer accepts URLs;
    # use URI.open there — confirm the target Ruby version.
    html = open(url) do |f|
      @charset = f.charset
      f.read
    end
    doc = Nokogiri::HTML.parse(html, nil, @charset)
    # Strip HTML tags from each matched node, keeping plain text only.
    @text = []
    doc.css(selector).each do |txt|
      @text << txt.to_s.gsub(/<("[^"]*"|'[^']*'|[^'">])*>/, '')
    end
    # Morphological analyzer (ChaSen-compatible output format).
    @m = MeCab::Tagger.new('-Ochasen')
  end

  # Prints every extracted line.
  def listUp
    @text.each { |line| puts line }
  end

  # Prints the full morphological analysis of every extracted line.
  def listUpFeatures
    @text.each { |line| puts @m.parse(line) }
  end

  # Prints every noun found in the extracted text.
  def listUpNoun
    each_noun { |noun| puts noun }
  end

  # Prints "noun : count" for every noun occurring at least `border`
  # times, most frequent first.
  def countUpNoun(border)
    # Count occurrences locally instead of relying on the Array#count
    # monkey-patch, so this method works on its own.
    freq = Hash.new(0)
    each_noun { |noun| freq[noun] += 1 }
    freq.sort_by { |_noun, n| -n }.each do |noun, n|
      puts "#{noun} : #{n}" if n >= border
    end
  end

  private

  # Yields the surface form of each noun in @text, skipping numbers,
  # symbols, punctuation, brackets and suffixes (feature categories are
  # MeCab's Japanese part-of-speech labels).
  def each_noun
    @text.each do |line|
      node = @m.parseToNode(line)
      # node.stat == 3 marks the end-of-sentence node.
      until node.stat == 3
        node = node.next
        # Drop spaces, then split the feature CSV; the first element is
        # the part of speech, the second its sub-category.
        feature = node.feature.to_s.gsub(/ /, '').split(",")
        next unless /名詞/u =~ feature[0].force_encoding('utf-8')
        next if /数|記号|読点|括弧|接尾/u =~ feature[1].force_encoding('utf-8')
        yield "#{node.surface}"
      end
    end
  end
end
# One analyzer per target site (only Google News is active by default).
google_news = AnalyzeHTML.new(url[0], selector[0])
# hatena_blog = AnalyzeHTML.new(url[1], selector[1])
# yahoo_news = AnalyzeHTML.new(url[2], selector[2])
# nich_news = AnalyzeHTML.new(url[3], selector[3])

# Run the report: nouns appearing at least 3 times on Google News.
# google_news.listUp
# google_news.listUpFeatures
# google_news.listUpNoun
google_news.countUpNoun(3)
# hatena_blog.countUpNoun(2)
# hatena_blog.listUpNoun
# yahoo_news.countUpNoun(3)
# nich_news.countUpNoun(3)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment