KitaitiMakoto/similar.rb

## similar.rb
# coding: utf-8
require 'pp'
require 'logger'
require 'pathname'
require 'open-uri'
require 'nokogiri'
require 'groonga'

# Directory that holds this script; every local path below is derived from it.
current_dir = Pathname.new(__dir__).expand_path

# Site root and the series listing page where the crawl starts.
ORIGIN_URI = URI('http://ji-sedai.jp')
SERIAL_LP_URI = ORIGIN_URI.merge('/series/')

# Location of the Groonga database file created by setup_database.
DATABASE_PATH = current_dir.join('database', 'similar.db')
DATABASE_CONFIGURATION = { 'database' => DATABASE_PATH, 'encoding' => 'utf8' }

# Directory where downloaded pages are stored.
CONTENTS_DIR = current_dir.join('contents')
# CSS selector whose first match supplies a page's content in load_contents.
MAIN_AREA_SELECTOR = '.mod-wysiwyg'

# Runs the whole pipeline in order: prepare the database, fetch the pages,
# load them into the database, then print similar pages for each one.
def main
  %i[setup_database download_contents load_contents list_similar_contents].each do |step|
    send(step)
  end
end

# Prepares the Groonga database at DATABASE_PATH.
#
# First run: creates the parent directory, the database, the 'pages' table
# (title/content columns), a MeCab-tokenized index over pages.content with the
# stop-word token filter, and registers a handful of stop words.
#
# Later runs: the existing database is opened instead of created, so the
# script can be re-run (download already skips files it has fetched).
def setup_database
  DATABASE_PATH.dirname.mkpath

  if DATABASE_PATH.exist?
    # NOTE(review): assumes the existing file is a fully initialised database
    # from an earlier run, and that the plugin registration is persisted in
    # it - confirm; delete the database directory to rebuild from scratch.
    Groonga::Database.open DATABASE_PATH.to_path
    return
  end

  Groonga::Database.create path: DATABASE_PATH.to_path
  # The plugin must be registered before a table can use TokenFilterStopWord.
  Groonga::Context.default.register_plugin 'token_filters/stop_word'
  Groonga::Schema.create_table 'pages', type: :hash do |table|
    table.short_text 'title'
    table.text 'content'
  end
  Groonga::Schema.create_table 'pages_content_index',
                               type: :patricia_trie,
                               normalizer: :NormalizerAuto,
                               default_tokenizer: :TokenMecab,
                               token_filters: ['TokenFilterStopWord'] do |table|
    table.index 'pages.content', with_position: true
    table.boolean 'is_stop_word', type: :scalar
  end
  # Tokens flagged is_stop_word are the ones the stop-word filter looks for.
  stop_words = Groonga['pages_content_index']
  %w[の は 。 、 , .].each do |word|
    stop_words.add(word, is_stop_word: true)
  end
end

# Mirrors the site into CONTENTS_DIR: the series listing page, every page
# linked from '.mod-more a' on it, and every page linked from
# '.mod-entry .mod-cropimg a' on those pages.
#
# Each file is parsed inside a block so its handle is closed right away, and
# anchors without an href are skipped instead of crashing the crawl.
def download_contents
  CONTENTS_DIR.mkpath

  index_path = download(SERIAL_LP_URI)
  index_doc = index_path.open { |file| Nokogiri.HTML(file) }
  index_doc.css('.mod-more a').each do |series_link|
    series_href = series_link['href']
    next unless series_href

    series_path = download(series_href)
    series_doc = series_path.open { |file| Nokogiri.HTML(file) }
    series_doc.css('.mod-entry .mod-cropimg a').each do |entry_link|
      entry_href = entry_link['href']
      download entry_href if entry_href
    end
  end
end

# Loads every downloaded page (except index.html listings) into the 'pages'
# table, keyed by the page's URL on ORIGIN_URI.
#
# The content is the text of the first MAIN_AREA_SELECTOR element. Pages that
# lack that element are reported on stderr and skipped; a missing <title> is
# stored as an empty string. Both cases previously raised NoMethodError.
def load_contents
  Pathname.glob("#{CONTENTS_DIR}/**/*").each do |path|
    next unless path.file?
    next if path.basename.to_path == 'index.html'

    relative_path = path.relative_path_from(CONTENTS_DIR)
    # Block form closes the file as soon as the document has been parsed.
    doc = path.open { |file| Nokogiri.HTML(file) }

    main_area = doc.css(MAIN_AREA_SELECTOR).first
    unless main_area
      $stderr.puts "#{relative_path} has no #{MAIN_AREA_SELECTOR} element. Skip loading"
      next
    end
    title_node = doc.css('title').first
    title = title_node ? title_node.content : ''

    uri = ORIGIN_URI + "/#{relative_path}"
    Groonga['pages'].add uri.to_s, title: title, content: main_area.content
  end
end

# Prints, for every page in the 'pages' table, a header (title and URL between
# two rules) followed by the pages returned by a similar_search on its
# content, ordered by descending _score.
#
# The query is limited to 6 rows and the row for the page itself is skipped.
def list_similar_contents
  rule = "==================================="
  table = Groonga['pages']

  # Snapshot key/title/content first, then run one query per snapshot entry.
  snapshot = {}
  table.select.each do |page|
    snapshot[page['_key']] = { title: page.title, content: page.content }
  end

  snapshot.each do |page_uri, page|
    puts rule, page[:title], page_uri, rule

    matches = table.select { |candidate| candidate['content'].similar_search page[:content] }
    ranked = matches.sort([{ key: '_score', order: 'descending' }], limit: 6)
    ranked.each do |hit|
      next if hit['_key'] == page_uri

      puts format(" * %s(%s) - %s", hit['title'], hit['_key'], hit['_score'])
    end
  end
end

# Fetches +uri+ into CONTENTS_DIR unless a local copy already exists.
#
# The local file mirrors the URL path; URLs ending in '/' (or with no path at
# all) are stored as index.html in the matching directory. After an actual
# download the method sleeps for one second.
#
# @param uri [String, URI] absolute URL, or a reference relative to ORIGIN_URI
# @return [Pathname] local file that holds (or already held) the page
def download(uri)
  # Merging with the site root leaves absolute URLs untouched and turns
  # relative hrefs into fetchable URLs (a bare relative URI has no #read).
  uri = ORIGIN_URI + uri.to_s
  relative_path = uri.path.sub(%r{\A/}, '')
  if relative_path.empty? || relative_path.end_with?('/')
    relative_path += 'index.html'
  end
  path = CONTENTS_DIR.join(relative_path)

  unless path.dirname.directory?
    $stderr.puts "Making directory: #{path.dirname}..."
    path.dirname.mkpath
  end

  if path.file?
    $stderr.puts "#{path.relative_path_from(CONTENTS_DIR)} exists. Skip downloading"
  else
    $stderr.puts "Downloading: #{uri} -> #{path}"
    path.write uri.read
    sleep 1
  end
  path
end

# Run the pipeline only when this file is executed directly.
main if $PROGRAM_NAME == __FILE__
	# coding: utf-8
	require 'pp'
	require 'logger'
	require 'pathname'
	require 'open-uri'
	require 'nokogiri'
	require 'groonga'

	# Directory that holds this script; every local path below is derived from it.
	current_dir = Pathname.new(__dir__).expand_path

	# Site root and the series listing page where the crawl starts.
	ORIGIN_URI = URI('http://ji-sedai.jp')
	SERIAL_LP_URI = ORIGIN_URI.merge('/series/')

	# Location of the Groonga database file created by setup_database.
	DATABASE_PATH = current_dir.join('database', 'similar.db')
	DATABASE_CONFIGURATION = { 'database' => DATABASE_PATH, 'encoding' => 'utf8' }

	# Directory where downloaded pages are stored.
	CONTENTS_DIR = current_dir.join('contents')
	# CSS selector whose first match supplies a page's content in load_contents.
	MAIN_AREA_SELECTOR = '.mod-wysiwyg'

	# Runs the whole pipeline in order: prepare the database, fetch the pages,
	# load them into the database, then print similar pages for each one.
	def main
	  %i[setup_database download_contents load_contents list_similar_contents].each do |step|
	    send(step)
	  end
	end

	# Prepares the Groonga database at DATABASE_PATH.
	#
	# First run: creates the parent directory, the database, the 'pages' table
	# (title/content columns), a MeCab-tokenized index over pages.content with
	# the stop-word token filter, and registers a handful of stop words.
	#
	# Later runs: the existing database is opened instead of created, so the
	# script can be re-run (download already skips files it has fetched).
	def setup_database
	  DATABASE_PATH.dirname.mkpath

	  if DATABASE_PATH.exist?
	    # NOTE(review): assumes the existing file is a fully initialised database
	    # from an earlier run, and that the plugin registration is persisted in
	    # it - confirm; delete the database directory to rebuild from scratch.
	    Groonga::Database.open DATABASE_PATH.to_path
	    return
	  end

	  Groonga::Database.create path: DATABASE_PATH.to_path
	  # The plugin must be registered before a table can use TokenFilterStopWord.
	  Groonga::Context.default.register_plugin 'token_filters/stop_word'
	  Groonga::Schema.create_table 'pages', type: :hash do |table|
	    table.short_text 'title'
	    table.text 'content'
	  end
	  Groonga::Schema.create_table 'pages_content_index',
	                               type: :patricia_trie,
	                               normalizer: :NormalizerAuto,
	                               default_tokenizer: :TokenMecab,
	                               token_filters: ['TokenFilterStopWord'] do |table|
	    table.index 'pages.content', with_position: true
	    table.boolean 'is_stop_word', type: :scalar
	  end
	  # Each stop word is its own token; they must stay space-separated in %w[].
	  stop_words = Groonga['pages_content_index']
	  %w[の は 。 、 , .].each do |word|
	    stop_words.add(word, is_stop_word: true)
	  end
	end

	# Mirrors the site into CONTENTS_DIR: the series listing page, every page
	# linked from '.mod-more a' on it, and every page linked from
	# '.mod-entry .mod-cropimg a' on those pages.
	#
	# Each file is parsed inside a block so its handle is closed right away, and
	# anchors without an href are skipped instead of crashing the crawl.
	def download_contents
	  CONTENTS_DIR.mkpath

	  index_path = download(SERIAL_LP_URI)
	  index_doc = index_path.open { |file| Nokogiri.HTML(file) }
	  index_doc.css('.mod-more a').each do |series_link|
	    series_href = series_link['href']
	    next unless series_href

	    series_path = download(series_href)
	    series_doc = series_path.open { |file| Nokogiri.HTML(file) }
	    series_doc.css('.mod-entry .mod-cropimg a').each do |entry_link|
	      entry_href = entry_link['href']
	      download entry_href if entry_href
	    end
	  end
	end

	# Loads every downloaded page (except index.html listings) into the 'pages'
	# table, keyed by the page's URL on ORIGIN_URI.
	#
	# The content is the text of the first MAIN_AREA_SELECTOR element. Pages that
	# lack that element are reported on stderr and skipped; a missing <title> is
	# stored as an empty string.
	def load_contents
	  # '**/*' walks the whole tree; a '*/' pattern would match directories only
	  # and every entry would then be dropped by the file? check below.
	  Pathname.glob("#{CONTENTS_DIR}/**/*").each do |path|
	    next unless path.file?
	    next if path.basename.to_path == 'index.html'

	    relative_path = path.relative_path_from(CONTENTS_DIR)
	    # Block form closes the file as soon as the document has been parsed.
	    doc = path.open { |file| Nokogiri.HTML(file) }

	    main_area = doc.css(MAIN_AREA_SELECTOR).first
	    unless main_area
	      $stderr.puts "#{relative_path} has no #{MAIN_AREA_SELECTOR} element. Skip loading"
	      next
	    end
	    title_node = doc.css('title').first
	    title = title_node ? title_node.content : ''

	    uri = ORIGIN_URI + "/#{relative_path}"
	    Groonga['pages'].add uri.to_s, title: title, content: main_area.content
	  end
	end

	# Prints, for every page in the 'pages' table, a header (title and URL
	# between two rules) followed by the pages returned by a similar_search on
	# its content, ordered by descending _score.
	#
	# The query is limited to 6 rows and the row for the page itself is skipped.
	def list_similar_contents
	  # Snapshot key/title/content first, then run one query per snapshot entry.
	  pages = {}
	  Groonga['pages'].select.each do |record|
	    pages[record['_key']] = {
	      title: record.title,
	      content: record.content
	    }
	  end
	  pages.each_pair do |uri, data|
	    puts "==================================="
	    puts data[:title]
	    puts uri
	    puts "==================================="
	    results = Groonga['pages'].select { |record|
	      record['content'].similar_search data[:content]
	    }
	    results.sort([key: '_score', order: 'descending'], limit: 6).each do |record|
	      next if record['_key'] == uri
	      puts " * %s(%s) - %s" % [record['title'], record['_key'], record['_score']]
	    end
	  end
	end

	# Fetches +uri+ into CONTENTS_DIR unless a local copy already exists.
	#
	# The local file mirrors the URL path; URLs ending in '/' (or with no path at
	# all) are stored as index.html in the matching directory. After an actual
	# download the method sleeps for one second.
	#
	# @param uri [String, URI] absolute URL, or a reference relative to ORIGIN_URI
	# @return [Pathname] local file that holds (or already held) the page
	def download(uri)
	  # Merging with the site root leaves absolute URLs untouched and turns
	  # relative hrefs into fetchable URLs (a bare relative URI has no #read).
	  uri = ORIGIN_URI + uri.to_s
	  relative_path = uri.path.sub(%r{\A/}, '')
	  if relative_path.empty? || relative_path.end_with?('/')
	    relative_path += 'index.html'
	  end
	  path = CONTENTS_DIR.join(relative_path)

	  unless path.dirname.directory?
	    $stderr.puts "Making directory: #{path.dirname}..."
	    path.dirname.mkpath
	  end

	  if path.file?
	    $stderr.puts "#{path.relative_path_from(CONTENTS_DIR)} exists. Skip downloading"
	  else
	    $stderr.puts "Downloading: #{uri} -> #{path}"
	    path.write uri.read
	    sleep 1
	  end
	  path
	end

	# Run the pipeline only when this file is executed directly.
	main if $PROGRAM_NAME == __FILE__