Last active
September 24, 2015 15:06
-
-
Save KitaitiMakoto/37d162b4f030d6c73e11 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
require 'pp' | |
require 'logger' | |
require 'pathname' | |
require 'open-uri' | |
require 'nokogiri' | |
require 'groonga' | |
# Directory containing this script; every on-disk location below is derived from it.
current_dir = Pathname(__dir__).expand_path

# Root of the site being crawled and the series landing page beneath it.
ORIGIN_URI = URI('http://ji-sedai.jp')
SERIAL_LP_URI = ORIGIN_URI + '/series/'

# Location of the Groonga database file: <script dir>/database/similar.db.
DATABASE_PATH = current_dir + 'database' + 'similar.db'
# Frozen so the shared settings hash cannot be mutated by accident.
DATABASE_CONFIGURATION = {
  'database' => DATABASE_PATH,
  'encoding' => 'utf8'
}.freeze

# Directory where downloaded pages are stored: <script dir>/contents.
CONTENTS_DIR = current_dir + 'contents'
# CSS selector of the element whose text is stored as a page's content.
MAIN_AREA_SELECTOR = '.mod-wysiwyg'.freeze
# Runs the whole pipeline, one step after another: create the database,
# download the pages, load them into the database, then print similar pages.
def main
  %i[setup_database download_contents load_contents list_similar_contents].each do |step|
    send(step)
  end
end
# Creates the Groonga database at DATABASE_PATH and defines its schema:
# a hash table 'pages' (title, content) and a patricia-trie table
# 'pages_content_index' that indexes pages.content with the TokenMecab
# tokenizer and the stop-word token filter. A fixed list of particles and
# punctuation marks is then registered as stop words.
def setup_database
  DATABASE_PATH.dirname.mkpath
  Groonga::Database.create(path: DATABASE_PATH.to_path)
  Groonga::Context.default.register_plugin('token_filters/stop_word')

  Groonga::Schema.create_table('pages', type: :hash) do |pages|
    pages.short_text('title')
    pages.text('content')
  end

  lexicon_options = {
    type: :patricia_trie,
    normalizer: :NormalizerAuto,
    default_tokenizer: :TokenMecab,
    token_filters: ['TokenFilterStopWord']
  }
  Groonga::Schema.create_table('pages_content_index', **lexicon_options) do |lexicon|
    lexicon.index('pages.content', with_position: true)
    lexicon.boolean('is_stop_word', type: :scalar)
  end

  # Flag each of these tokens through the table's is_stop_word column.
  index_table = Groonga['pages_content_index']
  stop_words = %w[の は 。 、 , .]
  stop_words.each { |token| index_table.add(token, is_stop_word: true) }
end
# Downloads the series landing page, every page linked from its
# '.mod-more a' anchors, and every page linked from those pages'
# '.mod-entry .mod-cropimg a' anchors, storing them under CONTENTS_DIR.
def download_contents
  CONTENTS_DIR.mkpath
  index_path = download(SERIAL_LP_URI)
  linked_hrefs(index_path, '.mod-more a').each do |series_href|
    series_path = download(series_href)
    linked_hrefs(series_path, '.mod-entry .mod-cropimg a').each do |entry_href|
      download(entry_href)
    end
  end
end

# Returns the href values of the anchors matching +selector+ in the HTML
# file at +path+ (a Pathname). The file is opened in block form so the
# handle is closed once parsing is done; anchors without an href are dropped.
def linked_hrefs(path, selector)
  doc = path.open { |file| Nokogiri.HTML(file) }
  doc.css(selector).map { |anchor| anchor['href'] }.compact
end
# Loads every downloaded page under CONTENTS_DIR, except files named
# index.html, into the 'pages' table. The record key is the page URI rebuilt
# from the file's path relative to CONTENTS_DIR; title and content come from
# the <title> element and the MAIN_AREA_SELECTOR element.
# Pages lacking either element are reported on stderr and skipped.
def load_contents
  pages = Groonga['pages']
  Pathname.glob("#{CONTENTS_DIR}/**/*").each do |path|
    next unless path.file?
    next if path.basename.to_path == 'index.html'

    # Block form closes the file handle as soon as parsing has finished.
    doc = path.open { |file| Nokogiri.HTML(file) }
    relative_path = path.relative_path_from(CONTENTS_DIR)
    title = doc.at_css('title')
    main_area = doc.at_css(MAIN_AREA_SELECTOR)
    unless title && main_area
      $stderr.puts "#{relative_path} has no title or main area. Skip loading"
      next
    end

    uri = ORIGIN_URI + "/#{relative_path}"
    pages.add uri.to_s, title: title.content, content: main_area.content
  end
end
# Prints, for every page in the 'pages' table, a header with its title and
# URI followed by the pages whose content is most similar to it. The top six
# records by _score are taken and the page itself is left out of the listing.
def list_similar_contents
  pages_table = Groonga['pages']
  separator = "==================================="

  # Copy key/title/content out of the table before running the per-page
  # searches (presumably to avoid searching while iterating — confirm).
  snapshot = {}
  pages_table.select.each do |page|
    snapshot[page['_key']] = { title: page.title, content: page.content }
  end

  snapshot.each do |page_uri, attributes|
    puts separator
    puts attributes[:title]
    puts page_uri
    puts separator

    matches = pages_table.select { |record|
      record['content'].similar_search(attributes[:content])
    }
    ranked = matches.sort([{ key: '_score', order: 'descending' }], limit: 6)
    ranked.each do |hit|
      next if hit['_key'] == page_uri
      puts format(" * %s(%s) - %s", hit['title'], hit['_key'], hit['_score'])
    end
  end
end
# Downloads +uri+ into CONTENTS_DIR, mirroring the URI path on disk, unless
# the file is already there.
#
# @param uri [String, URI] absolute URI, or a reference relative to ORIGIN_URI
# @return [Pathname] location of the (possibly already existing) local file
# @raise [ArgumentError] if the URI path would resolve outside CONTENTS_DIR
def download(uri)
  # Resolve against the site root so relative hrefs become absolute URIs
  # that open-uri can read; absolute URIs are returned unchanged by the merge.
  uri = ORIGIN_URI + uri
  relative = uri.path.sub(%r{\A/}, '')
  # An empty path or a trailing slash names a directory: store its index page.
  relative += 'index.html' if relative.empty? || relative.end_with?('/')
  path = CONTENTS_DIR.join(relative)
  # Reject paths whose ".." segments climb out of the download directory.
  if path.relative_path_from(CONTENTS_DIR).each_filename.first == '..'
    raise ArgumentError, "Refusing to save outside #{CONTENTS_DIR}: #{uri}"
  end

  unless path.dirname.directory?
    $stderr.puts "Making directory: #{path.dirname}..."
    path.dirname.mkpath
  end

  if path.file?
    $stderr.puts "#{path.relative_path_from(CONTENTS_DIR)} exists. Skip downloading"
  else
    $stderr.puts "Downloading: #{uri} -> #{path}"
    # Binary write keeps the fetched bytes untouched on every platform.
    path.binwrite uri.read
    # Pause between requests to go easy on the remote server.
    sleep 1
  end
  path
end
# Run the pipeline only when this file is executed directly, not when required.
main if $PROGRAM_NAME == __FILE__
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment