Skip to content

Instantly share code, notes, and snippets.

@KitaitiMakoto
Last active September 24, 2015 15:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save KitaitiMakoto/37d162b4f030d6c73e11 to your computer and use it in GitHub Desktop.
Save KitaitiMakoto/37d162b4f030d6c73e11 to your computer and use it in GitHub Desktop.
# coding: utf-8
require 'pp'
require 'logger'
require 'pathname'
require 'open-uri'
require 'nokogiri'
require 'groonga'
# Absolute directory of this script; every local path below hangs off it.
current_dir = Pathname(__dir__).expand_path

# Root of the site being crawled and the listing page the crawl starts from.
ORIGIN_URI = URI('http://ji-sedai.jp')
SERIAL_LP_URI = ORIGIN_URI.merge('/series/')

# Location of the Groonga database file (<script dir>/database/similar.db).
DATABASE_PATH = current_dir.join('database', 'similar.db')

# Database settings hash; not referenced by the other code visible in this script.
DATABASE_CONFIGURATION = { 'database' => DATABASE_PATH, 'encoding' => 'utf8' }

# Directory the downloaded pages are stored under (<script dir>/contents).
CONTENTS_DIR = current_dir.join('contents')

# CSS selector of the element whose text is indexed as a page's content.
MAIN_AREA_SELECTOR = '.mod-wysiwyg'
# Entry point. Runs the pipeline steps in order: set up the database,
# download the pages, load them into the database, then print the
# similar pages for each loaded page.
def main
  setup_database
  download_contents
  load_contents
  list_similar_contents
end
# Prepares the Groonga database stored at DATABASE_PATH.
#
# The database file is opened when it already exists and created otherwise,
# so the script can be run more than once (#download likewise skips files
# that are already on disk). Afterwards the stop-word token filter plugin is
# registered, the 'pages' table and its full-text index are defined, and a
# handful of particles / punctuation marks are flagged as stop words.
#
# NOTE(review): on a re-run this relies on register_plugin,
# Groonga::Schema.create_table and Table#add tolerating already-existing,
# identical definitions/keys -- confirm against the rroonga version in use.
def setup_database
  DATABASE_PATH.dirname.mkpath
  if DATABASE_PATH.exist?
    Groonga::Database.open DATABASE_PATH.to_path
  else
    Groonga::Database.create path: DATABASE_PATH.to_path
  end
  Groonga::Context.default.register_plugin 'token_filters/stop_word'

  # Pages are keyed by their URL (see load_contents).
  Groonga::Schema.create_table 'pages', type: :hash do |table|
    table.short_text 'title'
    table.text 'content'
  end

  # Lexicon for pages.content; each term also carries an is_stop_word flag
  # that TokenFilterStopWord consults.
  Groonga::Schema.create_table 'pages_content_index',
                               type: :patricia_trie,
                               normalizer: :NormalizerAuto,
                               default_tokenizer: :TokenMecab,
                               token_filters: ['TokenFilterStopWord'] do |table|
    table.index 'pages.content', with_position: true
    table.boolean 'is_stop_word', type: :scalar
  end

  lexicon = Groonga['pages_content_index']
  %w[の は 。 、 , .].each do |word|
    lexicon.add(word, is_stop_word: true)
  end
end
# Downloads the listing page at SERIAL_LP_URI, every page linked from it via
# '.mod-more a', and every page those link to via
# '.mod-entry .mod-cropimg a'. All files are stored under CONTENTS_DIR by
# #download.
#
# Each downloaded file is parsed inside a block-form open so that its file
# handle is closed as soon as Nokogiri has read it.
def download_contents
  CONTENTS_DIR.mkpath
  index_path = download(SERIAL_LP_URI)
  index_doc = index_path.open { |io| Nokogiri.HTML(io) }
  index_doc.css('.mod-more a').each do |listing_link|
    listing_path = download(listing_link['href'])
    listing_doc = listing_path.open { |io| Nokogiri.HTML(io) }
    # Distinct block parameter name: the original shadowed the outer link.
    listing_doc.css('.mod-entry .mod-cropimg a').each do |page_link|
      download page_link['href']
    end
  end
end
# Loads every downloaded page under CONTENTS_DIR into the 'pages' table.
#
# Directories and files named 'index.html' are skipped. For each remaining
# file the record key is the page URL rebuilt from ORIGIN_URI and the file's
# path relative to CONTENTS_DIR; the title comes from the <title> element
# and the content from the first element matching MAIN_AREA_SELECTOR.
#
# Files lacking either element are reported on stderr and skipped instead of
# aborting the whole load with a NoMethodError on nil.
def load_contents
  pages = Groonga['pages']
  Pathname.glob("#{CONTENTS_DIR}/**/*").each do |path|
    next unless path.file?
    next if path.basename.to_path == 'index.html'

    relative_path = path.relative_path_from(CONTENTS_DIR)
    # Block form closes the file handle once parsing is done.
    doc = path.open { |io| Nokogiri.HTML(io) }
    title_node = doc.at_css('title')
    main_area = doc.at_css(MAIN_AREA_SELECTOR)
    unless title_node && main_area
      $stderr.puts "#{relative_path} has no title or main area. Skip loading"
      next
    end

    uri = ORIGIN_URI + "/#{relative_path}"
    pages.add uri.to_s, title: title_node.content, content: main_area.content
  end
end
# Prints, for every record in the 'pages' table, a header with its title and
# key followed by the records whose content is most similar to it.
#
# For each page a similar_search on 'content' is run, the hits are sorted by
# '_score' descending and limited to 6, and the page itself is left out of
# the printed list.
def list_similar_contents
  separator = "==================================="
  table = Groonga['pages']

  # Copy key/title/content out of the table before running the searches.
  pages = {}
  table.select().each do |record|
    pages[record['_key']] = { title: record.title, content: record.content }
  end

  sort_keys = [{ key: '_score', order: 'descending' }]
  pages.each do |page_uri, attributes|
    puts separator
    puts attributes[:title]
    puts page_uri
    puts separator

    hits = table.select { |candidate| candidate['content'].similar_search(attributes[:content]) }
    hits.sort(sort_keys, limit: 6).each do |hit|
      next if hit['_key'] == page_uri
      puts format(" * %s(%s) - %s", hit['title'], hit['_key'], hit['_score'])
    end
  end
end
# Downloads +uri+ into CONTENTS_DIR unless the target file already exists.
#
# The local path mirrors the URI path; a path that is empty or ends with '/'
# is stored as 'index.html' inside the corresponding directory. Relative
# references (e.g. an href of '/series/foo/') are resolved against
# ORIGIN_URI, since only absolute http/https/ftp URIs can be read through
# open-uri.
#
# @param uri [String, URI] absolute URI, or a reference relative to ORIGIN_URI
# @return [Pathname] path of the local copy (freshly downloaded or pre-existing)
def download(uri)
  # URI#+ returns the argument unchanged when it is already absolute.
  uri = ORIGIN_URI + URI(uri)

  relative = uri.path.sub(%r{\A/}, '')
  # An empty path (bare host) used to become nil and break Pathname#join.
  relative += 'index.html' if relative.empty? || relative.end_with?('/')
  path = CONTENTS_DIR.join(relative)

  unless path.dirname.directory?
    $stderr.puts "Making directory: #{path.dirname}..."
    path.dirname.mkpath
  end

  if path.file?
    $stderr.puts "#{path.relative_path_from(CONTENTS_DIR)} exists. Skip downloading"
  else
    $stderr.puts "Downloading: #{uri} -> #{path}"
    # Binary write keeps the payload byte-for-byte on every platform.
    path.binwrite uri.read
    # Pause between requests so the remote site is not hit in a tight loop.
    sleep 1
  end
  path
end
# Run the pipeline only when this file is the program being executed.
main if $PROGRAM_NAME == __FILE__
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment