Created
July 28, 2012 05:19
-
-
Save arika/3191903 to your computer and use it in GitHub Desktop.
Jekyll/Octopressの類似文書検索をGroongaで行う
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encoding: utf-8
#
# Groongaの類似文書検索機能を使って類似記事を抽出する。
require 'tmpdir' | |
require 'groonga' | |
module Jekyll | |
class Site
  alias process_without_groonga process

  # Runs the regular site build (the aliased original +process+) with a
  # throwaway Groonga database available for the duration of the build.
  #
  # The database file lives in a temporary directory that Dir.mktmpdir
  # removes when the block exits.
  #
  # @return whatever the original +process+ returns
  def process
    Dir.mktmpdir do |tmpdir|
      # Every build starts from a brand-new, empty database, so the
      # "tables already built" flag kept on Post must be cleared; otherwise
      # a second build in the same Ruby process would skip table creation
      # and indexing and query tables that do not exist.
      Post.groonga = nil

      Groonga::Context.default_options = {encoding: :utf8}
      database = Groonga::Database.create(path: "#{tmpdir}/similar_posts.db")
      begin
        process_without_groonga
      ensure
        # Release the database before its backing directory is deleted.
        database.close
      end
    end
  end
end
class Post
  class << self
    # Truthy once the Groonga tables have been created and filled from the
    # site's posts; checked by #similar_posts to do that work only once.
    attr_accessor :groonga
  end

  alias to_liquid_without_similar_posts to_liquid

  # Extends the original Liquid payload with a "similar_posts" entry.
  def to_liquid
    to_liquid_without_similar_posts.deep_merge({
      "similar_posts" => similar_posts,
    })
  end

  # Returns the posts Groonga's similar search ranks closest to this one.
  #
  # The result is memoized per post. The first call on any post builds the
  # shared index from +site.posts+. The 10 best-scoring hits are examined
  # and this post itself is skipped.
  #
  # @return [Array] similar posts, best score first (possibly empty)
  # @raise re-raises any StandardError after logging it to $stderr
  def similar_posts
    return @sim_posts if @sim_posts

    @sim_posts = []
    build_groonga_index unless self.class.groonga

    contents = Groonga['Content']
    own_url = url
    # A post that was never indexed has nothing to be compared against.
    return @sim_posts if contents.select { |record| record.url == own_url }.empty?

    query = groonga_index_text(self)
    hits = contents.select { |record| record.body.similar_search(query) }
    hits.sort([{key: '_score', order: 'descending'}], limit: 10).each do |record|
      # Records were stored under the post's array index as their key
      # (see build_groonga_index), so map back through the key. The
      # record id is Groonga's own 1-based identifier and would point
      # at the wrong post.
      post = site.posts[record.key.key.to_i]
      next if post.nil? || post == self
      @sim_posts << post
    end
    @sim_posts
  rescue StandardError => e
    # Log a short trace for diagnosis, then let the failure propagate.
    $stderr.puts "#{e.message} (#{e.class})"
    $stderr.puts e.backtrace[0, 5]
    raise
  end

  private

  # Builds the plain text indexed for +post+: title, tags, categories and
  # content joined by spaces, with HTML comments, HTML tags and Liquid
  # {% ... %} tags blanked out and whitespace runs collapsed.
  def groonga_index_text(post)
    [
      post.data['title'],
      post.tags,
      post.categories,
      post.content,
    ].flatten.map(&:to_s).join(' ').
      gsub(/<!--.*?-->/m, ' ').
      gsub(/<[^>]+>/m, ' ').
      gsub(/{%.+?%}/m, ' ').
      gsub(/\s+/m, ' ').strip
  end

  # Creates the Content table and its full-text Index table, then stores
  # one Content record per site post keyed by the post's position in
  # +site.posts+.
  def build_groonga_index
    # Set before the work starts, as the original code did.
    self.class.groonga = true

    Groonga::Schema.create_table(
      'Content',
      type: :hash,
      default_tokenizer: 'TokenMecab')
    Groonga::Schema.change_table('Content') do |table|
      table.text('body')
      table.short_text('url')
    end

    Groonga::Schema.create_table(
      'Index',
      type: :patricia_trie,
      key_normalize: true,
      default_tokenizer: 'TokenMecab')
    Groonga::Schema.change_table('Index') do |table|
      table.index('Content.body')
    end

    site.posts.each_with_index do |post, index|
      Groonga['Content'].add(
        index.to_s,
        body: groonga_index_text(post),
        url: post.url)
    end
  end
end
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment