Skip to content

Instantly share code, notes, and snippets.

@arika
Created July 28, 2012 05:19
Show Gist options
  • Save arika/3191903 to your computer and use it in GitHub Desktop.
Save arika/3191903 to your computer and use it in GitHub Desktop.
Jekyll/Octopressの類似文書検索をGroongaで行う
# encoding: utf-8
#
# Groongaの類似文書検索機能を使って類似記事を抽出する。
require 'tmpdir'
require 'groonga'
module Jekyll
class Site
  alias process_without_groonga process

  # Runs the original +process+ with a throwaway Groonga database available.
  #
  # A brand-new database is created inside a temporary directory on every
  # call. Because of that, the "schema already built" flag kept on Post is
  # cleared here: if +process+ is invoked more than once in the same Ruby
  # process, a stale flag would make Post skip schema creation and then look
  # up tables that do not exist in the new database.
  #
  # @return the value returned by the original +process+
  def process
    Dir.mktmpdir do |tmpdir|
      Groonga::Context.default_options = {encoding: :utf8}
      database = Groonga::Database.create(path: "#{tmpdir}/similar_posts.db")
      Post.groonga = false
      begin
        process_without_groonga
      ensure
        # Close the database before mktmpdir removes the directory holding
        # its files, so no handle outlives the files it points at.
        database.close
      end
    end
  end
end
class Post
  class << self
    # Truthy once the Groonga schema has been created and every post of the
    # site has been added to the 'Content' table.
    attr_accessor :groonga
  end

  alias to_liquid_without_similar_posts to_liquid

  # Liquid payload of the post, extended with a "similar_posts" entry.
  #
  # @return the original payload deep-merged with "similar_posts"
  def to_liquid
    to_liquid_without_similar_posts.deep_merge({
      "similar_posts" => similar_posts,
    })
  end

  # Posts that Groonga's similar-document search ranks closest to this one.
  #
  # The first call on any post builds the index for the whole site. The
  # result is memoised per post. At most 10 hits are requested from Groonga
  # and this post itself is dropped from them.
  #
  # @return [Array] similar posts, best match first; empty when this post is
  #   not present in the index
  # @raise re-raises any StandardError after printing its message, class and
  #   the first five backtrace lines to $stderr
  def similar_posts
    return @sim_posts if @sim_posts

    build_similarity_index unless self.class.groonga
    content = Groonga['Content']

    indexed = content.select do |record|
      record.url == url
    end
    return @sim_posts = [] if indexed.empty?

    query = similarity_index_text(self)
    hits = content.select do |record|
      record.body.similar_search(query)
    end

    found = []
    hits.sort([{key: '_score', order: 'descending'}], limit: 10).each do |record|
      # Content records are keyed by the post's position in site.posts (see
      # build_similarity_index), so map back through that key. The record id
      # must not be used as the index: Groonga ids start at 1, which would
      # shift every hit to the following post.
      post = site.posts[Integer(record.key.key)]
      next if post == self
      found << post
    end
    @sim_posts = found
  rescue StandardError => e
    # Log some context, then let the failure propagate unchanged.
    $stderr.puts "#{e.message} (#{e.class})"
    $stderr.puts e.backtrace[0, 5]
    raise
  end

  private

  # Plain text used for indexing and querying a post: title, tags,
  # categories and content joined by spaces, with HTML comments, HTML tags
  # and Liquid {% ... %} tags replaced by spaces and whitespace collapsed.
  def similarity_index_text(post)
    [
      post.data['title'],
      post.tags,
      post.categories,
      post.content,
    ].flatten.map(&:to_s).join(' ').
      gsub(/<!--.*?-->/mo, ' ').
      gsub(/<[^>]+>/mo, ' ').
      gsub(/{%.+?%}/mo, ' ').
      gsub(/\s+/mo, ' ').strip
  end

  # Creates the 'Content' table and its full-text 'Index', then adds every
  # post of the site, keyed by its position in site.posts.
  def build_similarity_index
    # Set the flag first so the schema is only ever attempted once.
    self.class.groonga = true

    Groonga::Schema.create_table(
      'Content',
      type: :hash,
      default_tokenizer: 'TokenMecab')
    Groonga::Schema.change_table('Content') do |table|
      table.text('body')
      table.short_text('url')
    end

    Groonga::Schema.create_table(
      'Index',
      type: :patricia_trie,
      key_normalize: true,
      default_tokenizer: 'TokenMecab')
    Groonga::Schema.change_table('Index') do |table|
      table.index('Content.body')
    end

    content = Groonga['Content']
    site.posts.each_with_index do |post, position|
      content.add(
        position.to_s,
        body: similarity_index_text(post),
        url: post.url)
    end
  end
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment