Skip to content

Instantly share code, notes, and snippets.

@abicky
Created May 31, 2015 01:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save abicky/c39632a2cb0188459de2 to your computer and use it in GitHub Desktop.
Save abicky/c39632a2cb0188459de2 to your computer and use it in GitHub Desktop.
#!/usr/bin/env ruby
require 'optparse'
require 'crfsuite'
# Parse the two long options this script understands (--command and --model,
# each taking a value) into a Hash keyed by option name without the dashes.
params = ARGV.getopts('', *%w[command: model:])
# Attribute templates used by build_sequence. Each template is a list of
# [offset, column] pairs: offset is relative to the current token position and
# column indexes into that token's whitespace-separated fields. A template
# with several pairs yields one combined (n-gram) attribute.
FEATURE_TEMPLATES = [
  # Column 0: unigrams over a +/-2 window, then bigrams touching the current token.
  *(-2..2).map { |shift| [[shift, 0]] },
  *(-1..1).each_cons(2).map { |window| window.map { |shift| [shift, 0] } },
  # Column 1: unigrams, bigrams and trigrams over a +/-2 window.
  *(-2..2).map { |shift| [[shift, 1]] },
  *(-2..2).each_cons(2).map { |window| window.map { |shift| [shift, 1] } },
  *(-2..2).each_cons(3).map { |window| window.map { |shift| [shift, 1] } },
].freeze
# Crfsuite::Trainer subclass that overrides the +message+ hook so that text
# handed to it is echoed to standard output.
# NOTE(review): presumably the library calls +message+ with training progress
# text - confirm against the CRFsuite API.
class Trainer < Crfsuite::Trainer
  # Writes +s+ to standard output exactly as given (no newline is appended).
  def message(s)
    $stdout.print(s)
  end
end
# Trains a CRF model from labelled sequences read from standard input and
# writes it to +model_file+.
#
# Input format: one token per line as whitespace-separated fields, the last
# field being the label and the preceding ones the token's feature columns.
# A blank (or whitespace-only) line terminates a sequence. The final sequence
# is trained on even when the input does not end with a blank line, and runs
# of blank lines do not produce empty sequences.
#
# model_file - path handed to Trainer#train as the output model file.
#
# Returns whatever Trainer#train returns.
def learn(model_file)
  trainer = Trainer.new
  yseq = Crfsuite::StringList.new
  items = []

  # Hands the buffered sequence (if any) to the trainer and starts fresh
  # buffers for the next one.
  flush = lambda do
    next if items.empty?

    trainer.append(build_sequence(items), yseq, 0)
    items = []
    yseq = Crfsuite::StringList.new
  end

  $stdin.each_line do |line|
    # split(' ') ignores leading/trailing whitespace and the newline, so a
    # blank or whitespace-only line yields no fields.
    fields = line.split(' ')
    if fields.empty?
      flush.call
    else
      yseq << fields.pop
      items << fields
    end
  end
  # Input without a trailing blank line still has one sequence buffered.
  flush.call

  trainer.select('l2sgd', 'crf1d')
  trainer.set('c2', '0.1')
  # Per the CRFsuite API, a holdout of -1 uses every appended instance for training.
  trainer.train(model_file, -1)
end
# Runs Viterbi decoding with the model in +model_file+ over every sequence
# read from standard input.
#
# Input format: one token per line as whitespace-separated fields; the last
# field (the label column) is dropped and the rest are used as feature
# columns. A blank (or whitespace-only) line terminates a sequence. The final
# sequence is decoded even when the input does not end with a blank line.
#
# NOTE(review): the result of Tagger#viterbi is discarded, so nothing is
# printed - confirm whether that is intentional (e.g. timing runs).
#
# model_file - path of the model opened with Crfsuite::Tagger#open.
#
# Returns nil.
def tag(model_file)
  tagger = Crfsuite::Tagger.new
  tagger.open(model_file)
  items = []

  # Decodes the buffered sequence (if any) and starts a fresh buffer.
  decode = lambda do
    next if items.empty?

    tagger.set(build_sequence(items))
    tagger.viterbi
    items = []
  end

  $stdin.each_line do |line|
    # split(' ') ignores surrounding whitespace and the newline, so a blank
    # or whitespace-only line yields no fields.
    fields = line.split(' ')
    if fields.empty?
      decode.call
    else
      items << fields[0..-2]
    end
  end
  # Input without a trailing blank line still has one sequence buffered.
  decode.call
  nil
end
# Converts the tokens of one sequence into a Crfsuite::ItemSequence.
#
# items - Array of tokens, each an Array of field values (columns).
#
# For every token position, each entry of FEATURE_TEMPLATES is expanded into
# one attribute of the form "x[o1,c1]|x[o2,c2]=v1|v2". A template is skipped
# when any position it refers to lies outside the sequence or the referenced
# column is nil. The first token additionally gets "__BOS__" and the last one
# "__EOS__".
#
# Returns the populated Crfsuite::ItemSequence.
def build_sequence(items)
  sequence = Crfsuite::ItemSequence.new
  final_position = items.size - 1

  items.each_index do |position|
    item = Crfsuite::Item.new

    FEATURE_TEMPLATES.each do |template|
      # Look up every cell the template refers to; out-of-range rows give nil.
      cells = template.map do |shift, column|
        row = position + shift
        items[row][column] if row.between?(0, final_position)
      end
      next if cells.any?(&:nil?)

      label = template.map { |field| 'x[%d,%d]' % field }.join('|')
      item << Crfsuite::Attribute.new("#{label}=#{cells.join('|')}")
    end

    item << Crfsuite::Attribute.new('__BOS__') if position.zero?
    item << Crfsuite::Attribute.new('__EOS__') if position == final_position
    sequence << item
  end

  sequence
end
# Entry point: run the sub-command selected with --command against the model
# file given with --model.
command = params['command']
model_file = params['model']

raise ArgumentError, 'Unknown command' unless %w[learn tag].include?(command)
# Fail fast: without this check a missing --model only surfaces after all of
# standard input has been consumed, as an error from the CRFsuite binding.
raise ArgumentError, 'Missing --model option' if model_file.nil? || model_file.empty?

case command
when 'learn' then learn(model_file)
when 'tag' then tag(model_file)
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment