Created
May 19, 2013 07:59
-
-
Save nebuta/5607014 to your computer and use it in GitHub Desktop.
Tweets classification test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-

# Connection settings shared by the whole script. They are kept as globals
# because train, predict and the top-level driver all read them directly.
$host = "localhost"     # passed as the first argument to the classifier client constructor
$port = 9199            # passed as the second argument to the classifier client constructor
$name = "twitter_user"  # first argument of every client.train / client.classify call
require 'json' | |
require 'find' | |
require 'jubatus/classifier/client' | |
require 'jubatus/classifier/types' | |
# Trains the classifier on every tweet sample found under the labelled
# sample directories.
#
# Each directory name doubles as the training label. All ".txt" files below a
# directory are read and split into individual samples on blank-line
# separators; every sample becomes one datum with a single "tweet" string
# feature.
#
# client - object responding to train(name, data); called exactly once.
# shown  - when truthy, progress and the value returned by client.train are
#          printed to stdout.
#
# Returns nil.
def train(client, shown)
  labels = ["tweet_kenichiromogi", "tweet_t_ishin"]
  train_data = []

  labels.each do |label|
    puts "Samples: " + label if shown
    Find.find(label) do |path|
      next if File.directory?(path)
      next if File.extname(path) != ".txt"

      print File.basename(path, ".txt") + " " if shown
      # readlines keeps each line's "\n", so joining with "\n" doubles every
      # newline; a blank line in the file therefore shows up as a run of
      # newlines that the three-newline separator splits on.
      samples = IO.readlines(path).join("\n").split("\n\n\n")
      samples.each do |sample|
        train_data << [label, Jubatus::Classifier::Datum.new([["tweet", sample]], [])]
      end
    end
    puts if shown
  end

  # Training data must be shuffled for online learning. The previous
  # `sort_by { rand }` discarded its result, leaving the samples grouped by
  # label; shuffle in place instead.
  train_data.shuffle!

  trained = client.train($name, train_data)
  puts(trained.to_s + " samples.") if shown
end
# Classifies every tweet found under "tweet_unknown" and prints the results.
#
# Each ".txt" file is split into tweets on blank-line separators; empty
# entries are discarded. Every tweet becomes one datum carrying the tweet
# text and the source file's base name as string features.
#
# The label with the highest score is picked for each tweet and its first
# underscore-separated segment is stripped (e.g. "tweet_t_ishin" becomes
# "t_ishin"). Each stripped label is printed as it is computed, followed by a
# listing of tweet text and label ordered by label.
#
# client - object responding to classify(name, data). It is assumed to return
#          one list of [label, score] pairs per datum, in input order; the
#          original code relied on the same shape for single-item calls.
#
# Returns the array of [tweet_text, label] pairs sorted by label.
def predict(client)
  data = []
  Find.find("tweet_unknown") do |path|
    next if File.directory?(path)
    next if File.extname(path) != ".txt"

    source = File.basename(path, ".txt")
    tweets = IO.readlines(path).join("\n").split("\n\n\n").map(&:chomp).reject(&:empty?)
    tweets.each do |tweet|
      data << Jubatus::Classifier::Datum.new([["tweet", tweet], ["filename", source]], [])
    end
  end

  # Classify everything in a single request. The batch result used to be
  # thrown away and each datum classified again one at a time.
  estimates = client.classify($name, data)

  result = data.zip(estimates).map do |datum, scores|
    best = scores.max_by { |score| score[1] }
    # Drop the first "_"-separated segment of the label; drop(1) also copes
    # with labels that contain no segments at all.
    author = best[0].split('_').drop(1).join('_')
    puts author
    [datum.string_values[0][1], author]
  end

  result.sort_by { |entry| entry[1] }.each do |text, author|
    puts text
    puts "--->" + author
    puts
  end
end
# Connect to the Jubatus classifier server configured by $host and $port.
client = Jubatus::Classifier::Client::Classifier.new($host, $port)

# Run the example: feed the training data twice (the first pass prints
# progress, the second is silent), then classify the unknown tweets.
train(client, true)
train(client, false)
# NOTE(review): presumably gives the server a moment to settle before
# classification is requested; confirm whether the pause is still needed.
sleep(1)
predict(client)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment