Skip to content

Instantly share code, notes, and snippets.

@chezou
Created December 4, 2014 15:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save chezou/1f947423c6655c266e0a to your computer and use it in GitHub Desktop.
Save chezou/1f947423c6655c266e0a to your computer and use it in GitHub Desktop.
Word count benchmark using MeCab
# Ruby using -O wakati
require "mecab"
require "benchmark"
# Tally how often each token occurs in +text+.
#
# MeCab is run in "-O wakati" mode and its output string is split on
# whitespace; every resulting piece is counted as one word.
# Returns a Hash (default value 0) mapping token => occurrence count.
def count_word(text)
  tagger = MeCab::Tagger.new("-O wakati")
  tokens = tagger.parse(text).split
  tokens.each_with_object(Hash.new(0)) do |token, tally|
    tally[token] += 1
  end
end
# Benchmark entry point.
#
# Takes the path of the text file as the first command-line argument and
# reports (via Benchmark.bmbm) the time needed to read the file and count its
# words ten times in a row.
def main
  # Only one argument (the file name) is consumed below, so only one is
  # required. The previous `ARGV.size < 2` guard rejected the normal
  # single-argument invocation.
  if ARGV.empty?
    puts "File name required"
    return
  end
  fname = ARGV.shift
  Benchmark.bmbm do |x|
    x.report {
      10.times {
        # The file read is deliberately kept inside the measured section,
        # as in the original benchmark.
        count_word(File.read(fname))
      }
    }
  end
end
main
# Ruby using MeCab node
require "mecab"
require "benchmark"
# Tally how often each surface form occurs in +text+ by walking the node list
# returned by MeCab::Tagger#parseToNode.
#
# Nodes whose surface is an empty string are skipped.
# Returns a Hash (default value 0) mapping surface => occurrence count.
def count_word(text)
  # Keep the tagger in a local for the whole walk, exactly as the original did.
  tagger = MeCab::Tagger.new
  tally = Hash.new(0)
  node = tagger.parseToNode(text)
  while node
    tally[node.surface] += 1 unless node.surface.empty?
    node = node.next
  end
  tally
end
# Benchmark entry point for the node-based counter.
#
# Takes the path of the text file as the first command-line argument and
# reports (via Benchmark.bmbm) the time needed to read the file and count its
# words ten times in a row.
def main
  # Only one argument (the file name) is consumed below, so only one is
  # required. The previous `ARGV.size < 2` guard rejected the normal
  # single-argument invocation.
  if ARGV.empty?
    puts "File name required"
    return
  end
  fname = ARGV.shift
  Benchmark.bmbm do |x|
    x.report {
      10.times {
        # The file read is deliberately kept inside the measured section,
        # as in the original benchmark.
        count_word(File.read(fname))
      }
    }
  end
end
main
# Julia using -O wakati
using Benchmark
using MeCab
# Count how many times each token occurs in `text`.
#
# A Mecab instance is created with the "-O wakati" option, `text` is passed
# through `sparse_tostr`, and the resulting string is split on whitespace.
# Returns a Dict{UTF8String, Int} mapping token => occurrence count.
function count_word(text::UTF8String)
    tagger = Mecab("-O wakati")
    segmented = sparse_tostr(tagger, text)
    tally = Dict{UTF8String, Int}()
    for token in split(segmented)
        # get(..., 0) supplies the starting count for tokens not seen yet.
        tally[token] = get(tally, token, 0) + 1
    end
    return tally
end
# Benchmark entry point.
#
# Reads the file named by the first command-line argument and prints the
# result of running `count_word` on its contents 10 times through
# `benchmark` under the label "WordCount1".
function main()
    if length(ARGS) < 1
        println("File name required")
        return
    end
    # open(readall, path) closes the file as soon as its contents are read;
    # the previous bare open() left the handle open for the rest of the run.
    # The typed declaration keeps the original conversion to UTF8String.
    text::UTF8String = open(readall, ARGS[1])
    f1() = count_word(text)
    println(benchmark(f1, "WordCount1", 10))
end
main()
# RMeCabFreq
library(RMeCab)
library(microbenchmark)
# Path of the text file to analyse: the first trailing command-line argument.
fname <- commandArgs(trailingOnly=TRUE)[1]
# Indexing an empty argument vector yields NA; fail with a clear message
# instead of passing NA on to RMeCabFreq.
if (is.na(fname)) {
  stop("File name required")
}
# Time RMeCabFreq on the file: 10 evaluations, reported in seconds.
microbenchmark(
result <- RMeCabFreq(fname),
unit="s",
times=10L
)
# Julia using MeCab node
using Benchmark
using MeCab
# Count how many times each surface form occurs in `text`.
#
# `text` is split on '\n' and every line is passed to `parse_surface2`
# with a default-constructed Mecab instance; each item it yields is counted.
# Returns a Dict{UTF8String, Int} mapping surface => occurrence count.
function count_word(text::UTF8String)
    tagger = Mecab()
    tally = Dict{UTF8String, Int}()
    for raw in split(text, '\n')
        # The typed local converts each piece produced by split to UTF8String
        # before it is handed to parse_surface2.
        segment::UTF8String = raw
        for surface in parse_surface2(tagger, segment)
            tally[surface] = get(tally, surface, 0) + 1
        end
    end
    return tally
end
# Benchmark entry point for the node-based counter.
#
# Reads the file named by the first command-line argument and prints the
# result of running `count_word` on its contents 10 times through
# `benchmark` under the label "WordCount2".
function main()
    if length(ARGS) < 1
        println("Require file name")
        return
    end
    # open(readall, path) closes the file as soon as its contents are read;
    # the previous bare open() left the handle open for the rest of the run.
    # The typed declaration keeps the original conversion to UTF8String.
    text::UTF8String = open(readall, ARGS[1])
    f1() = count_word(text)
    println(benchmark(f1, "WordCount2", 10))
end
main()
# Julia considering gc
using MeCab
# Build a frequency table of the tokens in `text`.
#
# The text is run through `sparse_tostr` on a Mecab instance created with
# "-O wakati"; the returned string is then split on whitespace and each
# piece is counted.
# Returns a Dict{UTF8String, Int} mapping token => occurrence count.
function count_word(text::UTF8String)
    freq = Dict{UTF8String, Int}()
    wakati = Mecab("-O wakati")
    for piece in split(sparse_tostr(wakati, text))
        seen = get(freq, piece, 0)
        freq[piece] = seen + 1
    end
    return freq
end
# Benchmark entry point that forces a garbage collection before every run.
#
# Reads the file named by the first command-line argument, times `count_word`
# on its contents over a fixed number of runs (calling gc() before each timed
# run) and prints the mean elapsed time per run in seconds.
function main()
    if length(ARGS) < 1
        println("File name required")
        return
    end
    # open(readall, path) closes the file as soon as its contents are read;
    # the previous bare open() left the handle open for the rest of the run.
    # The typed declaration keeps the original conversion to UTF8String.
    text::UTF8String = open(readall, ARGS[1])
    runs = 10
    elapsed = 0.0
    # Iterate the range directly. `[1:10]` built a temporary vector, and on
    # later Julia versions it is a one-element vector holding the range, so
    # the loop would run once while the total is still divided by 10.
    for i in 1:runs
        gc()
        tic()
        count_word(text)
        elapsed += toq()
    end
    println(elapsed / runs)
end
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment