Last active
August 29, 2015 14:01
-
-
Save denvazh/1ca7fa179b07bc0ba85d to your computer and use it in GitHub Desktop.
A small example that converts a file of kanji strings into a CSV file with the corresponding readings in katakana, hiragana, and romaji.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# A sample Gemfile listing the third-party gems the conversion script requires.
source "https://rubygems.org"

# Required by the script as 'natto'; supplies Natto::MeCab.
gem "natto"
# Required by the script as 'romaji' and 'romaji/core_ext/string'.
gem "romaji"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
require 'nkf' | |
require 'optparse' | |
require 'csv' | |
require 'bundler/setup' | |
require 'natto' | |
require 'romaji' | |
require 'romaji/core_ext/string' | |
# Global Natto::MeCab instance shared by the whole script; to_katakana calls
# its #parse method to split text into nodes.
$nm = Natto::MeCab.new
# Takes a kanji string and converts it to its equivalent in katakana.
#
# The text is split into nodes by the global Natto::MeCab instance ($nm).
# For a node whose char_type is 2 (presumably the kanji category of the
# dictionary in use -- confirm against its char.def) the reading is taken
# from the second-to-last comma-separated field of the node's feature string.
# Every other node contributes its surface text unchanged.
#
# When a char_type 2 node has no usable reading (the field is missing, empty,
# or the "*" placeholder), the surface text is kept so that the output never
# contains a stray "*" or silently drops the word.
#
# @param s [String] text to convert
# @return [String] the readings and surfaces of all nodes, concatenated
def to_katakana(s)
  pieces = []
  $nm.parse(s) do |node|
    # `reading` stays nil for nodes that are not char_type 2.
    reading = node.feature.split(',')[-2] if node.char_type == 2
    usable = reading && !reading.empty? && reading != '*'
    pieces << (usable ? reading : node.surface)
  end
  pieces.join
end
# Takes a string in katakana and converts it to hiragana.
#
# NKF options used:
#   -h1  convert katakana to hiragana
#   -w   produce UTF-8 output
#   -W   treat the input as UTF-8 (added only when the Ruby string is tagged
#        as UTF-8 and is valid, so NKF does not have to guess the encoding;
#        any other input still goes through NKF's auto-detection)
#
# @param s [String] text containing katakana
# @return [String] the UTF-8 text with katakana replaced by hiragana
def to_hiragana(s)
  nkf_opts = '-h1 -w'
  nkf_opts += ' -W' if s.is_a?(String) && s.encoding == Encoding::UTF_8 && s.valid_encoding?
  NKF.nkf(nkf_opts, s)
end
# Command-line entry point.
#
# Parses the options entered by the user and converts the file given with
# -f/--file into a CSV file written next to it (same directory, same base
# name, ".csv" extension). Each input line becomes one row:
#
#   [original line, katakana reading, hiragana reading, romaji reading]
#
# Exit status: 0 after printing help/usage or after a successful conversion,
# 1 when the options are invalid or the input file cannot be used.

# Builds the output path for +srcfile+: its last extension (if any) is
# replaced by ".csv".
def csv_path_for(srcfile)
  stem = File.basename(srcfile, File.extname(srcfile))
  File.join(File.dirname(srcfile), "#{stem}.csv")
end

# Writes one CSV row per line of +srcfile+ into +csvfile+.
def convert_file(srcfile, csvfile)
  CSV.open(csvfile, "wb") do |csv|
    # File.foreach closes the input file once iteration finishes.
    File.foreach(srcfile) do |line|
      # Non-bang chomp: chomp! returns nil when the line has no trailing
      # newline (typically the last line of the file).
      kanji_str    = line.chomp
      katakana_str = to_katakana(kanji_str)
      hiragana_str = to_hiragana(katakana_str)
      romaji_str   = katakana_str.romaji
      csv << [kanji_str, katakana_str, hiragana_str, romaji_str]
    end
  end
end

options = {}
opt_parser = OptionParser.new do |opts|
  opts.banner = "Usage: kanji2csv.rb [options]"

  opts.on("-f", "--file FILENAME", String, "File with new line separated list of kanji") do |f|
    options[:file] = f
  end

  opts.on("-h", "--help", "Show help") do |h|
    options[:help] = h
    puts opts
    exit
  end
end

begin
  opt_parser.parse!
rescue OptionParser::ParseError => e
  # ParseError is the common superclass of InvalidOption, MissingArgument and
  # the other option-parsing failures.
  puts e.message
  puts opt_parser
  exit 1
end

srcfile = options[:file]

# No input file given: show the usage text.
unless srcfile
  puts opt_parser
  exit
end

# File.file? is false for missing paths as well as for directories.
unless File.file?(srcfile)
  puts "File #{srcfile} not found!"
  exit 1
end

csvfile = csv_path_for(srcfile)

# CSV.open with "wb" truncates its target, so refuse to run when the output
# path is the input file itself (e.g. the input already ends in ".csv").
if File.expand_path(csvfile) == File.expand_path(srcfile)
  puts "File #{srcfile} would be overwritten by its own output!"
  exit 1
end

convert_file(srcfile, csvfile)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Requirements
On OS X it can be installed with homebrew
Installing
Usage
Suppose we have a file with the following text in kanji (with filename kanji.txt):
We can use the script as follows:
This will produce the following output