Last active
August 29, 2015 14:20
-
-
Save zosiu/e6622c0c5290dd66b875 to your computer and use it in GitHub Desktop.
Analyzed random Japanese sentence from tatoeba.org
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Fetches a random Japanese sentence page from tatoeba.org, extracts the
# Japanese text, its furigana-annotated form and the English translation,
# then tokenizes the Japanese text with Kuromoji.
# The `# =>` comments record the output of one sample run.
require 'mechanize'
require 'kuromoji'

dom = Mechanize.new.get("http://tatoeba.org/eng/sentences/show/jpn")

# Parsed document behind the fetched page; looked up once and reused below.
doc = dom.parser

# Returns the text of the first <div> whose lang attribute equals `lang`.
# Raises a descriptive error when the page has no such div, instead of
# failing later with NoMethodError on nil.
div_text = lambda do |lang|
  node = doc.at("//div[@lang = '#{lang}']")
  raise "no <div lang=\"#{lang}\"> found on the fetched page" unless node

  node.text
end

jp_sentence = div_text.call('ja')
# => "万一に備えて傘を持っていった方がいいだろうな。"

# The 'ja-Latn' div carries each token followed by its reading in brackets.
jp_sentence_w_furigana = div_text.call('ja-Latn')
# => "万一[まんいち] に[] 備え[そなえ] て[] 傘[かさ] を[] 持っ[もっ] て[] いっ[] た[] 方[ほう] が[] いい[] だろ[] う[] な[] 。[]"

en_sentence = div_text.call('en')
# => "Perhaps I should take an umbrella with me just in case."

# NOTE(review): the sample output below lists "て" only once although it
# occurs twice in the sentence -- presumably the result is keyed by surface
# form, so repeated tokens collapse; confirm against the kuromoji gem.
Kuromoji.tokenize jp_sentence
# =>
# "万一": "副詞,助詞類接続,*,*,*,*,万一,マンイチ,マンイチ"
# "に": "助詞,副詞化,*,*,*,*,に,ニ,ニ"
# "備え": "動詞,自立,*,*,一段,連用形,備える,ソナエ,ソナエ"
# "て": "助詞,接続助詞,*,*,*,*,て,テ,テ"
# "傘": "名詞,一般,*,*,*,*,傘,カサ,カサ"
# "を": "助詞,格助詞,一般,*,*,*,を,ヲ,ヲ"
# "持っ": "動詞,自立,*,*,五段・タ行,連用タ接続,持つ,モッ,モッ"
# "いっ": "動詞,非自立,*,*,五段・カ行促音便,連用タ接続,いく,イッ,イッ"
# "た": "助動詞,*,*,*,特殊・タ,基本形,た,タ,タ"
# "方": "名詞,非自立,一般,*,*,*,方,ホウ,ホー"
# "が": "助詞,格助詞,一般,*,*,*,が,ガ,ガ"
# "いい": "形容詞,自立,*,*,形容詞・イイ,基本形,いい,イイ,イイ"
# "だろ": "助動詞,*,*,*,特殊・ダ,未然形,だ,ダロ,ダロ"
# "う": "助動詞,*,*,*,不変化型,基本形,う,ウ,ウ"
# "な": "助詞,終助詞,*,*,*,*,な,ナ,ナ"
# "。": "記号,句点,*,*,*,*,。,。,。"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment