Last active
January 27, 2018 18:03
-
-
Save searls/47f67fd033f98a9eeef9dce178b7062d to your computer and use it in GitHub Desktop.
A little Ruby script to download WaniKani's ~2000 kanji and persist them in ActiveRecord
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
require_relative "../config/environment" | |
# Fetches the kanji list for one WaniKani level from the WaniKani API.
#
# @param level [Integer] level number interpolated into the request path
# @return [Object] the "requested_information" entry of the parsed JSON response
# @raise [KeyError] if the WANIKANI_API_KEY environment variable is not set
# @raise [RuntimeError] carrying the API's error message when the response has an "error" key
def kanji_at(level)
  # Fail fast on a missing key instead of requesting a URL with an empty key segment.
  api_key = ENV.fetch('WANIKANI_API_KEY')
  uri = URI("https://www.wanikani.com/api/user/#{api_key}/kanji/#{level}")
  response = JSON.parse(Net::HTTP.get(uri))
  raise response["error"]["message"] if response.key?("error")

  response["requested_information"]
end
# Walk levels 1 through 60, fetch each level's kanji, and create a
# WanikaniKanji row for every character that is not stored yet.
1.upto(60) do |level|
  puts "Processing level #{level}"
  kanji_at(level).each do |raw_kanji|
    # Strip the :user_specific entry; every remaining key is assigned as an attribute.
    attributes = raw_kanji.with_indifferent_access.except(:user_specific)
    lookup = attributes.slice(:character)
    # The block only runs when no row matched the character, i.e. on creation.
    WanikaniKanji.find_or_create_by!(lookup) { |record| record.assign_attributes(attributes) }
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
require_relative "../config/environment" | |
require 'mojinizer' | |
# Assigns every Japanese sentence a `wk_level`: the highest WaniKani level among
# the kanji it contains, 0 when it contains no kanji, and WK_MAX_LEVEL + 1 when
# it contains a kanji that has no WanikaniKanji row. Prints summary counts at the end.

# Count the same scope the loop iterates, so the announced total matches the work done.
japanese_sentences = Sentence.where(:language => 'jpn')
puts "Sentences to process: #{japanese_sentences.count}"

# Index the kanji by character once; each lookup below is then a hash access
# instead of a scan over every WanikaniKanji record.
wk_kanji_by_character = WanikaniKanji.all.index_by(&:character)
WK_MAX_LEVEL = 60
missing_wk_kanji = []
level_counts = Hash.new(0)

japanese_sentences.find_each.with_index do |sentence, i|
  # Distinct Han characters in this sentence.
  sentence_kanji = sentence.text.scan(/[\p{Han}]/).uniq
  max_kanji_level = sentence_kanji.map do |kanji|
    wk_kanji = wk_kanji_by_character[kanji]
    if wk_kanji
      wk_kanji.level
    else
      # Recorded once per sentence containing it (sentence_kanji is already uniq'd).
      missing_wk_kanji << kanji
      WK_MAX_LEVEL + 1 # <-- Gotta assume we consider these sentences "hard"
    end
  end.max || 0 # `max` of an empty list is nil: a sentence without kanji gets level 0

  level_counts[max_kanji_level] += 1
  sentence.update!(:wk_level => max_kanji_level)
  puts "Processed #{i+1} sentences" if (i + 1) % 10_000 == 0
end

puts "Total occurrences of kanji unknown to Wanikani: #{missing_wk_kanji.size}"
puts "Unique kanji unknown to Wanikani: #{missing_wk_kanji.uniq.size}"
puts "Sentence count by WK level:"
level_counts.keys.sort.each do |level|
  puts " #{level} => #{level_counts[level]}"
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ ./script/add_wk_kanji_level_to_sentences | |
Sentences to process: 891800 | |
Processed 10000 sentences | |
Processed 20000 sentences | |
Processed 30000 sentences | |
Processed 40000 sentences | |
Processed 50000 sentences | |
Processed 60000 sentences | |
Processed 70000 sentences | |
Processed 80000 sentences | |
Processed 90000 sentences | |
Processed 100000 sentences | |
Processed 110000 sentences | |
Processed 120000 sentences | |
Processed 130000 sentences | |
Processed 140000 sentences | |
Processed 150000 sentences | |
Processed 160000 sentences | |
Processed 170000 sentences | |
Processed 180000 sentences | |
Total occurrences of kanji unknown to Wanikani: 5785 | |
Unique kanji unknown to Wanikani: 919 | |
Sentence count by WK level: | |
0 => 2062 | |
1 => 392 | |
2 => 729 | |
3 => 743 | |
4 => 1299 | |
5 => 2945 | |
6 => 2439 | |
7 => 1831 | |
8 => 3775 | |
9 => 4141 | |
10 => 3844 | |
11 => 2774 | |
12 => 3221 | |
13 => 2475 | |
14 => 9123 | |
15 => 4313 | |
16 => 4693 | |
17 => 2550 | |
18 => 3744 | |
19 => 2563 | |
20 => 3024 | |
21 => 2527 | |
22 => 3070 | |
23 => 2457 | |
24 => 3013 | |
25 => 2677 | |
26 => 3150 | |
27 => 3862 | |
28 => 2181 | |
29 => 1783 | |
30 => 2794 | |
31 => 2476 | |
32 => 3474 | |
33 => 2312 | |
34 => 2640 | |
35 => 40652 | |
36 => 2788 | |
37 => 2342 | |
38 => 3201 | |
39 => 1849 | |
40 => 1325 | |
41 => 1234 | |
42 => 1813 | |
43 => 3303 | |
44 => 3882 | |
45 => 2032 | |
46 => 1922 | |
47 => 975 | |
48 => 2170 | |
49 => 1073 | |
50 => 1658 | |
51 => 1190 | |
52 => 1341 | |
53 => 444 | |
54 => 539 | |
55 => 443 | |
56 => 676 | |
57 => 302 | |
58 => 319 | |
59 => 750 | |
60 => 224 | |
61 => 5111 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
require_relative "../config/environment" | |
# Builds the query for Japanese sentences tagged with the given WaniKani level,
# joined to their translations and grouped by sentence id.
#
# @param level [Integer] value matched against the `wk_level` column
# @return the resulting Sentence query
def sentences_at(level)
  japanese_at_level = Sentence.where(language: 'jpn', wk_level: level)
  japanese_at_level.joins(:translations).group('sentences.id')
end
require "csv" | |
# Writes a CSV file with one row per Japanese sentence for levels 1..MAX_WK_LEVEL.
# Each row is [sentence text, HTML for the back of the card, space-separated tags].
MAX_WK_LEVEL = 60
presents_token = PresentsToken.new
path = "tmp/anki-sentence-deck.csv"

CSV.open(path, "wb") do |csv|
  (1..MAX_WK_LEVEL).each do |level|
    puts "Building sentences for level #{level}"

    # Tags depend only on the level, so build them once per level rather than per sentence:
    # one "wanikani-level-NN" tag plus a "wanikani-all-NN" tag for this and every higher
    # level, with NN zero-padded to two digits.
    tags = ["wanikani-level-#{level.to_s.rjust(2, "0")}"]
    tags += (level..MAX_WK_LEVEL).map do |higher_level|
      "wanikani-all-#{higher_level.to_s.rjust(2, "0")}"
    end
    tag_field = tags.join(" ")

    sentences_at(level).each do |sentence|
      # One list of furigana pairs per token, rendered as concatenated <ruby> markup.
      furigana = sentence.tokens.map { |t| presents_token.call(t).furigana_tokens }
      reading = furigana.map do |furigana_pairs|
        furigana_pairs.map do |furigana_pair|
          "<ruby><rb>#{furigana_pair.text}<rt>#{furigana_pair.furigana}</ruby>"
        end.join
      end.join

      translations = sentence.translations.select { |t| t.translation.english? }
      url_escaped_sentence = ERB::Util.u(sentence.text)

      # NOTE(review): reading and translation text are interpolated into HTML unescaped;
      # confirm the source data cannot contain "<" or "&" before relying on this markup.
      back = <<~BACK
        <h3>Reading:</h3>
        <p>
        #{reading}
        </p>
        <hr/>
        <h3>Translation#{translations.size != 1 ? 's' : ''}:</h3>
        <ul style="text-align: left">
        #{translations.map {|t| "<li>#{t.text}</li>"}.join}
        </ul>
        <hr/>
        <h3>Actions:</h3>
        <ul style="text-align: left">
        <li><a href="http://translate.google.com/#ja/en/#{url_escaped_sentence}">Google Translate</a></li>
        <li><a href="midori://translate?text=#{url_escaped_sentence}">Midori.app</a></li>
        <li><a href="japanese://search/#{url_escaped_sentence}">Japanese.app</a></li>
        </ul>
      BACK

      csv << [sentence.text, back, tag_field]
    end
  end
end

# Report the same path the file was written to instead of a re-typed literal.
puts "Deck written to #{path}"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment