Created
February 25, 2018 05:38
-
-
Save Jagua/daeaca9c1adc04769f66ffb87eb3a2b8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
scriptencoding utf-8 | |
" gen_gorin_dict.vim - Generate dictionaries of athletes from gorin.jp | |
" | |
" generate two files: | |
" 1. gorin%Y.migemodict | |
" 2. gorin%Y_msime.txt | |
" | |
" usage: | |
" vim -N -u NONE -i NONE -V1 -e -s -X --cmd "source gen_gorin_dict.vim" --cmd qall! | |
" (wait about 100 seconds...) | |
" Note: change the runtimepath entry below so that it points at your own webapi-vim checkout.
set runtimepath+=~/.cache/dein/repos/github.com/mattn/webapi-vim | |
" Collect the absolute URLs linked from the athlete top page.
"
" The links are taken from the <li> entries of the <ul> inside the
" div with class 'select-word'.
"
" Returns: List of URL strings, each prefixed with the site origin.
function! s:get_urls() abort
  let l:top_page = webapi#html#parseURL('https://www.gorin.jp/athlete/')
  let l:selector = l:top_page.find('div', {'class' : 'select-word'})
  let l:items = l:selector.childNode('ul').childNodes('li')
  " Turn every <li> into the absolute URL of its <a href="...">.
  return map(copy(l:items), '"https://www.gorin.jp" . v:val.childNode("a").attr["href"]')
endfunction
" Fetch one athlete profile page and extract the name in kanji and kana.
"
" Args:
"   relative_path: site-relative path of the profile page; it is appended
"                  to 'https://www.gorin.jp'.
"
" Returns:
"   Dictionary with the keys last_name_kanji, last_name_kana,
"   first_name_kanji, first_name_kana, full_name_kanji and full_name_kana.
"   An empty Dictionary is returned when no usable name can be extracted
"   (missing heading, empty kana, or a name that does not consist of exactly
"   two space-separated parts), so that one odd page does not abort the run.
function! s:get_athleate_data(relative_path) abort
  let l:url = 'https://www.gorin.jp' . a:relative_path
  let l:root_node = webapi#html#parseURL(l:url)

  " NOTE(review): assumes find() yields an empty value when nothing
  " matches -- confirm against the webapi-vim version in use.
  let l:prof_node = l:root_node.find('div', {'class' : 'heading profile'})
  if empty(l:prof_node)
    return {}
  endif
  let l:h2_node = l:prof_node.find('h2')
  if empty(l:h2_node) || len(l:h2_node.child) < 2
    return {}
  endif

  " The first child is used as the kanji name text, the second child as the
  " element whose value() is the kana reading.
  let l:kanji_item = l:h2_node.child[0]
  let l:kana_item = l:h2_node.child[1]
  if type(l:kanji_item) != type('') || type(l:kana_item) != type({})
    return {}
  endif

  let l:name_kana = l:kana_item.value()
  if empty(l:name_kana)
    return {}
  endif

  " Destructuring a List of the wrong length raises E687/E688, so check the
  " part count first.  A run of spaces counts as a single separator.
  let l:kanji_parts = split(l:kanji_item, ' \+')
  let l:kana_parts = split(l:name_kana, ' \+')
  if len(l:kanji_parts) != 2 || len(l:kana_parts) != 2
    return {}
  endif
  let [l:last_name_kanji, l:first_name_kanji] = l:kanji_parts
  let [l:last_name_kana, l:first_name_kana] = l:kana_parts

  return {
        \ 'last_name_kanji' : l:last_name_kanji,
        \ 'last_name_kana' : l:last_name_kana,
        \ 'first_name_kanji' : l:first_name_kanji,
        \ 'first_name_kana' : l:first_name_kana,
        \ 'full_name_kanji' : l:last_name_kanji . l:first_name_kanji,
        \ 'full_name_kana' : l:last_name_kana . l:first_name_kana,
        \}
endfunction
" Build the three tab-separated dictionary lines for one athlete:
" reading, written form and a category label ('姓', '名', '人名') for the
" last name, the first name and the full name respectively.
"
" Args:
"   athlete_data: Dictionary holding the *_name_kana / *_name_kanji keys.
" Returns: List of three strings.
function! s:make_msime_lines(athlete_data) abort
  let l:rec = a:athlete_data
  let l:rows = [
        \ [l:rec.last_name_kana, l:rec.last_name_kanji, '姓'],
        \ [l:rec.first_name_kana, l:rec.first_name_kanji, '名'],
        \ [l:rec.full_name_kana, l:rec.full_name_kanji, '人名'],
        \]
  return map(l:rows, 'join(v:val, "\t")')
endfunction
" Build the three "reading<TAB>written form" lines for one athlete, in the
" order last name, first name, full name.
"
" Args:
"   athlete_data: Dictionary holding the *_name_kana / *_name_kanji keys.
" Returns: List of three strings.
function! s:make_migemodict_lines(athlete_data) abort
  let l:rec = a:athlete_data
  let l:lines = []
  for l:part in ['last', 'first', 'full']
    let l:reading = l:rec[l:part . '_name_kana']
    let l:written = l:rec[l:part . '_name_kanji']
    call add(l:lines, l:reading . "\t" . l:written)
  endfor
  return l:lines
endfunction
" Collect the name data of every athlete linked from one listing page.
"
" Args:
"   url: absolute URL of a listing page.  The athlete links are read from
"        the <li> entries under section 'data-block athletes' > div 'list'
"        > ul.
" Returns:
"   List of the Dictionaries produced by s:get_athleate_data(); athletes for
"   which that function returns an empty Dictionary are left out.
function! s:gather_athlete_data(url) abort
  let l:athlete_data = []
  let l:root_node = webapi#html#parseURL(a:url)
  let l:list_nodes = l:root_node
        \.find('section', {'class' : 'data-block athletes'})
        \.childNode('div', {'class' : 'list'})
        \.childNode('ul').childNodes('li')
  for l:n in l:list_nodes
    let l:relative_path = l:n.childNode('a').attr['href']
    let l:data = s:get_athleate_data(l:relative_path)
    " Pause after every profile request, including the ones whose result is
    " discarded, so that skipped athletes do not cause back-to-back fetches.
    sleep 10m
    if !empty(l:data)
      call add(l:athlete_data, l:data)
    endif
  endfor
  return l:athlete_data
endfunction
" Gather the athlete data of every page returned by s:get_urls().
"
" Returns: one flat List containing the results of
"          s:gather_athlete_data() for each URL, in URL order.
function! s:gather_data() abort
  let l:records = []
  for l:page_url in s:get_urls()
    " += on a List appends the right-hand items in place.
    let l:records += s:gather_athlete_data(l:page_url)
  endfor
  return l:records
endfunction
" Entry point: scrape all athlete names and write the two dictionary files
" into the current directory.
"
"   gorin%Y.migemodict - "reading<TAB>written form" lines, written as-is
"   gorin%Y_msime.txt  - "reading<TAB>written form<TAB>category" lines,
"                        converted to cp932 with CR LF line endings
"
" %Y is the current year as returned by strftime().
function! s:main() abort
  let l:data = s:gather_data()
  let l:year = strftime('%Y')

  let l:migemodict_lines = []
  let l:msime_lines = []
  for l:athlete in l:data
    call extend(l:migemodict_lines, s:make_migemodict_lines(l:athlete))
    call extend(l:msime_lines, s:make_msime_lines(l:athlete))
  endfor

  call writefile(l:migemodict_lines, printf('gorin%s.migemodict', l:year))

  " Strings inside Vim are held in 'encoding', which is not necessarily
  " utf-8 when Vim is started with -u NONE, so convert from &encoding.
  " writefile() appends the NL itself; the explicit "\r" makes it CR LF.
  call map(l:msime_lines, 'iconv(v:val, &encoding, "cp932") . "\r"')
  call writefile(l:msime_lines, printf('gorin%s_msime.txt', l:year))
  echo 'done.'
endfunction

" Run the generator as soon as the script is sourced.
call s:main()
" vim: set et sw=0 ts=2 fdm=indent: |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment