Created
June 27, 2019 10:05
-
-
Save m7v/abc703a0076daad2929f5eac63336d6b to your computer and use it in GitHub Desktop.
Parses the Jisho site to extract kanji and their translations.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const jsdom = require("jsdom");
const request = require("request");
const { JSDOM } = jsdom;
// Accumulator for every parsed kanji entry; parseKanji appends to it.
const items = [];
// First result page requested by the scrape loop.
const page = 1;
// Results per search page, used to turn the result count into a page count.
// NOTE(review): assumed to match what jisho.org actually serves - confirm.
const pageSize = 20;
// Search tag (without the leading '#') selecting which kanji to fetch.
const level = 'jlpt-n5';
/**
 * Builds the Jisho search URL for one page of the query `#<l> #kanji`.
 *
 * The query is percent-encoded as a whole, so a tag containing reserved
 * characters cannot break out of the path segment.
 *
 * @param {string} l - Search tag without the leading `#`, e.g. `jlpt-n5`.
 * @param {number} p - Page number placed in the `page` query parameter.
 * @returns {string} Absolute URL of the requested search page.
 */
const buildPageUrl = (l, p) =>
  `https://jisho.org/search/${encodeURIComponent(`#${l} #kanji`)}?page=${p}`;
/**
 * Reads the total number of search results from a results page.
 *
 * @param {string} html - HTML of a search results page.
 * @returns {string} The first run of digits found in the `.result_count`
 *   element, returned as a string (callers coerce it to a number).
 * @throws {Error} If the page has no `.result_count` element, or that
 *   element's text contains no digits.
 */
const parseCountResult = function (html) {
  const dom = new JSDOM(html);
  const counter = dom.window.document.querySelector('.result_count');
  if (!counter) {
    throw new Error('Result count not found: page has no .result_count element');
  }

  const digits = counter.textContent.match(/\d+/);
  if (!digits) {
    throw new Error(`Result count not found in text: "${counter.textContent.trim()}"`);
  }
  return digits[0];
};
/**
 * Parses every kanji entry on a search results page and appends each one to
 * the module-level `items` array.
 *
 * Each entry becomes `{ sign, kun, on, meaning }`, all trimmed strings.
 * Missing readings or meanings become `''`. An entry with no character is
 * skipped, so one malformed entry cannot abort the whole page.
 *
 * @param {string} html - HTML of a search results page.
 * @returns {Array<{sign: string, kun: string, on: string, meaning: string}>}
 *   The entries parsed from this page, in document order.
 */
const parseKanji = function (html) {
  // Trimmed text of the first node matching `selector` under `root`, or ''.
  const textOf = (root, selector) => {
    const node = root.querySelector(selector);
    return node ? node.textContent.trim() : '';
  };

  const dom = new JSDOM(html);
  const entries = dom.window.document.querySelectorAll('.entry.kanji_light.clearfix');
  const parsed = [];

  entries.forEach((entry) => {
    const sign = textOf(entry, '.character.literal.japanese_gothic a');
    if (sign === '') {
      // Without the character itself the record is useless; drop it.
      return;
    }
    parsed.push({
      sign,
      kun: textOf(entry, '.kun.readings'),
      on: textOf(entry, '.on.readings'),
      meaning: textOf(entry, '.meanings.english.sense'),
    });
  });

  items.push(...parsed);
  return parsed;
};
/**
 * Downloads `url` and resolves with the response body.
 *
 * @param {string} url - Address to fetch.
 * @returns {Promise<string>} Resolves with the body of a 200 response.
 *   Rejects with the transport error, or with an Error naming the status
 *   code when the server answers with anything other than 200, so an error
 *   page is never handed to the parsers.
 */
const fetchHtml = (url) =>
  new Promise((resolve, reject) => {
    request(url, (error, response, body) => {
      if (error) {
        reject(error);
        return;
      }
      if (response.statusCode !== 200) {
        reject(new Error(`Request for ${url} failed with HTTP status ${response.statusCode}`));
        return;
      }
      resolve(body);
    });
  });

// Entry point: read the result count from the initial search page, fetch
// every result page in parallel, then print all parsed entries as JSON.
fetchHtml(`https://jisho.org/search/%23${level}%20%23kanji`)
  .then((firstBody) => {
    const count = Number(parseCountResult(firstBody));
    const pages = Math.ceil(count / pageSize);
    console.log(`Need to parse ${pages} pages (${count} items) for level ${level}`);

    let parsedSoFar = 0;
    const failedPages = [];
    const pageJobs = [];

    for (let currentPage = page; currentPage <= pages; currentPage++) {
      const job = fetchHtml(buildPageUrl(level, currentPage))
        .then((pageBody) => {
          console.log(`Page ${currentPage} starting parse`);

          // parseKanji appends to `items` synchronously, so everything from
          // `firstNew` onward belongs to this page. Pull it back out so the
          // pages can be re-joined in page order once all have arrived,
          // instead of in whatever order the network delivered them.
          const firstNew = items.length;
          let pageItems = [];
          try {
            parseKanji(pageBody);
          } finally {
            // Runs on failure too, so a page that breaks mid-parse leaves
            // no partial entries behind.
            pageItems = items.splice(firstNew);
          }

          parsedSoFar += pageItems.length;
          console.log(`Page ${currentPage} finished parse`);
          console.log(`Items parsed: ${parsedSoFar}`);
          return pageItems;
        })
        .catch((pageError) => {
          // One bad page must not abort the others; record it for the summary.
          console.log(pageError);
          failedPages.push(currentPage);
          return [];
        });
      pageJobs.push(job);
    }

    return Promise.all(pageJobs).then((itemsPerPage) => {
      itemsPerPage.forEach((pageItems) => {
        items.push(...pageItems);
      });
      if (failedPages.length > 0) {
        console.log(`Failed pages: ${failedPages.join(', ')}`);
      }
      console.log(`Total items ${items.length}`);
      console.log(JSON.stringify(items));
    });
  })
  .catch((error) => {
    console.log(error);
  });
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment