Skip to content

Instantly share code, notes, and snippets.

@m7v
Created June 27, 2019 10:05
Show Gist options
  • Save m7v/abc703a0076daad2929f5eac63336d6b to your computer and use it in GitHub Desktop.
Save m7v/abc703a0076daad2929f5eac63336d6b to your computer and use it in GitHub Desktop.
Parse the Jisho site to extract kanji and their translations.
const jsdom = require("jsdom");
const request = require("request");
const { JSDOM } = jsdom;
// Accumulator for every parsed kanji record; parseKanji appends to it.
const items = [];
// Number of the first result page to request.
const page = 1;
// Divisor used to turn the reported result count into a page count.
const pageSize = 20;
// Tag placed in the search query to select which kanji set to fetch.
const level = 'jlpt-n5';
/**
 * Builds the Jisho search URL for one page of a tagged kanji search.
 *
 * @param {string} l - Tag inserted after the encoded `#` (`%23`), e.g. `jlpt-n5`.
 * @param {number} p - Value for the `page` query parameter.
 * @returns {string} The search URL for that tag and page.
 */
function buildPageUrl(l, p) {
  return `https://jisho.org/search/%23${l}%20%23kanji?page=${p}`;
}
/**
 * Extracts the reported number of search results from a search page.
 *
 * @param {string} html - Markup of a search result page.
 * @returns {string} The first run of digits in the `.result_count` element,
 *   returned as a string (callers rely on numeric coercion when dividing).
 * @throws {Error} If the page has no `.result_count` element, or its text
 *   contains no digits.
 */
const parseCountResult = function (html) {
  const dom = new JSDOM(html);
  const counter = dom.window.document.querySelector('.result_count');
  // A missing counter usually means an error page or a changed layout;
  // fail with a message that says so instead of a null dereference.
  if (!counter) {
    throw new Error('Result count element (.result_count) not found in page');
  }
  const text = counter.textContent;
  const digits = text.match(/\d+/);
  if (!digits) {
    throw new Error(`Result count text contains no number: "${text.trim()}"`);
  }
  return digits[0];
};
/**
 * Parses every kanji entry on a search result page and appends one record
 * per entry to the module-level `items` array.
 *
 * Each record has the shape `{ sign, kun, on, meaning }`, where every value
 * is the trimmed text of the matching element. `kun`, `on` and `meaning`
 * fall back to an empty string when their element is absent. Entries that
 * have no kanji literal are skipped, because a record without `sign` is
 * useless and one malformed entry should not abort the whole page.
 *
 * @param {string} html - Markup of a search result page.
 * @returns {void}
 */
const parseKanji = function (html) {
  const dom = new JSDOM(html);
  const entries = dom.window.document.querySelectorAll('.entry.kanji_light.clearfix');

  // Trimmed text of the first descendant matching `selector`, or '' if none.
  const textOf = (entry, selector) => {
    const node = entry.querySelector(selector);
    return node ? node.textContent.trim() : '';
  };

  entries.forEach((entry) => {
    const sign = textOf(entry, '.character.literal.japanese_gothic a');
    if (!sign) {
      return;
    }
    items.push({
      sign,
      kun: textOf(entry, '.kun.readings'),
      on: textOf(entry, '.on.readings'),
      meaning: textOf(entry, '.meanings.english.sense'),
    });
  });
};
/*
 * Script entry point.
 *
 * 1. Fetch the first search page for `level` and read the total result count.
 * 2. Request every result page (starting at `page`) concurrently.
 * 3. Once all requests have settled, parse the pages in page order so the
 *    order of `items` does not depend on which response arrived first.
 * 4. Print the collected records as JSON.
 *
 * A page that fails to download is logged and skipped; the remaining pages
 * are still parsed.
 */
request(`https://jisho.org/search/%23${level}%20%23kanji`, function (error, response, body) {
  if (error) {
    console.log(error);
    return;
  }
  // `request` reports HTTP error statuses through `response`, not `error`.
  if (response.statusCode !== 200) {
    console.log(`Unexpected HTTP status ${response.statusCode} for level ${level}`);
    return;
  }

  const count = parseCountResult(body);
  const pages = Math.ceil(count / pageSize);
  console.log(`Need to parse ${pages} pages (${count} results) for level ${level}`);

  // Resolves with the page markup, or with null when the download failed.
  // It never rejects, so one bad page cannot discard the others.
  const fetchPage = (pageNumber) => new Promise((resolve) => {
    request(buildPageUrl(level, pageNumber), function (pageError, pageResponse, pageBody) {
      if (pageError) {
        console.log(pageError);
        resolve(null);
        return;
      }
      if (pageResponse.statusCode !== 200) {
        console.log(`Page ${pageNumber} returned HTTP status ${pageResponse.statusCode}`);
        resolve(null);
        return;
      }
      resolve(pageBody);
    });
  });

  const downloads = [];
  for (let currentPage = page; currentPage <= pages; currentPage++) {
    downloads.push(fetchPage(currentPage));
  }

  Promise.all(downloads)
    .then((bodies) => {
      // Promise.all keeps the input order, so index maps back to page number.
      bodies.forEach((pageBody, index) => {
        const currentPage = page + index;
        if (pageBody === null) {
          return;
        }
        console.log(`Page ${currentPage} starting parse`);
        parseKanji(pageBody);
        console.log(`Page ${currentPage} finished parse`);
        console.log(`Items parsed: ${items.length}`);
      });
      console.log(`Total items ${items.length}`);
      console.log(JSON.stringify(items));
    })
    .catch((parseError) => {
      // Parsing problems end up here instead of as an unhandled rejection.
      console.log(parseError);
      process.exitCode = 1;
    });
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment