Skip to content

Instantly share code, notes, and snippets.

@sujoyu
Last active October 30, 2019 02:58
Show Gist options
  • Save sujoyu/b93e83596e6de9142933ce7cc9cd711b to your computer and use it in GitHub Desktop.
Save sujoyu/b93e83596e6de9142933ce7cc9cd711b to your computer and use it in GitHub Desktop.
Node.js, MeCabを使用した日本語の辞書順ソート用compare関数。漢字、ひらがな、カタカナに対応しています。
const exec = require('child_process').execSync
const isWindows = require('is-windows')
let Encoding
if (isWindows()) {
Encoding = require('encoding-japanese')
}
/**
 * Converts every hiragana character in a string to its katakana counterpart.
 *
 * Characters in U+3041..U+3096 are shifted up by 0x60 code points; all other
 * characters are returned unchanged.
 *
 * @param {string} str - Text that may contain hiragana.
 * @returns {string} The same text with hiragana replaced by katakana.
 */
function hiraToKana(str) {
  // Shift a single matched hiragana character by the fixed 0x60 offset.
  const toKatakana = hiragana => String.fromCharCode(hiragana.charCodeAt(0) + 0x60)
  return str.replace(/[\u3041-\u3096]/g, toKatakana)
}
// Lookup table from a katakana character to its voicing group: the string
// holding the plain, voiced (dakuten) and, where present, semi-voiced
// (handakuten) forms, in that order (for example 'ハ', 'バ' and 'パ' all map
// to 'ハバパ'). The groups are written in hiragana and converted by hiraToKana.
const dakuMap = {}
for (const group of hiraToKana('かが きぎ くぐ けげ こご さざ しじ すず せぜ そぞ ただ ちぢ つづ てで とど はばぱ ひびぴ ふぶぷ へべぺ ほぼぽ').split(/\s/g)) {
  for (const kana of group) {
    dakuMap[kana] = group
  }
}
/**
 * Compares two kana strings character by character (UTF-16 code units).
 *
 * At the first position where the strings differ:
 *  - if both characters belong to the same voicing group in `dakuMap`
 *    (plain / voiced / semi-voiced), the result is the difference of their
 *    positions inside that group, so plain < voiced < semi-voiced;
 *  - otherwise the result is `localeCompare` of the two characters with the
 *    'ja' locale.
 * If one string is a prefix of the other, the shorter string sorts first.
 *
 * Implemented as a loop rather than recursion so long inputs neither overflow
 * the call stack nor copy the remaining string on every step.
 *
 * @param {string} a - First kana string.
 * @param {string} b - Second kana string.
 * @returns {number} Negative if a sorts before b, positive if after, 0 if equal.
 */
function compareKana(a, b) {
  const shared = Math.min(a.length, b.length)
  for (let i = 0; i < shared; i += 1) {
    const left = a[i]
    const right = b[i]
    if (left === right) {
      continue
    }
    const group = dakuMap[left]
    if (group && group === dakuMap[right]) {
      // Same base kana: order by position inside the voicing group.
      return group.indexOf(left) - group.indexOf(right)
    }
    return left.localeCompare(right, 'ja')
  }
  // No difference within the shared prefix: the shorter string comes first.
  if (a.length === b.length) {
    return 0
  }
  return a.length < b.length ? -1 : 1
}
/**
 * Produces compare functions that order Japanese strings by their reading.
 *
 * Readings are obtained by running an external command (MeCab in yomi mode by
 * default), converted to katakana with `hiraToKana`, and compared with
 * `compareKana`. Readings are memoised in `this.cache`, keyed by the trimmed
 * input string.
 */
class JapaneseComparator {
  /**
   * @param {string} [command] - Shell command that reads text on stdin and
   *   writes its reading to stdout. Defaults to 'mecab -Oyomi'.
   * @param {function(string): string} [resultParser] - Turns the decoded
   *   command output into the reading. Defaults to trimming whitespace.
   * @param {boolean} [noAutoCache] - When truthy, readings are not stored
   *   automatically and `get(list)` does not pre-build the cache.
   */
  constructor(command, resultParser, noAutoCache) {
    if (isWindows()) {
      // Probe MeCab once with a fixed sample so the encoding of its output
      // can be detected. The sample is a constant, so building this command
      // as a string is safe.
      this.detected = Encoding.detect(exec(`echo テキストエンコーディングのテスト | mecab -Oyomi`))
    }
    this.splitter = '\\\\\\'
    this.command = command || 'mecab -Oyomi'
    this.resultParser = resultParser || (output => output.trim())
    this.noAutoCache = noAutoCache
    // Null-prototype object: keys such as 'constructor' or 'toString' must
    // not resolve to inherited Object.prototype members.
    this.cache = Object.create(null)
  }

  /**
   * Runs the configured command on `str` and returns the parsed output.
   *
   * The text is passed on the command's standard input instead of being
   * interpolated into the shell command line, so shell metacharacters in
   * `str` (quotes, `$`, `&`, `;`, backslashes, ...) are never interpreted.
   *
   * @param {string} str - Text to analyse.
   * @returns {string} Whatever `resultParser` returns for the command output.
   */
  execCommand(str) {
    const text = `${str}\n`
    let input = text
    if (this.detected) {
      // Feed the command in the same encoding it was detected to emit.
      input = Buffer.from(Encoding.convert(text, {
        from: 'UNICODE',
        to: this.detected,
        type: 'array'
      }))
    }
    const raw = exec(this.command, { input })
    let result
    if (this.detected) {
      result = Encoding.convert(raw, {
        from: this.detected,
        to: 'UNICODE',
        type: 'string'
      })
    } else {
      // execSync returns a Buffer unless an encoding is requested; the
      // parser expects a string.
      result = raw.toString()
    }
    return this.resultParser(result)
  }

  /**
   * Returns the katakana reading of `str`, using the cache when possible.
   *
   * @param {string} str - Text to convert; it is trimmed before lookup.
   * @param {boolean} [noAutoCache] - When truthy (or when the instance-level
   *   flag is truthy) the computed reading is not stored in the cache.
   * @returns {string} The reading in katakana.
   */
  kanjiToKana(str, noAutoCache) {
    const key = str.trim()
    const skipStore = noAutoCache || this.noAutoCache
    const cached = this.cache[key]
    if (cached !== undefined) {
      return cached
    }
    const result = hiraToKana(this.execCommand(key))
    if (!skipStore) {
      this.cache[key] = result
    }
    return result
  }

  /**
   * Normalises a list of strings the same way cache keys are normalised.
   *
   * @param {string[]} list - Strings to normalise.
   * @returns {string[]} A new array with every entry trimmed.
   */
  preprocess(list) {
    return list.map(s => s.trim())
  }

  /**
   * Fills the cache for a whole list with a single command invocation.
   *
   * All entries are joined with `this.splitter`, converted in one run, and
   * the combined reading is split on the same separator. If the number of
   * parts does not match the number of entries (for example because an entry
   * contains the separator, or the command altered it), nothing is cached
   * and readings are resolved lazily per string instead.
   *
   * @param {string[]} list - Strings whose readings should be cached.
   */
  createCache(list) {
    const items = this.preprocess(list)
    if (items.length === 0) {
      return
    }
    const yomiAll = this.kanjiToKana(items.join(this.splitter), true).split(this.splitter)
    if (yomiAll.length !== items.length) {
      // Misaligned output would attach readings to the wrong strings.
      return
    }
    items.forEach((str, i) => {
      this.cache[str] = yomiAll[i]
    })
  }

  /** Discards every cached reading. */
  clearCache() {
    this.cache = Object.create(null)
  }

  /**
   * Returns a compare function suitable for `Array.prototype.sort`.
   *
   * @param {string[]} [list] - When given and auto-caching is enabled, the
   *   readings of these strings are cached up front in one command run.
   * @returns {function(string, string): number} Comparator ordering two
   *   strings by their katakana readings.
   */
  get(list) {
    if (!this.noAutoCache && list) {
      this.createCache(list)
    }
    return (a, b) => compareKana(this.kanjiToKana(a), this.kanjiToKana(b))
  }
}
module.exports = JapaneseComparator
@sujoyu
Copy link
Author

sujoyu commented Oct 30, 2019

String#localeCompare() は概ね辞書順に動きますが、濁音半濁音を無視します。

これを補完し、漢字の読みにも対応しました。

必須環境

  • Node.js
  • MeCab
$ npm install is-windows encoding-japanese --save

使い方

const Comparator = require('./japanese-comparator')

const list = ['b', 'a', '2', '1', '+', '-', '-2', 'あがき', '悪', '赤城']
// get関数は引数なしでも動作しますが、リストを渡してキャッシュを作ったほうが速くなります
const compare = new Comparator().get(list)
console.log(list.sort(compare))
/*
[
  '-', '-2', '+',
  '1', '2',  'a',
  'b', '赤城', 'あがき',
  '悪'
]
*/

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment