Created
June 10, 2020 09:05
-
-
Save liudonghua123/e728cd704f17a0a23b799619a86f56b7 to your computer and use it in GitHub Desktop.
sensitive word
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const pinyin = require('pinyin'); | |
const fs = require('fs'); | |
/**
 * Reads a UTF-8 text file and returns its lines with surrounding
 * whitespace removed.
 *
 * The file is split on '\n'; trimming each line also strips the '\r'
 * left behind by CRLF line endings. Blank lines are kept (as empty
 * strings) so the number of returned entries matches the number of
 * lines in the file.
 *
 * @param {string} filePath - Path of the file to read.
 * @returns {string[]} The trimmed lines of the file.
 * @throws {Error} Whatever `fs.readFileSync` throws (e.g. missing file).
 */
const readFile = (filePath) => {
  const contents = fs.readFileSync(filePath, 'utf8');
  // `map` returns a new array; the trimmed result must be the one returned.
  return contents.split('\n').map((line) => line.trim());
};
/**
 * Converts every word to a single pinyin string.
 *
 * Each word is passed to `pinyin` with `style: pinyin.STYLE_NORMAL`; the
 * nested array it returns is flattened one level and the pieces are
 * concatenated without a separator.
 *
 * @param {string[]} words - Words to convert.
 * @returns {string[]} One concatenated pinyin string per input word, in order.
 */
const toPinYin = (words) => {
  const spellings = [];
  for (const word of words) {
    const syllables = pinyin(word, { style: pinyin.STYLE_NORMAL }).flat();
    spellings.push(syllables.join(''));
  }
  return spellings;
};
/**
 * Script entry point.
 *
 * Reads every word list under `data/敏感词库`, converts the words to pinyin,
 * keeps the spellings that are 6-16 characters long and consist only of
 * lower-case ASCII letters and digits, and writes the unique results to
 * `output.txt`, sorted by length and then alphabetically.
 *
 * All work is synchronous, so a plain (non-async) IIFE is used and any
 * file-system error surfaces as an ordinary exception.
 */
(() => {
  const dir = 'data/敏感词库';
  const files = [
    '反动词库.txt',
    '广告.txt',
    '政治类.txt',
    '敏感词.txt',
    '暴恐词库.txt',
    '民生词库.txt',
    '涉枪涉爆违法信息关键词.txt',
    '色情词库.txt',
  ];

  // Collect into a Set: removes duplicates and avoids `push(...hugeArray)`,
  // which can exceed the engine's argument-count limit on large files.
  const words = new Set();
  for (const fileName of files) {
    for (const word of readFile(`${dir}/${fileName}`)) {
      words.add(word);
    }
  }

  // Only ASCII letters and digits are allowed. The input is lower-cased
  // first, so `a-z` suffices; note the plain hyphen in `0-9` (an en dash
  // here would match just the three characters `0`, `–` and `9`).
  const alphanumeric = /^[a-z0-9]+$/;

  const candidates = toPinYin([...words])
    // remove spaces and line breaks (`\s` covers \r and \n), then normalize case
    .map((item) => item.replace(/\s/g, '').toLowerCase())
    // filter by length and allowed characters
    .filter((item) => item.length >= 6 && item.length <= 16 && alphanumeric.test(item));

  // remove duplicates, then sort by length and then alphabetically
  const results = [...new Set(candidates)];
  results.sort((a, b) => a.length - b.length || a.localeCompare(b));

  fs.writeFileSync('output.txt', results.join('\n'));
})();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment