Skip to content

Instantly share code, notes, and snippets.

@liudonghua123
Created June 10, 2020 09:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save liudonghua123/e728cd704f17a0a23b799619a86f56b7 to your computer and use it in GitHub Desktop.
Save liudonghua123/e728cd704f17a0a23b799619a86f56b7 to your computer and use it in GitHub Desktop.
sensitive word
const pinyin = require('pinyin');
const fs = require('fs');
/**
 * Read a UTF-8 text file and return its lines with surrounding whitespace removed.
 *
 * The file is split on '\n'; trimming each piece also strips the trailing '\r'
 * left behind by CRLF line endings. Empty lines (including the one produced by
 * a trailing newline) are kept as empty strings.
 *
 * @param {string} filePath - Path of the file to read.
 * @returns {string[]} The trimmed lines, in file order.
 * @throws {Error} Whatever `fs.readFileSync` throws (e.g. the file is missing).
 */
const readFile = (filePath) => {
  const contents = fs.readFileSync(filePath, 'utf8');
  // `map` returns a new array, so its result has to be the value we return.
  return contents.split('\n').map((line) => line.trim());
};
/**
 * Convert each word to a single pinyin string.
 *
 * Every word is passed to `pinyin()` with the tone-less STYLE_NORMAL style;
 * the returned array is flattened one level and its pieces are concatenated
 * without a separator.
 *
 * @param {string[]} words - Words to convert.
 * @returns {string[]} One concatenated pinyin string per input word, same order.
 */
const toPinYin = (words) => {
  const convertWord = (word) => {
    const options = { style: pinyin.STYLE_NORMAL };
    const syllables = pinyin(word, options);
    const flattened = syllables.flat();
    return flattened.join('');
  };
  return words.map((word) => convertWord(word));
};
/**
 * Script entry point: build a pinyin word list from the sensitive-word files.
 *
 * Steps: read every word list, drop duplicate words, convert to pinyin,
 * normalise (strip whitespace, lowercase), keep entries of 6-16 ASCII
 * letters/digits, de-duplicate, sort by length then alphabetically, and write
 * the result to `output.txt` (one entry per line).
 *
 * Nothing here is awaited, so the wrapper is a plain synchronous function;
 * any failure surfaces as a normal thrown error.
 */
(() => {
  const dir = 'data/敏感词库';
  const files = [
    '反动词库.txt',
    '广告.txt',
    '政治类.txt',
    '敏感词.txt',
    '暴恐词库.txt',
    '民生词库.txt',
    '涉枪涉爆违法信息关键词.txt',
    '色情词库.txt',
  ];

  // flatMap instead of push(...lines): spreading a very large array into
  // function arguments can exceed the engine's argument limit.
  const allLines = files.flatMap((fileName) => readFile(`${dir}/${fileName}`));
  // Remove duplicate words before the pinyin conversion.
  const words = [...new Set(allLines)];

  // Entries are lowercased before this test, so `a-z` plus `0-9` is enough.
  // The ranges must be written with an ASCII hyphen and must not be merged
  // into `A-z`, which would also admit the punctuation [ \ ] ^ _ `.
  const alphanumeric = /^[a-z0-9]+$/;
  const minLength = 6;
  const maxLength = 16;

  const candidates = toPinYin(words)
    // `\s` already covers '\r' and '\n', so one class strips all whitespace.
    .map((item) => item.replace(/\s/g, '').toLowerCase())
    .filter(
      (item) =>
        item.length >= minLength &&
        item.length <= maxLength &&
        alphanumeric.test(item)
    );

  // Different words can map to the same pinyin, so de-duplicate again,
  // then sort by length and alphabetically within each length.
  const results = [...new Set(candidates)];
  results.sort((a, b) => a.length - b.length || a.localeCompare(b));

  fs.writeFileSync('output.txt', results.join('\n'));
})();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment