Skip to content

Instantly share code, notes, and snippets.

@zsxsoft
Created May 5, 2017 08:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zsxsoft/95e7227a40d728a2b93d88549e8d6be1 to your computer and use it in GitHub Desktop.
Save zsxsoft/95e7227a40d728a2b93d88549e8d6be1 to your computer and use it in GitHub Desktop.
qcdata
const nodejieba = require("nodejieba");
let map = new Map();
const cheerio = require('cheerio')
const fs = require('fs')
const walk = require('walk')
const path = require('path')
// Walk the "data" directory, relative to the current working directory.
// The path is built with path.join instead of the literal '.\\data' because a
// backslash is only a path separator on Windows; elsewhere the literal would
// name a single entry called ".\data".
const walker = walk.walk(path.join('.', 'data'), {followLinks: false})
// Not read by any live code in this script.
const texts = []
/**
 * Handle one file reported by the walker: read it as UTF-8 and, for every
 * line, add that line's weight to the running total kept in `map` for each
 * token of the line's text.
 *
 * Each line is split on the literal separator `||||||`; the first part is the
 * text to segment and the second part is the weight.
 *
 * @param {string} root - Directory that contains the file.
 * @param {{name: string}} fileStat - Stat object for the file; only `name` is used.
 * @param {Function} next - Callback that tells the walker to continue.
 */
walker.on('file', (root, fileStat, next) => {
  const filePath = path.resolve(root, fileStat.name);
  fs.readFile(filePath, 'utf-8', (err, data) => {
    if (err) {
      // Without this guard `data` is undefined and `data.split` would throw,
      // aborting the whole walk because of a single unreadable file.
      console.error(`Skipping ${filePath}: ${err.message}`);
      next();
      return;
    }
    data.split('\n').forEach((line) => {
      const parts = line.split('||||||');
      const weight = Math.floor(Number(parts[1]));
      // Lines with no separator (e.g. a blank trailing line) or a non-numeric
      // weight produce NaN. Adding NaN would turn the token's total into NaN
      // for good, so such lines are ignored.
      if (!Number.isFinite(weight)) {
        return;
      }
      // NOTE(review): the second argument `true` is presumably nodejieba's
      // HMM switch - confirm against the nodejieba documentation.
      nodejieba.cut(parts[0], true).forEach((word) => {
        map.set(word, (map.get(word) || 0) + weight);
      });
    });
    next();
  });
});
/**
 * Once the walk has finished, write the accumulated totals to `3.txt`.
 *
 * Entries are ordered by descending weight. Only words that contain at least
 * one character in the range U+4E00..U+9FA5 and are longer than one UTF-16
 * unit are written, one `Word = ..., Weight = ...` line per word.
 */
walker.on("end", () => {
  const cjkPattern = /[\u4e00-\u9fa5]/;
  // Array.from on a Map yields [key, value] pairs in insertion order.
  const entries = Array.from(map);
  entries.sort((left, right) => Math.floor(right[1]) - Math.floor(left[1]));

  const lines = [];
  for (const entry of entries) {
    const word = entry[0];
    if (cjkPattern.test(word) && word.length > 1) {
      lines.push(`Word = ${word}, Weight = ${entry[1]}`);
    }
  }

  const report = lines.join('\n');
  fs.writeFileSync('3.txt', report, 'utf-8');
})
/* result = nodejieba.cut("南京市长江大桥");
console.log(result);*/
{
"dependencies": {
"cheerio": "^0.22.0",
"nodejieba": "^2.2.4",
"walk": "^2.3.9"
}
}
const cheerio = require('cheerio')
const fs = require('fs')
const walk = require('walk')
const path = require('path')
// Input and output roots, resolved against the current working directory.
// Using path.resolve instead of the literal '.\\raw' keeps the script usable
// on platforms where a backslash is not a path separator.
const RAW_DIR = path.resolve('raw');
const DATA_DIR = path.resolve('data');
const walker = walk.walk(RAW_DIR, {followLinks: false});

/**
 * Handle one HTML file found under RAW_DIR: extract one record per
 * `.results tr` row and write the records to the file at the same relative
 * location under DATA_DIR.
 *
 * A record is `<text>||||||<weight>`, where the text is the HTML of the row's
 * 6th cell up to the first `<br>` and the weight is the product of the 3rd
 * and 4th cells. Rows without a 6th cell are ignored.
 *
 * The destination directory is not created here, so it must already exist.
 *
 * @param {string} root - Directory that contains the file.
 * @param {{name: string}} fileStat - Stat object for the file; only `name` is used.
 * @param {Function} next - Callback that tells the walker to continue.
 */
walker.on('file', (root, fileStat, next) => {
  const inputPath = path.resolve(root, fileStat.name);
  fs.readFile(inputPath, (err, buffer) => {
    if (err) {
      // Without this guard `buffer` is undefined and `buffer.toString()`
      // would throw, aborting the whole walk.
      console.error(`Skipping ${inputPath}: ${err.message}`);
      next();
      return;
    }
    console.log(inputPath);
    const $ = cheerio.load(buffer.toString(), {decodeEntities: false});
    const ret = [];
    $('.results tr').each(function (index, child) {
      const row = $(this);
      const w = row.find('td:nth-child(6)').html();
      if (!w) {
        return;
      }
      const a = row.find('td:nth-child(3)').html();
      const b = row.find('td:nth-child(4)').html();
      const weight = a * b;
      // A non-numeric cell makes the product NaN, which would be written out
      // as the text "NaN"; leave such rows out of the output.
      if (!Number.isFinite(weight)) {
        return;
      }
      ret.push([w.split("<br>")[0], "||||||", weight].join(""));
    });
    // Map the input location onto DATA_DIR by its path relative to RAW_DIR.
    // Replacing the substring "raw" in the absolute path would instead change
    // the first occurrence anywhere in it, e.g. inside a parent directory
    // name.
    const outputPath = path.join(DATA_DIR, path.relative(RAW_DIR, inputPath));
    fs.writeFileSync(outputPath, ret.join("\n"), "utf-8");
    next();
  });
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment