Skip to content

Instantly share code, notes, and snippets.

@eloytoro
Last active September 11, 2021 17:12
Show Gist options
  • Save eloytoro/47dc38085889629488ac to your computer and use it in GitHub Desktop.
Save eloytoro/47dc38085889629488ac to your computer and use it in GitHub Desktop.
/**
* WordCounter.js
* a command-line tool by eloytoro
*
* How to use:
*
* $ node word-counter.js <dir>
* - Count the word occurrences among all the files within the given directory.
* - It will split the task between 3 workers, each of which could run completely standalone
* by communicating to the main server using TCP sockets if it exposes its address publicly
*
* Install:
* You will require
* - nodejs 4 or higher
* I recommend installing node using [NVM](https://github.com/creationix/nvm) by running the following commands on linux
* $ curl -o- https://raw.githubusercontent.com/creationix/nvm/v0.30.1/install.sh | bash
* $ source ~/.profile
* $ nvm install 4
* $ nvm use 4
* Download this script
* - using curl:
* $ curl -O https://gist.githubusercontent.com/eloytoro/47dc38085889629488ac/raw/c4f277f4bf5cb753830c562a05864ec9e7b53769/word-counter.js
* - using wget:
* $ wget https://gist.githubusercontent.com/eloytoro/47dc38085889629488ac/raw/c4f277f4bf5cb753830c562a05864ec9e7b53769/word-counter.js
*/
var cluster = require('cluster');
var net = require('net');
var HOST = '127.0.0.1';
var FILE_TRANSFER_PORT = 9121;
var EMIT_PORT = 9122;
var WORKER_AMOUNT = 3;
// This process forks into multiple workers
if (cluster.isMaster) {
// Core modules that only the master process needs
var path = require('path');
var fs = require('fs');
// Directory to scan; the current directory is used when no argument is passed
var dir = path.resolve(process.argv[2] || '.');
// Files that still have to be handed to a worker
var files = [];
// Global {word: count} tally that the worker results are reduced into
var wordMap = {};
// Counter of files sent out; it is decremented as results come back
var pendingResponses = 0;
// The FileTransferServer takes care of sending text files to workers
var ftServer;
// The ReduceServer receives the processing done by the workers
var reduceServer;
// Start time, used to report how long the whole run took
var now = Date.now();
/**
 * Ranks the words of a tally by their count and keeps the ten largest.
 * @param {object} obj - the tally, a set of {'word': count} entries
 * @returns {Array<{word: string, count: number}>} at most ten entries,
 *   ordered from the highest count to the lowest
 */
function takeTopTen (obj) {
  // Comparator that puts the word with the larger count first
  var byCountDescending = function (left, right) {
    var leftCount = obj[left];
    var rightCount = obj[right];
    if (leftCount > rightCount) { return -1; }
    if (leftCount < rightCount) { return 1; }
    return 0;
  };
  var ranked = Object.keys(obj).sort(byCountDescending);
  var top = [];
  for (var i = 0; i < ranked.length && i < 10; i++) {
    top.push({ word: ranked[i], count: obj[ranked[i]] });
  }
  return top;
}
// Announces every worker process as it terminates
cluster.on('exit', (worker) => {
  console.log(`Worker pid:${worker.process.pid} finished`);
});
/**
 * Pipes the next pending file through a socket. Afterwards the socket will close.
 * When no file is left the socket is simply ended without sending anything.
 * @param {net.Socket} socket - the socket used to transfer the file through
 */
function pipeFile (socket) {
  var file = files.pop();
  // If there are no files left in the list the server shouldn't receive
  // any more requests
  if (!files.length) {
    ftServer.close();
  }
  // Nothing left to hand out (e.g. the scanned directory held no files):
  // end the socket instead of resolving an undefined path, which would throw
  if (file === undefined) {
    socket.end();
    return;
  }
  // One more result is now expected back on the reduce server
  pendingResponses++;
  fs.createReadStream(path.resolve(dir, file)).pipe(socket);
}
/**
 * Callback that handles the socket connection event of the reduce server.
 * Every complete `{...}` message received on the socket is parsed and merged
 * into the global word count; once every pending response has arrived and no
 * file is left, the reduce server is closed.
 * @param {net.Socket} socket - the TCP socket that connected to the server
 */
function handleConnection(socket) {
  // Text received so far that does not form a complete message yet. A result
  // may be split across several 'data' events, so chunks cannot be parsed
  // in isolation.
  var buffered = '';
  socket.on('data', function (data) {
    buffered += data.toString('utf8');
    // Splits the buffered text into all the complete messages contained in it
    var pattern = /\{[^\}]*\}/g;
    var jsons = [];
    var consumed = 0;
    var match;
    while ((match = pattern.exec(buffered)) !== null) {
      jsons.push(match[0]);
      consumed = pattern.lastIndex;
    }
    // Keeps only the unfinished tail for the next chunk
    buffered = buffered.slice(consumed);
    if (!jsons.length) {
      return;
    }
    jsons.forEach(function (json) {
      var results;
      try {
        results = JSON.parse(json);
      } catch (err) {
        // A malformed message still counts as an answer, it just adds nothing
        console.log('error', err);
        return;
      }
      // The word count result from the workers should reduce into
      // the global word count variable
      Object.keys(results).forEach(function (key) {
        // Own-property check: a word such as "constructor" must not pick up
        // the member inherited by the plain tally object
        var seen = Object.prototype.hasOwnProperty.call(wordMap, key);
        wordMap[key] = (seen ? wordMap[key] : 0) + results[key];
      });
    });
    pendingResponses -= jsons.length;
    if (pendingResponses === 0 && !files.length) {
      reduceServer.close();
    }
  });
}
/**
 * Listener for the reduce server 'close' event, which fires when the server is declared
 * closed (refuses any more connections) and all sockets connected to it are closed as well.
 * Prints the ten most frequent words and the elapsed time, then exits the process.
 */
function handleClose() {
  // Takes the results piled up in `wordMap` and keeps the 10 with the highest occurrences
  var ranking = takeTopTen(wordMap);
  ranking.forEach(function (entry) {
    console.log(`the word ${entry.word} has ${entry.count} occurrences`);
  });
  var elapsed = Date.now() - now;
  console.log(`process completed in ${elapsed}ms`);
  process.exit();
}
/**
 * Listener for the server 'listening' event, fired when the main server is ready to
 * receive client-side socket connections. Only then are the workers spawned.
 */
function handleListening() {
  var remaining = WORKER_AMOUNT;
  while (remaining > 0) {
    cluster.fork();
    remaining--;
  }
}
/**
 * Shared 'error' listener of both servers: prints the error and nothing else.
 * @param {Error} err - the error emitted by the server
 */
function handleError (err) {
  var label = 'error';
  console.log(label, err);
}
/**
 * Helper function used to read directories recursively in the filesystem.
 * Only regular files are reported; entries that cannot be stat-ed (for
 * instance dangling symlinks) are skipped instead of aborting the scan.
 * @param {string} dir - the directory to read files from
 * @returns {Array<string>} absolute paths of the files found within the directory
 */
function readdirRecursive (dir) {
  return fs.readdirSync(dir).reduce(function (found, name) {
    var entry = path.resolve(dir, name);
    var stat;
    try {
      stat = fs.statSync(entry);
    } catch (err) {
      // statSync follows symlinks, so a link whose target is missing throws
      return found;
    }
    if (stat.isDirectory()) {
      return found.concat(readdirRecursive(entry));
    }
    // FIFOs, sockets and devices are not text files that can be streamed
    if (stat.isFile()) {
      found.push(entry);
    }
    return found;
  }, []);
}
files = readdirRecursive(dir);
if (!files.length) {
  // Nothing to distribute: starting the servers and workers would only lead
  // to a transfer request with no file to serve. handleClose prints the
  // (empty) ranking plus the elapsed time and exits the process.
  handleClose();
} else {
  // fires up the main server
  ftServer = net.createServer(function (socket) {
    // when a file transfer socket connects to the server we send it a file
    // at once
    pipeFile(socket);
  });
  ftServer.on('error', handleError);
  // the workers are spawned once the file transfer server is listening
  ftServer.on('listening', handleListening);
  ftServer.listen(FILE_TRANSFER_PORT, HOST);
  // the reduce server collects the results; its 'close' event ends the run
  reduceServer = net.createServer(handleConnection);
  reduceServer.on('close', handleClose);
  reduceServer.on('error', handleError);
  reduceServer.listen(EMIT_PORT, HOST);
}
} else {
// Each worker keeps one socket for sending results back and opens many
// other file transfer sockets to receive files
var client = new net.Socket();
// As soon as the result socket is connected the first file is requested
client.once('connect', function () {
  console.log(`Worker pid:${process.pid} connected`);
  openFileTransfer();
});
// Establishes a connection to the main server
client.connect(EMIT_PORT, HOST);
/**
 * Splits the text into lowercase alphanumeric words and counts them.
 * Ex: `'TREE tree house' => {tree: 2, house: 1}`
 * Text without any word (empty, whitespace or punctuation only) yields `{}`.
 * @param {string} text - the contents of the file
 * @returns {object} an object whose keys are the words and whose values are the number of occurrences
 */
var doProcess = function (text) {
  // match() returns null instead of an empty list when nothing matches.
  // The text is already lowercased, so no case-insensitive flag is needed.
  var words = text.toLowerCase().match(/[a-z0-9]+/g) || [];
  return words.reduce(function (acc, word) {
    // Own-property check: a word such as "constructor" must not be mistaken
    // for an existing count because of the member inherited by `{}`
    var seen = Object.prototype.hasOwnProperty.call(acc, word);
    acc[word] = seen ? acc[word] + 1 : 1;
    return acc;
  }, {});
};
// Reports that the result socket has been closed
client.on('end', () => {
  console.log(`Worker pid:${process.pid} disconnected`);
});
// Errors on the result socket are ignored on purpose, it seems: the listener
// only prevents an unhandled 'error' event from crashing the worker.
// NOTE(review): presumably this covers workers that start after the reduce
// server has already closed; confirm before adding logging here.
client.on('error', () => {});
/**
 * Counts the words of a file and sends the serialized result back to the main
 * server; once the result is written, the next file transfer is opened.
 * @param {string} text - the contents of the file
 */
function emit(text) {
  var payload = JSON.stringify(doProcess(text));
  client.write(payload, openFileTransfer);
}
/**
 * Spawns a new file transfer socket that will handle a single incoming file from the server.
 * When the transfer ends the file is processed and its result is sent back; an empty
 * transfer is answered with an empty result so the server is never left waiting.
 */
function openFileTransfer() {
  var ft = new net.Socket();
  // Text file chunks that are buffered into the socket will be piled up here
  var packets = [];
  // Establishes the connection to the main server
  ft.connect(FILE_TRANSFER_PORT, HOST);
  // Listens for incoming data, mostly chunks of the text file to be processed
  ft.on('data', function (chunk) {
    // saves the sent chunk to memory
    packets.push(chunk);
  });
  // A failed transfer (e.g. the server no longer accepts transfer
  // connections) means this worker is done: close the result socket
  ft.on('error', function () {
    client.end();
  });
  // Listens to the end of transfer event, triggered when the file transfer completes
  ft.on('end', function () {
    if (!packets.length) {
      // Empty file: the server still counts this transfer as pending, so
      // answer with an empty word count and move on to the next file
      client.write('{}', openFileTransfer);
      return;
    }
    // Chunks are joined before decoding so multi-byte characters split
    // across two chunks are decoded correctly
    var text = Buffer.concat(packets).toString('utf8');
    packets = [];
    emit(text);
  });
}
};
@eloytoro
Copy link
Author

eloytoro commented Dec 3, 2015

Readme

requires node 4.x or later
How to run it
node word-counter.js path/to/directory
the script will perform a deep search in the given directory (defaults to ./) for files and will proceed to count the occurrences of each word written across all files

node word-counter.js ~/.nvm/versions/node/v4.1.0/share/man/
Worker pid:3329 connected
Worker pid:3329 disconnected
Worker pid:3324 connected
Worker pid:3324 disconnected
the word default has 350 occurrences
the word type has 345 occurrences
the word bool has 282 occurrences
the word false has 174 occurrences
the word trace has 130 occurrences
the word true has 108 occurrences
the word in has 91 occurrences
the word the has 86 occurrences
the word of has 86 occurrences
the word code has 82 occurrences
process completed in 238ms

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment