Skip to content

Instantly share code, notes, and snippets.

@mikehenrty
Created November 14, 2017 10:23
Show Gist options
  • Save mikehenrty/802dadf973e42684df12e054e9769417 to your computer and use it in GitHub Desktop.
Save mikehenrty/802dadf973e42684df12e054e9769417 to your computer and use it in GitHub Desktop.
This script parses a Common Voice data directory and prints statistics about the data it finds.
const fs = require('fs');
const path = require('path');
const crypto = require('crypto');
const mp3Duration = require('mp3-duration');
// Folder that displayVoteMetrics renames selected clip files into.
const OUTPUT_FOLDER = path.resolve('./output/batch');
// Root of the data directory scanned by displayMetrics.
const DOWNLOAD_FOLDER = path.resolve('./sample');
// Key passed to crypto.createHmac by hash().
const DEFAULT_SALT = '8hd3e8sddFSdfj';
// Maximum number of files each processAll* function starts per batch.
const CONCURRENCY = 50;
// Captured at load time; displayMetrics uses it to report the elapsed time.
const startTime = Date.now();
/**
 * Compute the hex-encoded HMAC-SHA256 of a string, keyed with DEFAULT_SALT.
 *
 * @param {string} str - Text to digest.
 * @returns {string} Hexadecimal digest.
 */
function hash(str) {
  const hmac = crypto.createHmac('sha256', DEFAULT_SALT);
  hmac.update(str);
  return hmac.digest('hex');
}
/**
 * Wrap a node-style callback api so that it returns a promise instead.
 *
 * The wrapper appends its own `(err, result)` callback to the arguments it
 * receives and invokes `func` with `ctx` as `this`. The promise rejects when
 * the callback gets a truthy `err` and resolves with `result` otherwise.
 *
 * @param {*} ctx - Value used as `this` when calling `func`.
 * @param {Function} func - Function whose last argument is a callback.
 * @returns {Function} Function with the same leading arguments, returning a promise.
 */
function promisify(ctx, func) {
  return (...args) =>
    new Promise((resolve, reject) => {
      const onDone = (err, result) => {
        if (err) {
          reject(err);
        } else {
          resolve(result);
        }
      };
      func.call(ctx, ...args, onDone);
    });
}
/**
* Promise versions of the node fs functions used in this file, plus a
* promise version of the mp3-duration callback api.
*/
const readdirPromise = promisify(fs, fs.readdir);
const readFilePromise = promisify(fs, fs.readFile);
const renamePromise = promisify(fs, fs.rename);
// mp3Duration is a standalone function; the context given here is simply
// forwarded by promisify as the `this` value of the call.
const durationPromise = promisify(this, mp3Duration);
// Running totals collected while scanning the data directory.
const tracker = {
// User folders visited.
users: 0,
// Files with the mp3 extension.
mp3s: 0,
// Files with the txt extension.
txts: 0,
// Files with the vote extension.
votes: 0,
// Files with the json extension.
demos: 0,
// Number of distinct keys in `sentences`, added at the end of displayMetrics.
sentences: 0,
// Sum of the durations added by processClip.
seconds: 0.0,
// Entries that were skipped as unexpected (dotted top-level entries,
// dot-less sub-entries, unknown extensions).
weirdos: 0,
};
// Per-user counters, keyed by the name of the user's folder.
// Entries are created in displayMetrics; votes are tallied in processVote.
const users = {
// "userid": {
// sentences: number of txt files found in the user's folder
// upvotes: number of 'true' vote files found in the user's folder
// downvotes: number of 'false' vote files found in the user's folder
// }
};
// Key value store for sentences, filled in by processText.
const sentences = {
// "sentence_hash": {
// text: sentence body as read from the file,
// count: number of files found with this sentence,
// path: path of the first file found with this sentence,
// }
};
// Per-clip records, keyed by "<user folder>_<file name up to its first dot>"
// (for vote files, only the part before '-by-' is used).
// Entries are created in displayMetrics.
const clips = {
// "clipKey": {
// "text": initialised to '' (not filled in by the code in this file)
// "src": path of the clip's mp3 file, '' if none was found
// "txt": path of the clip's txt file, '' if none was found
// "user": name of the folder the clip was found in
// "upvotes": yes votes
// "downvotes": down votes
// }
};
/**
 * Read one sentence file and record it in the `sentences` store.
 *
 * The file content is hashed to obtain the store key. The first file seen for
 * a given key creates the record (keeping that file's path); every file,
 * including the first, bumps the record's `count`.
 *
 * @param {string} path - Path of the text file to read.
 * @returns {Promise<void>}
 */
async function processText(path) {
  const body = await readFilePromise(path, 'utf8');
  const digest = hash(body);

  let record = sentences[digest];
  if (!record) {
    record = { text: body, count: 0, path };
    sentences[digest] = record;
  }
  record.count++;
}
/**
 * Run processText over a list of files, at most CONCURRENCY files at a time.
 *
 * The list is consumed in consecutive batches; a batch is started only after
 * every promise of the previous batch has settled successfully. Falsy entries
 * are skipped. A rejection from any processText call rejects the returned
 * promise.
 *
 * @param {string[]} files - Paths of the text files to process.
 * @returns {Promise<void>}
 */
async function processAllText(files) {
  // The batch offset is a real local: the previous `let = i = 0` assigned two
  // implicit globals and is a syntax error in strict mode.
  for (let start = 0; start < files.length; start += CONCURRENCY) {
    const batch = [];
    for (const filePath of files.slice(start, start + CONCURRENCY)) {
      if (filePath) {
        batch.push(processText(filePath));
      }
    }
    await Promise.all(batch);
  }
}
/**
 * Measure one mp3 file and add its duration to `tracker.seconds`.
 *
 * @param {string} path - Path of the mp3 file.
 * @returns {Promise<void>}
 */
async function processClip(path) {
  const clipLength = await durationPromise(path);
  // Read the running total only after the await, so clips measured
  // concurrently do not overwrite each other's contribution.
  tracker.seconds = tracker.seconds + clipLength;
}
/**
 * Run processClip over a list of files, at most CONCURRENCY files at a time.
 *
 * The list is consumed in consecutive batches; a batch is started only after
 * every promise of the previous batch has settled successfully. Falsy entries
 * are skipped. A rejection from any processClip call rejects the returned
 * promise.
 *
 * @param {string[]} files - Paths of the mp3 files to process.
 * @returns {Promise<void>}
 */
async function processAllMP3s(files) {
  // The batch offset is a real local: the previous `let = i = 0` assigned two
  // implicit globals and is a syntax error in strict mode.
  for (let start = 0; start < files.length; start += CONCURRENCY) {
    const batch = [];
    for (const filePath of files.slice(start, start + CONCURRENCY)) {
      if (filePath) {
        batch.push(processClip(filePath));
      }
    }
    await Promise.all(batch);
  }
}
/**
 * Read one vote file and tally it on both the user and the clip record.
 *
 * A file containing exactly 'true' counts as an upvote, exactly 'false' as a
 * downvote; any other content is reported on stderr and tallies nothing.
 *
 * @param {string} path - Path of the vote file.
 * @param {{upvotes: number, downvotes: number}} user - Record to tally on.
 * @param {{upvotes: number, downvotes: number}} clip - Record to tally on.
 * @returns {Promise<void>}
 */
async function processVote(path, user, clip) {
  const content = await readFilePromise(path, 'utf8');
  switch (content) {
    case 'true':
      user.upvotes++;
      clip.upvotes++;
      break;
    case 'false':
      user.downvotes++;
      clip.downvotes++;
      break;
    default:
      console.error('unrecognized vote data', content);
  }
}
/**
 * Run processVote over a list of vote descriptors, at most CONCURRENCY at a
 * time.
 *
 * The list is consumed in consecutive batches; a batch is started only after
 * every promise of the previous batch has settled successfully. Falsy entries
 * are skipped. A rejection from any processVote call rejects the returned
 * promise.
 *
 * An entry without a `clip` is logged and stops all further processing. The
 * votes already started in the current batch are awaited before returning,
 * so no promise is left running unobserved.
 *
 * @param {Array<{path: string, user: Object, clip: Object}>} files - Vote
 *     descriptors as built by displayMetrics.
 * @returns {Promise<void>}
 */
async function processAllVotes(files) {
  // The batch offset is a real local: the previous `let = i = 0` assigned two
  // implicit globals and is a syntax error in strict mode.
  for (let start = 0; start < files.length; start += CONCURRENCY) {
    const batch = [];
    for (const data of files.slice(start, start + CONCURRENCY)) {
      if (!data) {
        continue;
      }
      if (!data.clip) {
        console.log('wtf', data);
        // Abort, but let the votes already in flight finish first.
        await Promise.all(batch);
        return;
      }
      batch.push(processVote(data.path, data.user, data.clip));
    }
    await Promise.all(batch);
  }
}
/**
* Unimplemented placeholder: the body is empty, so the returned promise
* resolves to undefined without touching `file` or `folder`.
* NOTE(review): nothing in the visible code calls this function.
*/
async function move(file, folder) {
}
/**
 * Classify every clip by its votes, print the totals, and move contested but
 * mostly accepted clips into OUTPUT_FOLDER.
 *
 * Categories, checked in this order:
 *   - none:       no votes at all
 *   - unverified: fewer than three votes
 *   - bothUp:     more than one upvote and more than one downvote, with more
 *                 than a third of the votes being upvotes; the clip's mp3 and
 *                 txt files (when recorded) are renamed into OUTPUT_FOLDER
 *   - bothDown:   more than one vote of each kind, a third or less upvotes
 *   - verified:   more than one upvote
 *   - bad:        more than one downvote
 *
 * @param {Object} clips - Clip records keyed by clip key.
 * @returns {Promise<void>}
 */
async function displayVoteMetrics(clips) {
  const clipTracker = {
    none: 0,
    unverified: 0,
    verified: 0,
    bad: 0,
    bothUp: 0,
    bothDown: 0,
  };

  for (const key of Object.keys(clips)) {
    const clip = clips[key];
    const total = clip.upvotes + clip.downvotes;

    if (total === 0) {
      clipTracker.none++;
      continue;
    }
    if (total < 3) {
      clipTracker.unverified++;
      continue;
    }

    const manyUp = clip.upvotes > 1;
    const manyDown = clip.downvotes > 1;

    if (manyUp && manyDown) {
      const mostlyAccepted = clip.upvotes / total > 1 / 3;
      if (!mostlyAccepted) {
        clipTracker.bothDown++;
        continue;
      }
      clipTracker.bothUp++;
      // Both renames are started before either one is awaited.
      const moves = [
        clip.src && renamePromise(clip.src, `${OUTPUT_FOLDER}/${key}.mp3`),
        clip.txt && renamePromise(clip.txt, `${OUTPUT_FOLDER}/${key}.txt`),
      ];
      await Promise.all(moves);
    } else if (manyUp) {
      clipTracker.verified++;
    } else if (manyDown) {
      clipTracker.bad++;
    } else {
      console.error('wtf?', clip);
    }
  }

  console.log('tracker', clipTracker);
}
/**
 * Scan DOWNLOAD_FOLDER and print statistics about what it contains.
 *
 * Every entry of DOWNLOAD_FOLDER without a dot in its name is treated as a
 * user folder. Each file inside it is classified by the text that follows the
 * first dot of its name:
 *   - txt:  counted, and recorded as the `txt` path of its clip
 *   - mp3:  counted, and recorded as the `src` path of its clip
 *   - vote: counted, and queued for processAllVotes
 *   - json: counted as a demo
 *   - anything else is reported and counted as a weirdo
 * Totals are accumulated in the module-level `tracker`, `users` and `clips`.
 * After the scan the votes are processed, the tracker and the elapsed time
 * are printed, and displayVoteMetrics is run.
 *
 * Any error, including one raised by displayVoteMetrics, is logged as
 * 'top level error' and not rethrown.
 *
 * @returns {Promise<void>}
 */
async function displayMetrics() {
  const textFiles = [];
  const mp3Files = [];
  const voteFiles = [];

  try {
    const folders = await readdirPromise(DOWNLOAD_FOLDER);
    for (const folder of folders) {
      // Only use folders: a dot in the name is taken to mean a stray file.
      if (folder.includes('.')) {
        ++tracker.weirdos;
        continue;
      }

      if (!users[folder]) {
        users[folder] = {
          sentences: 0,
          upvotes: 0,
          downvotes: 0,
        };
      }
      const user = users[folder];
      ++tracker.users;

      const folderPath = path.join(DOWNLOAD_FOLDER, folder);
      const subfiles = await readdirPromise(folderPath);
      for (const file of subfiles) {
        const dotIndex = file.indexOf('.');
        if (dotIndex === -1) {
          console.log('found unexpected subfolder', folder, file);
          ++tracker.weirdos;
          continue;
        }

        const filePath = path.join(folderPath, file);
        // Everything after the first dot, e.g. 'mp3' or 'vote'.
        const ext = file.slice(dotIndex + 1);

        // Only these three file kinds belong to a clip record.
        let clip;
        if (ext === 'vote' || ext === 'txt' || ext === 'mp3') {
          let clipKey = `${folder}_${file.slice(0, dotIndex)}`;
          if (ext === 'vote') {
            // Vote files are named '<clip>-by-<something>'; keep the clip part.
            clipKey = clipKey.split('-by-')[0];
          }
          if (!clips[clipKey]) {
            clips[clipKey] = {
              text: '',
              src: '',
              txt: '',
              user: folder,
              upvotes: 0,
              downvotes: 0,
            };
          }
          clip = clips[clipKey];
        }

        switch (ext) {
          case 'txt':
            textFiles.push(filePath);
            clip.txt = filePath;
            ++tracker.txts;
            ++user.sentences;
            break;
          case 'mp3':
            mp3Files.push(filePath);
            clip.src = filePath;
            ++tracker.mp3s;
            break;
          case 'vote':
            voteFiles.push({
              path: filePath,
              user: user,
              clip: clip,
            });
            ++tracker.votes;
            break;
          case 'json':
            ++tracker.demos;
            break;
          default:
            console.error('unrecognized file', file, ext);
            ++tracker.weirdos;
            break;
        }
      }
    }

    // The sentence-hashing and mp3-duration passes are switched off; only
    // the vote pass runs. Re-enable them by uncommenting the calls below.
    console.log('found files', textFiles.length);
    //await processAllText(textFiles);
    console.log('found mp3s', mp3Files.length);
    //await processAllMP3s(mp3Files);
    console.log('found votes', voteFiles.length);
    await processAllVotes(voteFiles);

    tracker.sentences += Object.keys(sentences).length;
    console.log(tracker);

    const minutes = (Date.now() - startTime) / 60000;
    console.log(`\ncomplete in ${minutes.toFixed(2)} minutes\n`);

    // Awaited so that a failure here is caught by the catch below instead of
    // becoming an unhandled rejection.
    await displayVoteMetrics(clips);
  } catch (err) {
    console.error('top level error', err);
  }
}
// Script entry point: run the scan as soon as the file is loaded.
// displayMetrics logs the errors raised inside its own try block, so this
// handler only sees whatever escapes that block.
displayMetrics().catch(err => {
console.error('unhandled exception', err);
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment