Skip to content

Instantly share code, notes, and snippets.

@titomus
Created September 13, 2022 06:05
Show Gist options
  • Save titomus/011453c68c8da6df754ae2393fbe4998 to your computer and use it in GitHub Desktop.
Save titomus/011453c68c8da6df754ae2393fbe4998 to your computer and use it in GitHub Desktop.
Ngram creation with probabilities
/**
 * Builds a transition table between consecutive chunks of `n` tokens and
 * attaches to every follower its relative frequency.
 *
 * The token list is cut into non-overlapping chunks of `n` tokens (joined with
 * `sep`) and each chunk is paired with the chunk that comes right after it.
 * This is repeated `epoch` times; every pass after the first starts one token
 * later than the previous one, and the pairs of all passes are counted
 * together.
 *
 * Handling of the literal marker tokens 'START' and 'END':
 *  - a chunk whose first token is 'START' yields ['START', chunk] and
 *    [chunk, next chunk], where `chunk` has its markers removed and the next
 *    chunk is kept as-is;
 *  - a chunk followed by a chunk whose first token is 'START' yields
 *    [chunk, 'END'], chunk kept as-is;
 *  - any other pair has the marker text removed from both chunks.
 * NOTE: marker removal is substring based, so "START"/"END" occurring inside
 * a longer token is removed as well.
 *
 * Side effect: logs one progress line per pass with console.log.
 *
 * @param {string[]} tokens - Input tokens. The array is not modified.
 * @param {number} [n=2] - Number of tokens per chunk; must be a positive integer.
 * @param {string} [sep=' '] - Separator used to join the tokens of a chunk.
 * @param {number} [epoch=3] - Number of passes over the tokens.
 * @returns {Object<string, Array<[string, number]>>} For every leading chunk,
 *   the list of [following chunk, probability] ordered by decreasing count.
 * @throws {RangeError} If `n` is not a positive integer (the chunking loop
 *   could never advance otherwise).
 */
function n_grams(tokens, n = 2, sep = ' ', epoch = 3) {
  if (!Number.isInteger(n) || n < 1) {
    throw new RangeError(`n must be a positive integer, got ${n}`);
  }

  // Removes the markers together with one whitespace character on each side.
  // No trim() on purpose: tokens may be single characters.
  const stripMarkers = (text) => text.replace(/\s?(START|END)\s?/gm, '');
  // Removes the markers and only the whitespace character that follows them.
  const stripLeadingMarkers = (text) => text.replace(/(START|END)\s?/g, '');

  const pairs = [];
  for (let pass = 0; pass < epoch; pass++) {
    // Work on a shifted copy so the caller's array is left untouched.
    const shifted = tokens.slice(pass);
    for (let i = 0; i < shifted.length - n; i += n) {
      const head = shifted.slice(i, i + n).join(sep);
      const tail = shifted.slice(i + n, i + n + n).join(sep);
      if (shifted[i] === 'START') {
        const opener = stripLeadingMarkers(head);
        pairs.push(['START', opener]);
        pairs.push([opener, tail]);
      } else if (shifted[i + n] === 'START') {
        // The next chunk opens a new sequence, so this chunk ends one.
        pairs.push([head, 'END']);
      } else {
        pairs.push([stripMarkers(head), stripMarkers(tail)]);
      }
    }
    console.log(`================================= Epoch ${pass} : ${pairs.length} ngrams`);
  }

  // Count how many times each [head, tail] pair occurs.
  const counted = new Map();
  for (const [head, tail] of pairs) {
    const key = JSON.stringify([head, tail]);
    const entry = counted.get(key);
    if (entry) {
      entry[2] += 1;
    } else {
      counted.set(key, [head, tail, 1]);
    }
  }
  const ranked = [...counted.values()].sort((a, b) => b[2] - a[2]);

  // Group followers by leading chunk. A Map is used so that chunks named like
  // Object.prototype members ("constructor", "__proto__", ...) are safe keys.
  const followersByHead = new Map();
  for (const [head, tail, count] of ranked) {
    if (!followersByHead.has(head)) {
      followersByHead.set(head, []);
    }
    followersByHead.get(head).push([tail, count]);
  }

  // Turn the counts of each group into probabilities.
  const groups = {};
  for (const [head, followers] of followersByHead) {
    const total = followers.reduce((sum, follower) => sum + follower[1], 0);
    // defineProperty creates an own property even for the "__proto__" key.
    Object.defineProperty(groups, head, {
      value: followers.map(([tail, count]) => [tail, count / total]),
      writable: true,
      enumerable: true,
      configurable: true,
    });
  }
  return groups;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment