Skip to content

Instantly share code, notes, and snippets.

@judsonbsilva
Created August 13, 2017 16:05
Show Gist options
  • Save judsonbsilva/d3824e4da0bacebe4a5c01bffb049498 to your computer and use it in GitHub Desktop.
Save judsonbsilva/d3824e4da0bacebe4a5c01bffb049498 to your computer and use it in GitHub Desktop.
Testing methods for data mining in large texts
// Sample input for the demo at the bottom of the file: a Portuguese-language
// legal notice excerpt whose lines are separated by "\n". The first line
// ("4.1. EDITAL ...") and the last line ("5. FERMOJUPI/SECOF") both start with
// a dotted number followed by an uppercase word.
const text = "4.1. EDITAL - CORREGEDORIA GERAL DA JUSTIÇA\nAVISO DE INTIMAÇÃO (Corregedoria Geral da Justiça de TERESINA)\nProcesso nº 0001107-87.2014.8.18.0139\nClasse: Reclamação Disciplinar\nRequerente: IGREJA ASSEMBLEIA DE DEUS DO DIRCEU DOIS\nAdvogado(s):\nRequerido: JOSÉ RAMOS DIAS FILHO, JUIZ DE DIREITO DA 2ª VARA CÍVEL DA COMARCA DE TERESINA-PI\nAdvogado(s): MOISÉS ÂNGELO DE MOURA REIS (OAB/PI Nº 874/75)\nDECISÃO: Em face do exposto,determino o arquivamento do presente expediente. Comunique-se à Corregedoria Nacional de Justiça, no prazo\nde quinze dias,sobre a presente decisão. Intimações necessárias. Teresina-PI, 19 de junho de 2017. Des. Brandão de Carvalho, Decano.\n5. FERMOJUPI/SECOF";
/**
 * Static helpers for pulling delimited blocks of lines, and windows of words
 * around matching terms, out of a large text.
 */
class NLPJS {
  /**
   * Reusable patterns.
   * `numericTitle` matches a line that starts with one or more "<digits>."
   * groups, then one whitespace character, then uppercase ASCII letters
   * (for example "4.1. EDITAL").
   */
  static regex = {
    numericTitle: /^(\d+\.)+\s[A-Z]+/
  };

  /**
   * Collapses every run of two or more horizontal whitespace characters
   * into a single space.
   *
   * Newlines are deliberately left alone: getBlock() splits the cleaned text
   * on line breaks, so swallowing them here would merge lines.
   *
   * @param {string} text - Text to normalise.
   * @returns {string} The text with whitespace runs collapsed.
   */
  static _cleanText(text) {
    // The `g` flag is required; without it only the first run is replaced.
    return text.replace(/[^\S\n]{2,}/g, ' ');
  }

  /**
   * Joins lines[fromLine..toLine] (both inclusive) into one string, putting a
   * single space between consecutive lines so that the last word of a line
   * is never fused with the first word of the next one.
   *
   * @param {string[]} lines - All lines of the text.
   * @param {number} fromLine - Index of the first line to include.
   * @param {number} toLine - Index of the last line to include.
   * @returns {string} The joined lines; indices outside the array are ignored.
   */
  static _concatLines(lines, fromLine, toLine) {
    const first = Math.max(0, fromLine);
    if (toLine < first) {
      return '';
    }
    return lines.slice(first, toLine + 1).join(' ');
  }

  /**
   * Tests `candidate` against `pattern`, starting from the beginning of the
   * string even when the pattern carries the `g` or `y` flag.
   *
   * @param {RegExp} pattern - Pattern to test with.
   * @param {string} candidate - String to test.
   * @returns {boolean} Whether the pattern matches.
   */
  static _matches(pattern, candidate) {
    // Global/sticky regexes remember lastIndex between test() calls, which
    // would make consecutive checks skip matches.
    pattern.lastIndex = 0;
    return pattern.test(candidate);
  }

  /**
   * Scans the text line by line and reports every block that starts at a
   * line matching blockLimits[0] and ends at the next line matching
   * blockLimits[1]. Both delimiter lines are part of the block.
   *
   * A line that closes a block may also open the next one (this is what
   * happens when the same pattern is used for both limits). Start matches
   * found while a block is already open are ignored, and a block that is
   * never closed is not reported.
   *
   * @param {string} text - Text to scan.
   * @param {RegExp[]} blockLimits - `[startPattern, endPattern]`.
   * @param {function(string): void} [callback] - Called once per block with
   *   the block's lines joined by single spaces.
   * @returns {void}
   */
  static getBlock(text, blockLimits, callback) {
    const [initBlock, finishBlock] = blockLimits;
    const lines = NLPJS._cleanText(text).split(/\r?\n/);
    let openIndex = -1; // index of the line that opened the current block, -1 when none

    lines.forEach((line, indexLine) => {
      if (openIndex !== -1 && NLPJS._matches(finishBlock, line)) {
        const block = NLPJS._concatLines(lines, openIndex, indexLine);
        openIndex = -1;
        if (callback) {
          callback(block);
        }
      }
      // Checked after the end test so a closing line can also open a block.
      if (openIndex === -1 && NLPJS._matches(initBlock, line)) {
        openIndex = indexLine;
      }
    });
  }

  /**
   * Splits the text into whitespace-separated words and, for every word that
   * matches a term pattern, reports the window of words around it.
   *
   * Each entry of `termList` is `[pattern, before, after, termCallback]`:
   * the window spans `before` words to the left and `after` words to the
   * right of the matching word, clipped to the bounds of the text.
   * `before` and `after` default to 0 and `termCallback` is optional.
   *
   * @param {string} text - Text to search.
   * @param {Array|Object} termList - Term entries as described above.
   * @param {function(string[]): void} [callback] - Called with every window,
   *   after that entry's own `termCallback`.
   * @returns {void}
   */
  static getTerms(text, termList, callback) {
    const words = NLPJS._cleanText(text)
      .split(/\s+/)
      .filter((word) => word !== '');
    const lastWordIndex = words.length - 1;
    // Object.values keeps supporting both arrays and keyed objects without
    // walking inherited enumerable properties the way for...in does.
    const termGetters = Object.values(termList ?? {});

    words.forEach((word, indexWord) => {
      for (const [regexTerm, before = 0, after = 0, callbackTerm] of termGetters) {
        if (!NLPJS._matches(regexTerm, word)) {
          continue;
        }
        const from = Math.max(0, indexWord - before);
        // Clamp to the last valid index so no `undefined` entry is reported.
        const to = Math.min(lastWordIndex, indexWord + after);
        const matchs = to >= from ? words.slice(from, to + 1) : [];

        if (callbackTerm) {
          callbackTerm(matchs);
        }
        if (callback) {
          callback(matchs);
        }
      }
    });
  }
}
// Demo run: split the sample text into blocks delimited by numeric-title
// lines, then, inside each block, log the words surrounding every word that
// matches /oab/i (window of 5 words back and 4 words forward).
NLPJS.getBlock(
  text,
  [NLPJS.regex.numericTitle, NLPJS.regex.numericTitle],
  (block) => {
    NLPJS.getTerms(block, [[/oab/i, 5, 4]], (windowWords) => {
      console.log('Geral', windowWords);
    });
  },
);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment