Created
August 13, 2017 16:05
-
-
Save judsonbsilva/d3824e4da0bacebe4a5c01bffb049498 to your computer and use it in GitHub Desktop.
Testing methods for data mining in large texts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Sample document used as input for the NLPJS.getBlock call at the bottom of
// this file. Lines are separated by "\n"; the first and last lines are
// numbered headings ("4.1. ..." and "5. ...").
const text = "4.1. EDITAL - CORREGEDORIA GERAL DA JUSTIÇA\nAVISO DE INTIMAÇÃO (Corregedoria Geral da Justiça de TERESINA)\nProcesso nº 0001107-87.2014.8.18.0139\nClasse: Reclamação Disciplinar\nRequerente: IGREJA ASSEMBLEIA DE DEUS DO DIRCEU DOIS\nAdvogado(s):\nRequerido: JOSÉ RAMOS DIAS FILHO, JUIZ DE DIREITO DA 2ª VARA CÍVEL DA COMARCA DE TERESINA-PI\nAdvogado(s): MOISÉS ÂNGELO DE MOURA REIS (OAB/PI Nº 874/75)\nDECISÃO: Em face do exposto,determino o arquivamento do presente expediente. Comunique-se à Corregedoria Nacional de Justiça, no prazo\nde quinze dias,sobre a presente decisão. Intimações necessárias. Teresina-PI, 19 de junho de 2017. Des. Brandão de Carvalho, Decano.\n5. FERMOJUPI/SECOF";
/**
 * Static helpers for extracting blocks of lines and keyword-centred word
 * windows from plain text.
 */
class NLPJS {
  /** Reusable patterns. */
  static regex = {
    // A line starting with a dotted number followed by whitespace and
    // upper-case ASCII letters, e.g. "4.1. EDITAL" or "5. FERMOJUPI".
    numericTitle: /^(\d+\.)+\s[A-Z]+/,
  };

  /**
   * Collapses every run of two or more horizontal whitespace characters into
   * a single space. Newlines are left untouched so that line-based splitting
   * still sees the original line structure.
   *
   * @param {string} text - Raw text.
   * @returns {string} Text with horizontal whitespace runs collapsed.
   */
  static _cleanText(text) {
    // `[^\S\n]` = any whitespace except "\n"; the `g` flag handles all runs,
    // not just the first one.
    return text.replace(/[^\S\n]{2,}/g, ' ');
  }

  /**
   * Tests `pattern` against `value` from the start of the string.
   *
   * @param {RegExp} pattern - Pattern to test.
   * @param {string} value - String to test against.
   * @returns {boolean} Whether the pattern matches.
   */
  static _matches(pattern, value) {
    // Regexes with the `g`/`y` flag remember `lastIndex` between `.test`
    // calls; reset it so repeated use on different strings is reliable.
    pattern.lastIndex = 0;
    return pattern.test(value);
  }

  /**
   * Joins `lines[fromLine..toLine]` (both inclusive) back into one string,
   * separated by newlines. Indices outside the array are ignored.
   *
   * @param {string[]} lines - All lines.
   * @param {number} fromLine - Index of the first line to include.
   * @param {number} toLine - Index of the last line to include.
   * @returns {string} The joined lines.
   */
  static _concatLines(lines, fromLine, toLine) {
    return lines.slice(Math.max(fromLine, 0), toLine + 1).join('\n');
  }

  /**
   * Finds blocks of consecutive lines delimited by a start and an end pattern.
   *
   * Each line is tested against the end pattern first (only while there are
   * more recorded starts than ends) and then against the start pattern, so a
   * single line may close one block and open the next. The i-th recorded start
   * is paired with the i-th recorded end; both delimiter lines are part of the
   * block. A start without a corresponding end produces no block.
   *
   * @param {string} text - Text to scan.
   * @param {[RegExp, RegExp]} blockLimits - `[startPattern, endPattern]`.
   * @param {(block: string) => void} [callback] - Called once per block.
   * @returns {string[]} All blocks found, in order.
   */
  static getBlock(text, blockLimits, callback) {
    const lines = NLPJS._cleanText(text).split('\n');
    const [initBlock, finishBlock] = blockLimits;
    const begins = [];
    const ends = [];

    lines.forEach((line, indexLine) => {
      if (begins.length > ends.length && NLPJS._matches(finishBlock, line)) {
        ends.push(indexLine);
      }
      if (NLPJS._matches(initBlock, line)) {
        begins.push(indexLine);
      }
    });

    const blocks = [];
    ends.forEach((finishIndex, i) => {
      const block = NLPJS._concatLines(lines, begins[i], finishIndex);
      blocks.push(block);
      if (callback) {
        callback(block);
      }
    });
    return blocks;
  }

  /**
   * Finds words matching any of the given terms and collects a window of
   * surrounding words for each hit.
   *
   * Words are obtained by splitting on any whitespace. For every word, the
   * terms are tried in order; each match yields the words from `before`
   * positions earlier up to `after` positions later (clamped to the text
   * boundaries), including the matching word itself.
   *
   * @param {string} text - Text to scan.
   * @param {Array<[RegExp, number?, number?, Function?]>} termList - Entries of
   *   `[pattern, before, after, termCallback]`; `before`/`after` default to 0.
   * @param {(words: string[]) => void} [callback] - Called for every hit,
   *   after the term's own callback.
   * @returns {string[][]} All word windows, in the order they were found.
   */
  static getTerms(text, termList, callback) {
    const words = NLPJS._cleanText(text).split(/\s+/).filter(Boolean);
    const lastWordIndex = words.length - 1;
    // Object.values keeps array order and also accepts a plain object of
    // terms, which the previous key-based iteration allowed.
    const terms = Object.values(termList);
    const found = [];

    words.forEach((word, indexWord) => {
      for (const [regexTerm, before = 0, after = 0, callbackTerm] of terms) {
        if (!NLPJS._matches(regexTerm, word)) {
          continue;
        }
        const fromIndex = Math.max(indexWord - before, 0);
        // Clamp to the last valid index so no `undefined` entry is emitted.
        const toIndex = Math.min(indexWord + after, lastWordIndex);
        const matchs = words.slice(fromIndex, toIndex + 1);

        if (callbackTerm) {
          callbackTerm(matchs);
        }
        if (callback) {
          callback(matchs);
        }
        found.push(matchs);
      }
    });
    return found;
  }
}
// Demo: extract every block of `text` delimited by numbered headings, then,
// inside each block, log the words around each word matching /oab/i
// (up to 5 words before and 4 words after it).
NLPJS.getBlock(text, [NLPJS.regex.numericTitle, NLPJS.regex.numericTitle], (block) => {
  NLPJS.getTerms(block, [[/oab/i, 5, 4]], (wordWindow) => {
    console.log('Geral', wordWindow);
  });
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment