Skip to content

Instantly share code, notes, and snippets.

@luizamboni
Last active September 9, 2017 04:21
Show Gist options
  • Save luizamboni/6f8a1dfe3221934398892d067c453d4c to your computer and use it in GitHub Desktop.
Save luizamboni/6f8a1dfe3221934398892d067c453d4c to your computer and use it in GitHub Desktop.
Transform a phrase into a Lucene query using Portuguese semantics
/**
 * Word lists used to classify the tokens of a Portuguese product-search phrase.
 *
 * Entries of `metrics` are regular-expression fragments, not plain words: the
 * list is joined with "|" and compiled with `new RegExp`, so any regex escape
 * inside an entry needs its backslash doubled in the string literal.
 */
const dictionary = {
  substantives: [
    "berço",
    "notebook",
    "smartphone",
    "tela"
  ],
  attributes: [
    "preto",
    "preta",
    "amarelo",
    "redondo",
    "redonda"
  ],
  brands: [
    "hp",
    "samsung",
    "lenovo",
    "lg",
    "quantum"
  ],
  prepositions: [
    "de", "da", "do", "para", "com"
  ],
  metrics: [
    '"',
    "cm",
    // Digit, optional whitespace, then "m" (e.g. "5m", "2 m"). With single
    // backslashes the string literal collapses to "ds?m", which matches the
    // letters "dm" instead of a number followed by the unit.
    "\\d\\s?m",
    "metro",
    "metros",
    "polegadas"
  ]
}
// One alternation regex per word list: the entries are joined with "|" and
// compiled without anchors, so each regex matches when any entry occurs
// anywhere inside the tested string.
const [prepReg, metricsReg, brandsReg] = [
  dictionary.prepositions,
  dictionary.metrics,
  dictionary.brands
].map(words => new RegExp(words.join("|")))
// Queries already built by lucenize, keyed by the raw input phrase.
const cache = {}
/**
 * Checks whether `subterm` is exactly one of the dictionary prepositions.
 *
 * @param {string} subterm - term to look up
 * @returns {number} 1 when the term is listed, 0 otherwise
 */
function isPreposition(subterm) {
  const listed = dictionary.prepositions.includes(subterm)
  return Number(listed)
}
/**
 * Checks whether `subterm` counts as a predicate.
 *
 * A term is a predicate when it contains a prepositional phrase (a dictionary
 * preposition, one whitespace character, then the start of another term), or
 * when isMetric, isBrand or isAttribute accepts it.
 *
 * @param {string} subterm - term (possibly several words) to classify
 * @returns {number} 1 when the term is a predicate, 0 otherwise
 */
function isPredicate(subterm) {
  const { prepositions } = dictionary
  // The preposition has to start the term or follow whitespace; without that
  // guard the tail of an ordinary word matches ("grande mesa" contains "de m").
  // \S instead of \w so that a following word that begins with an accented
  // letter ("de água") is still recognised.
  const phraseReg = new RegExp(`(?:^|\\s)(?:${prepositions.join("|")})\\s\\S`)
  if (phraseReg.test(subterm)) {
    return 1
  }
  return (isMetric(subterm) || isBrand(subterm) || isAttribute(subterm)) ? 1 : 0
}
/**
 * Checks whether `subterm` is exactly one of the dictionary brands.
 *
 * @param {string} subterm - term to look up
 * @returns {number} 1 when the term is listed, 0 otherwise
 */
function isBrand(subterm) {
  if (dictionary.brands.includes(subterm)) {
    return 1
  }
  return 0
}
/**
 * Checks whether `subterm` is exactly one of the dictionary substantives.
 *
 * @param {string} subterm - term to look up
 * @returns {number} 1 when the term is listed, 0 otherwise
 */
function isSubstantive(subterm) {
  const known = dictionary.substantives
  return known.includes(subterm) ? 1 : 0
}
/**
 * Checks whether `subterm` matches the metrics regex.
 *
 * This is a regex search with metricsReg rather than a list lookup, so the
 * term is accepted as soon as the pattern is found somewhere inside it.
 *
 * @param {string} subterm - term to test
 * @returns {number} 1 when metricsReg matches, 0 otherwise
 */
function isMetric(subterm) {
  if (metricsReg.test(subterm)) {
    return 1
  }
  return 0
}
/**
 * Checks whether `subterm` is exactly one of the dictionary attributes.
 *
 * @param {string} subterm - term to look up
 * @returns {number} 1 when the term is listed, 0 otherwise
 */
function isAttribute(subterm) {
  const found = dictionary.attributes.includes(subterm)
  return found ? 1 : 0
}
/**
 * Annotates word tokens and merges the "glue" words into their neighbours.
 *
 * Pass 1 stores the classifier results (predicate, substantive, brand, metric,
 * attribute) on every token. Pass 2 drops tokens that are themselves a
 * metric, an attribute or a preposition; each remaining token absorbs a
 * preposition standing right before it and a metric or attribute standing
 * right after it.
 *
 * The token objects are modified in place and the surviving ones are returned
 * by reference.
 *
 * @param {{t: string, i: number}[]} tokens - one entry per word
 * @returns {object[]} the surviving tokens, in their original order
 */
function termsSplit(tokens) {
  // Pass 1: attach the classification flags to every token.
  tokens.forEach(token => {
    const word = token.t
    token.predicate = isPredicate(word)
    token.substantive = isSubstantive(word)
    token.brand = isBrand(word)
    token.metric = isMetric(word)
    token.attribute = isAttribute(word)
  })

  // Pass 2: keep the head words and fold their neighbours into them.
  const kept = []
  tokens.forEach((token, idx) => {
    const word = token.t
    // Glue words never stand alone; they survive only inside a neighbour.
    if (isMetric(word) || isAttribute(word) || isPreposition(word)) {
      return
    }

    const before = tokens[idx - 1]
    const after = tokens[idx + 1]

    if (before && isPreposition(before.t)) {
      token.t = [before.t, word].join(" ")
      token.i = idx - 1
    }

    if (after) {
      if (isMetric(after.t)) {
        token.t = `${token.t} ${after.t}`
        token.i -= 1
      }
      if (isAttribute(after.t)) {
        token.t = `${token.t} ${after.t}`
        token.i -= 1
      }
    }

    kept.push(token)
  })
  return kept
}
/**
 * Turns a free-text phrase into a boosted Lucene query string.
 *
 * The phrase is lower-cased, split on whitespace and grouped by termsSplit.
 * Each resulting term becomes a quoted phrase boosted by
 * `(termCount - position) + substantive*2 + predicate + brand + metric`.
 * The clauses are joined with " +" and wrapped in parentheses.
 *
 * The unprefixed query is memoised in `cache` under the raw phrase; the field
 * prefix is applied on every call, so cached and fresh results are identical.
 *
 * @param {string} terms - phrase typed by the user
 * @param {{name?: string}} [opts] - `name` is the field to prefix the query with
 * @returns {string} `name:(...)` when `opts.name` is set, otherwise `(...)`
 */
function lucenize(terms, opts = { name: "" }) {
  const fieldName = opts ? opts.name : ""
  const withField = query => (fieldName ? `${fieldName}:${query}` : query)

  // Own-property check: cache is a plain object, so a truthiness lookup would
  // return inherited members for phrases such as "constructor".
  if (Object.prototype.hasOwnProperty.call(cache, terms)) {
    return withField(cache[terms])
  }

  // Leading/trailing whitespace would otherwise produce empty "" clauses.
  const words = terms.toLowerCase().split(/\s+/).filter(word => word !== "")
  const tokens = termsSplit(words.map((t, i) => ({ t, i })))

  const clauses = tokens.map((token, i) => {
    const { substantive, predicate, brand, metric } = token
    token.w = (tokens.length - i) + substantive * 2 + predicate + brand + metric
    // Escape backslashes and double quotes so a term such as 15" cannot
    // terminate the quoted phrase early.
    const phrase = token.t.replace(/[\\"]/g, "\\$&")
    return `"${phrase}"^${token.w}`
  })

  const lucene = `(${clauses.join(" +")})`
  cache[terms] = lucene
  return withField(lucene)
}
// Demo: print the generated query for a handful of sample phrases.
// forEach returns undefined, so it is called on the `terms` binding instead of
// being chained onto the array literal; that way `terms` holds the phrases.
const terms = [
  "notebook da hp preto",
  "kit berço",
  "kit para berço",
  "tela 15\"",
  "mesa redonda 50x50 cm",
  "coisa de 25 cm",
  "unknow preto",
  "smartphone samsung preto tela de x polegadas"
]
terms.forEach(t => {
  console.log(lucenize(t, { name: "raw_name" }))
})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment