Created
September 5, 2017 14:19
-
-
Save poltak/6c8c003ea8a59407be5b929e2002e418 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Here's an example of a doc which would be indexed in our program; all fields will be defined (arrays may be empty).
// Note that the `id`, `visits`, and `bookmarks` fields contain IDs taken from our PouchDB DB; that's why they look a bit funny.
// The main thing to note about them is that the second part is an epoch timestamp (in general: `:type/:timestamp/:nonce`).
// PouchDB/CouchDB indexes docs via IDs, hence it's common to derive them from some meaningful data to afford log-time lookups without much effort.
/**
 * Sample document in the shape that gets indexed. Every field is always
 * present; the array fields may be empty.
 *
 * `id`, `visits`, and `bookmarks` hold PouchDB `_id` strings of the general
 * form `:type/:timestamp/:nonce`.
 */
const exampleDoc = {
  // `_id` of the PouchDB doc from which the main `content` field is derived
  id: 'page/1504617273751/8255568509',
  content: 'Lots and lots of text, taken from `document.innerHTML` of webpages, with some HTML and text preprocessing done in attempt to slim it down a bit',
  // protocol and 'www.' preprocessed out, to afford weighted query predicate on `domain.tld` patterns
  url: 'gitter.im/fergiemcdowall/search-index',
  // more PouchDB doc `_id`s, associated with the main doc (`id` field)
  visits: ['visit/1504617273700/5363014742', 'visit/1504617273700/584123123'],
  // pretty much the same as the `visits` array, just different conceptually
  bookmarks: [],
};
// Here's the simplest query I can come up with that returns results with `score` being `NaN`.
// Rough translation: "get all indexed web pages that have a visit before time 1504618011822".
/**
 * Minimal query that reproduces `NaN` scores: a wildcard on `content`
 * combined with a range predicate on the `visits` field.
 */
const querySimple = [
  {
    AND: {
      // Wildcard term on page content.
      content: ['*'],
      // Range predicate over visit IDs: from the bare `visit/` prefix up to
      // the ID prefix built from timestamp 1504618011822.
      visits: [
        {
          lte: 'visit/1504618011822',
          gte: 'visit/',
        },
      ],
    },
    BOOST: 0,
  },
];
// A more realistic query from the general flow of our program, including a filter that will result in all result `score` fields being `NaN`.
// Rough translation: "get all indexed web pages that have a visit AND bookmark before time 1504618011822 AND contain the term 'javascript' in page content".
/**
 * More realistic query taken from the program's general flow; it also
 * produces `NaN` scores. It consists of two clauses that both search
 * `content` for "javascript": one with a range predicate on `bookmarks`,
 * the other with a range predicate on `visits`.
 *
 * NOTE(review): the intent is described as "visit AND bookmark", but the two
 * predicates live in separate array entries — confirm how search-index
 * combines multiple clauses.
 */
const queryRealistic = [
  {
    AND: {
      content: ['javascript'],
      // Range over bookmark IDs up to the prefix for timestamp 1504618011822.
      bookmarks: [
        {
          lte: 'bookmark/1504618011822',
          gte: 'bookmark/',
        },
      ],
      // No URL terms supplied for this query.
      url: [],
    },
    BOOST: 0,
  },
  {
    AND: {
      content: ['javascript'],
      // Range over visit IDs up to the prefix for timestamp 1504618011822.
      visits: [
        {
          lte: 'visit/1504618011822',
          gte: 'visit/',
        },
      ],
      // No URL terms supplied for this query.
      url: [],
    },
    BOOST: 0,
  },
];
// Here's an example `scoringCriteria` object from a search result with `score` set as `NaN`:
/**
 * `scoringCriteria` captured from a search result whose `score` came back
 * as `NaN`. Keys are of the form `<field>○<token>`.
 *
 * NOTE(review): the `null` values are presumably `NaN` after JSON
 * serialisation (`JSON.stringify(NaN)` yields `null`) — confirm against the
 * live result object.
 */
const naughtyScoringCriteria = [
  {
    tf: {
      'content○*': 1,
    },
    df: [
      {
        gte: 'content○*',
        lte: 'content○*',
        tf: 201,
        setLength: 201,
      },
      {
        gte: 'visits○visit/',
        lte: 'visits○visit/1504618011822',
        tf: 221,
        setLength: 9512,
      },
    ],
    tfidf: {
      'content○*': 60.50702912846022,
      'visits○visit/': null, // Should this be null???
    },
    boost: 0,
    score: null,
  },
];
// Here are the index settings + field settings for reference (note: running solely in the browser):
/**
 * Index-wide settings plus per-field overrides for the index.
 * All four document fields enable `fieldedSearch`; `url` additionally gets
 * a weight of 10 and its own `/` separator.
 */
const indexOpts = {
  batchSize: 500,
  appendOnly: true,
  indexPath: 'worldbrain-index',
  logLevel: 'info',
  preserveCase: false,
  compositeField: false,
  nGramLength: 1,
  // NOTE(review): everything between `[` and `]` is a single character class,
  // so `|`, `(` and `)` are matched literally (and the second `|` is
  // redundant) — confirm this is the intended set of separators.
  separator: /[|' .,\-|(\n)]+/,
  stopwords: stopword.en, // eklem's handy `stopword` package
  fieldOptions: {
    visits: {
      fieldedSearch: true,
    },
    bookmarks: {
      fieldedSearch: true,
    },
    content: {
      fieldedSearch: true,
    },
    url: {
      weight: 10,
      fieldedSearch: true,
      separator: '/',
    },
  },
};
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment