poltak/setup.js

## setup.js
// Here's an example of a doc which would be indexed in our program; all fields will be defined (arrays may be empty)
// Note the `id`, `visits`, and `bookmarks` fields contain IDs taken from our PouchDB DB, that's why they look a bit funny.
// The main thing to note about them is the second part is an epoch timestamp (in general: `:type/:timestamp/:nonce`).
// PouchDB/CouchDB indexes docs via IDs, hence it's common to derive them from some meaningful data to afford log-time lookups without much effort.
const exampleDoc = {
  // `_id` of the PouchDB doc that the main `content` field is derived from
  id: "page/1504617273751/8255568509",
  // Placeholder describing real content: page text with some HTML/text preprocessing applied
  content:
    "Lots and lots of text, taken from `document.innerHTML` of webpages, with some HTML and text preprocessing done in attempt to slim it down a bit",
  // Protocol and 'www.' are preprocessed out, to afford weighted query predicates on `domain.tld` patterns
  url: "gitter.im/fergiemcdowall/search-index",
  // `_id`s of further PouchDB docs associated with the main doc (the `id` field)
  visits: [
    "visit/1504617273700/5363014742",
    "visit/1504617273700/584123123",
  ],
  // Same kind of array as `visits`, just conceptually different
  bookmarks: [],
};

// Here's the simplest query I can come up with that returns results with `score` being `NaN`
// rough translation: "get all indexed web pages that have a visit before time 1504618011822"
const querySimple = [
  {
    AND: {
      // `*` leaves the content term unrestricted (presumably search-index's wildcard)
      content: ["*"],
      // String range over visit IDs: from the bare `visit/` prefix up to the timestamp bound
      visits: [{ lte: "visit/1504618011822", gte: "visit/" }],
    },
    BOOST: 0,
  },
];

// A more realistic query from general flow of our program including a filter that will result in all result `score` fields being `NaN`
// rough translation: "get all indexed web pages that have a visit AND bookmark before time 1504618011822 AND contain the term 'javascript' in page content"
const queryRealistic = [
  // Clause 1: term 'javascript' in content + bookmark ID range up to the timestamp bound
  {
    AND: {
      content: ["javascript"],
      bookmarks: [{ lte: "bookmark/1504618011822", gte: "bookmark/" }],
      url: [],
    },
    BOOST: 0,
  },
  // Clause 2: same content term, but ranging over visit IDs instead of bookmark IDs
  {
    AND: {
      content: ["javascript"],
      visits: [{ lte: "visit/1504618011822", gte: "visit/" }],
      url: [],
    },
    BOOST: 0,
  },
];

// Here's an example `scoringCriteria` object from a search result with `score` set as `NaN`:
const naughtyScoringCriteria = [
  {
    tf: { "content￮*": 1 },
    // One entry per queried range: the `content` wildcard and the `visits` ID range
    df: [
      { gte: "content￮*", lte: "content￮*", tf: 201, setLength: 201 },
      {
        gte: "visits￮visit/",
        lte: "visits￮visit/1504618011822",
        tf: 221,
        setLength: 9512,
      },
    ],
    tfidf: {
      "content￮*": 60.50702912846022,
      // Should this be null???
      "visits￮visit/": null,
    },
    boost: 0,
    score: null,
  },
];


// Here's the index settings + field settings for reference (note: running solely in the browser):
const indexOpts = {
  batchSize: 500,
  appendOnly: true,
  indexPath: "worldbrain-index",
  logLevel: "info",
  preserveCase: false,
  compositeField: false,
  nGramLength: 1,
  // Index-wide token separator; `url` overrides it below
  separator: /[|' .,\-|(\n)]+/,
  // English list from eklem's handy `stopword` package
  stopwords: stopword.en,
  // Every field is individually searchable; only `url` carries extra settings
  fieldOptions: {
    visits: { fieldedSearch: true },
    bookmarks: { fieldedSearch: true },
    content: { fieldedSearch: true },
    url: { weight: 10, fieldedSearch: true, separator: "/" },
  },
};
	// Here's an example of a doc which would be indexed in our program; all fields will be defined (arrays may be empty)
	// Note the `id`, `visits`, and `bookmarks` fields contain IDs taken from our PouchDB DB, that's why they look a bit funny.
	// The main thing to note about them is the second part is an epoch timestamp (in general: `:type/:timestamp/:nonce`).
	// PouchDB/CouchDB indexes docs via IDs, hence it's common to derive them from some meaningful data to afford log-time lookups without much effort.
	const exampleDoc = {
		// `_id` of the PouchDB doc that the main `content` field is derived from
		id: "page/1504617273751/8255568509",
		// Placeholder describing real content: page text with some HTML/text preprocessing applied
		content:
			"Lots and lots of text, taken from `document.innerHTML` of webpages, with some HTML and text preprocessing done in attempt to slim it down a bit",
		// Protocol and 'www.' are preprocessed out, to afford weighted query predicates on `domain.tld` patterns
		url: "gitter.im/fergiemcdowall/search-index",
		// `_id`s of further PouchDB docs associated with the main doc (the `id` field)
		visits: [
			"visit/1504617273700/5363014742",
			"visit/1504617273700/584123123",
		],
		// Same kind of array as `visits`, just conceptually different
		bookmarks: [],
	};

	// Here's the simplest query I can come up with that returns results with `score` being `NaN`
	// rough translation: "get all indexed web pages that have a visit before time 1504618011822"
	const querySimple = [
		{
			AND: {
				// `*` leaves the content term unrestricted (presumably search-index's wildcard)
				content: ["*"],
				// String range over visit IDs: from the bare `visit/` prefix up to the timestamp bound
				visits: [{ lte: "visit/1504618011822", gte: "visit/" }],
			},
			BOOST: 0,
		},
	];

	// A more realistic query from general flow of our program including a filter that will result in all result `score` fields being `NaN`
	// rough translation: "get all indexed web pages that have a visit AND bookmark before time 1504618011822 AND contain the term 'javascript' in page content"
	const queryRealistic = [
		// Clause 1: term 'javascript' in content + bookmark ID range up to the timestamp bound
		{
			AND: {
				content: ["javascript"],
				bookmarks: [{ lte: "bookmark/1504618011822", gte: "bookmark/" }],
				url: [],
			},
			BOOST: 0,
		},
		// Clause 2: same content term, but ranging over visit IDs instead of bookmark IDs
		{
			AND: {
				content: ["javascript"],
				visits: [{ lte: "visit/1504618011822", gte: "visit/" }],
				url: [],
			},
			BOOST: 0,
		},
	];

	// Here's an example `scoringCriteria` object from a search result with `score` set as `NaN`:
	const naughtyScoringCriteria = [
		{
			tf: { "content￮*": 1 },
			// One entry per queried range: the `content` wildcard and the `visits` ID range
			df: [
				{ gte: "content￮*", lte: "content￮*", tf: 201, setLength: 201 },
				{
					gte: "visits￮visit/",
					lte: "visits￮visit/1504618011822",
					tf: 221,
					setLength: 9512,
				},
			],
			tfidf: {
				"content￮*": 60.50702912846022,
				// Should this be null???
				"visits￮visit/": null,
			},
			boost: 0,
			score: null,
		},
	];


	// Here's the index settings + field settings for reference (note: running solely in the browser):
	const indexOpts = {
		batchSize: 500,
		appendOnly: true,
		indexPath: "worldbrain-index",
		logLevel: "info",
		preserveCase: false,
		compositeField: false,
		nGramLength: 1,
		// Index-wide token separator; `url` overrides it below
		separator: /[\|' .,\-\|(\n)]+/,
		// English list from eklem's handy `stopword` package
		stopwords: stopword.en,
		// Every field is individually searchable; only `url` carries extra settings
		fieldOptions: {
			visits: { fieldedSearch: true },
			bookmarks: { fieldedSearch: true },
			content: { fieldedSearch: true },
			url: { weight: 10, fieldedSearch: true, separator: "/" },
		},
	};