IndexedDB Full Text Search

This demonstrates an implementation of full-text search for documents in IndexedDB.

  • Word-breaking and stemming are used to create a list of terms for each document.
  • Document records are annotated with the list of terms when added to the database.
  • A multi-entry index on the list of terms is populated.
  • A query is similarly processed into a list of terms.
  • A join over the terms is implemented using multiple cursors on the index.

The necessity of annotating records with the word list to populate the index is a limitation of the current Indexed DB API. A feature request to support custom indexing is tracked at w3c/IndexedDB#33.
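Concretely, an annotated record might look like the sketch below (the `docid`/`terms` field names follow the example later in this gist; the stems shown are illustrative):

```javascript
// Sketch: a document record annotated with its word stems. The
// `terms` array duplicates information in `text`, but it must be
// stored explicitly so that a multiEntry index can enumerate the
// stems, e.g.:
//   store.createIndex('fulltext', 'terms', {multiEntry: true});
const record = {
  docid: 2,
  text: 'Yahoo! to acquire Tumblr.',
  terms: ['yahoo', 'to', 'acquir', 'tumblr']  // what tokenize() produces
};
// Under the multiEntry index, this record is reachable via any of
// the index keys 'yahoo', 'to', 'acquir', or 'tumblr'.
console.log(record.terms.includes('tumblr'));  // true
```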

Segmenter

This uses Intl.v8BreakIterator in Chrome (which in turn uses ICU), and falls back to a terrible English-only implementation elsewhere.

Drop this in as segment.js

Stemmer

Note that this stemmer is no longer recommended by its author for practical work, but it is used here because it's something everyone has heard of.

Drop this in as porter-stemmer.js

FullText

FullText.tokenize(text, locale)

Tokenize a string into word stems, for creating a full-text index.

  • text: string to tokenize
  • locale: locale for tokenizing (e.g. 'en')

Returns array of word-stems.
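As a rough sketch of the tokenize pipeline, here is the same idea built on the now-standard `Intl.Segmenter` API, with an identity function standing in for the Porter stemmer from porter-stemmer.js:

```javascript
// Sketch of the tokenize pipeline using the standard Intl.Segmenter.
// A real build would call stemmer() from porter-stemmer.js; here the
// stemming step is the identity function, so stems are just
// lowercased words.
const stemmer = word => word;  // stand-in for the Porter stemmer

function tokenize(text, locale) {
  const words = new Set();  // Set deduplicates repeated stems
  const segmenter = new Intl.Segmenter(locale, {granularity: 'word'});
  for (const {segment, isWordLike} of segmenter.segment(text)) {
    // isWordLike is false for spaces and punctuation.
    if (isWordLike) words.add(stemmer(segment.toLowerCase()));
  }
  return Array.from(words);
}

console.log(tokenize('Yahoo to acquire Tumblr!', 'en'));
// ['yahoo', 'to', 'acquire', 'tumblr'] — punctuation and case dropped
```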

FullText.search(index, query, locale, callback)

Perform a full-text search.

  • index: an IDBIndex mapping word-stems to records
  • query: text string, e.g. 'alice bob eve'
  • locale: locale for tokenizing query (e.g. 'en')
  • callback: called with array of primary keys

Must be called while the index's transaction is active. The callback is invoked while the transaction is still active (i.e. more requests can be made within it).

Throws if query contains no words.
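The cursor join inside search() is a k-way intersection of sorted key streams. The same logic on plain sorted arrays, as a standalone sketch (not part of the gist's code):

```javascript
// Sketch: the join in search() is a k-way intersection of sorted
// streams. Each "cursor" here is a sorted array plus a position.
function intersect(sortedLists) {
  const pos = sortedLists.map(() => 0);
  const results = [];
  for (;;) {
    // If any list is exhausted, no further matches are possible.
    if (sortedLists.some((list, i) => pos[i] >= list.length)) return results;
    const values = sortedLists.map((list, i) => list[pos[i]]);
    const lo = Math.min(...values), hi = Math.max(...values);
    if (lo === hi) {
      // All cursors agree: record a match and advance every cursor.
      results.push(lo);
      pos.forEach((_, i) => pos[i] += 1);
    } else {
      // Advance only the lowest cursor toward a possible match.
      pos[values.indexOf(lo)] += 1;
    }
  }
}

console.log(intersect([[1, 2, 3], [2, 3, 4]]));  // [2, 3]
```

search() does the same thing, except each "list" is an index cursor ranged to one term, comparisons use indexedDB.cmp on primary keys, and stepping is asynchronous (the barrier fires once every advanced cursor's request has completed).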

<!DOCTYPE html>
<script src="porter-stemmer.js"></script>
<script src="segment.js"></script>
<script src="fulltext.js"></script>
<script>
const doc1 = `You already know all the details, but here’s the official word from Yahoo on its $1.1 billion Tumblr deal`;
const doc2 = `Yahoo! Inc. (NASDAQ: YHOO) and Tumblr announced today that they have reached a definitive agreement for Yahoo! to acquire Tumblr.`;
const doc3 = `Of all the things 26-year-old David Karp has done in life, creating Tumblr stands as his most profitable venture, thus far.`;

indexedDB.deleteDatabase('db-fulltext');
const request = indexedDB.open('db-fulltext');

request.onupgradeneeded = e => {
  const db = request.result;
  const store = db.createObjectStore('documents', {keyPath: 'docid'});
  store.createIndex('fulltext', 'terms', {multiEntry: true});
  store.put({docid: 1, text: doc1, terms: FullText.tokenize(doc1, 'en')});
  store.put({docid: 2, text: doc2, terms: FullText.tokenize(doc2, 'en')});
  store.put({docid: 3, text: doc3, terms: FullText.tokenize(doc3, 'en')});
};

request.onsuccess = e => {
  const db = request.result;
  const tx = db.transaction('documents');
  const index = tx.objectStore('documents').index('fulltext');
  [
    'yahoo',
    'tumblr',
    'Karp',
    'yahoo tumblr'
  ].forEach(query => {
    FullText.search(index, query, 'en', ids =>
      console.log('query:', JSON.stringify(query), 'results:', ids));
  });
};
</script>
/*global stemmer*/
self.FullText = (() => {

  function tokenize(text, locale) {
    const words = new Set();
    const segmenter = new Intl.Segmenter(locale, {type: 'word'});
    const iterator = segmenter.segment(text);
    let pos = iterator.index();
    for (const o of iterator) {
      if (o.breakType !== 'none') {
        let word = text.slice(pos, o.index);
        word = word.toLowerCase();
        word = stemmer(word);
        words.add(word);
      }
      pos = o.index;
    }
    return Array.from(words);
  }

  function search(index, query, locale, callback) {
    const results = [];
    const terms = tokenize(query, locale);
    if (terms.length === 0)
      throw new Error('no words in query');

    // Open a cursor for each term.
    let expect = 0;
    const requests = terms.map(term => index.openKeyCursor(term));
    requests.forEach(request => {
      ++expect;
      request.onsuccess = () => {
        if (--expect === 0)
          barrier();
      };
    });

    // Called once every cursor that was advanced has stepped.
    function barrier() {
      const cursors = requests.map(r => r.result);

      // If any cursor has reached end-of-range, we're done.
      if (cursors.includes(null)) {
        callback(results);
        return;
      }

      // Order cursors lowest/highest by primary key.
      cursors.sort((a, b) => indexedDB.cmp(a.primaryKey, b.primaryKey));

      // All equal? (lowest == highest)
      if (indexedDB.cmp(cursors[0].primaryKey,
                        cursors[cursors.length - 1].primaryKey) === 0) {
        // Yes - we have a match. Record it and advance all.
        results.push(cursors[0].primaryKey);
        expect = cursors.length;
        cursors.forEach(cursor => cursor.continue());
      } else {
        // No - advance lowest cursor.
        expect = 1;
        cursors[0].continue();
      }
    }
  }

  return {
    tokenize: tokenize,
    search: search
  };
})();