Skip to content

Instantly share code, notes, and snippets.

@tehbeard
Created March 27, 2021 16:29
Show Gist options
  • Save tehbeard/30ed38acf6fc4ba5dd8a7dd771ad1382 to your computer and use it in GitHub Desktop.
Save tehbeard/30ed38acf6fc4ba5dd8a7dd771ad1382 to your computer and use it in GitHub Desktop.
Mucking around with Full text search and IndexedDB.
import { openDB, IDBPDatabase, DBSchema, IDBPTransaction,IDBCursorDirection } from "idb/with-async-ittr";
import { stemmer } from "./stemmer";
const TBL_ENTRY = "FTSEntry";
const IDX_BY_SCORE = "byScore";
const IDX_BY_DOC_ID = "byDocId";
type FTSEntry = { id: any, word: string , count: number };
interface FullTextSchema extends DBSchema {
[TBL_ENTRY]: { key: number, value: FTSEntry, indexes: { [IDX_BY_SCORE]: [string, number], [IDX_BY_DOC_ID]: any } }
}
type Tokenizer = (input: string) => string[]
type TokenProcessor = (input: string[]) => string[]
const defaultTokenizer = input => input.replace(/[^\w\s]/," ").split(/\s/)
const processorWrapper = (fn: (a:string) => string) => input => input.map(fn);
const lowercaseProcessor = processorWrapper( a => a.toLowerCase());
const DEFAULT_STOP_WORDS = [
'the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have',
'I', 'it', 'for', 'not', 'on', 'with', 'he', 'as', 'you',
'do', 'at', 'this', 'but', 'his', 'by', 'from'
];
const stopwordProcessorFactory = stopwords => input => input.filter(s => !stopwords.includes(s));
const blankBlcoker = input => input.filter(s => s!="");
const stemmerProcessor = processorWrapper( a => stemmer(a));
export class FullTextDb {
private db: IDBPDatabase<FullTextSchema>;
private tokenizer: Tokenizer = defaultTokenizer;
private processors: TokenProcessor[] = [
lowercaseProcessor,
blankBlcoker,
stopwordProcessorFactory(DEFAULT_STOP_WORDS),
stemmerProcessor
];
async open(name:string)
{
this.db = await openDB<FullTextSchema>(name, 1, {
blocked()
{
alert("BLOCKED");
},
terminated()
{
alert("TERMINATED");
},
upgrade(db, oldVersion, newVersion, txn) {
const entry = db.createObjectStore(TBL_ENTRY,{ autoIncrement: true })
entry.createIndex(IDX_BY_DOC_ID, "id");
entry.createIndex(IDX_BY_SCORE, ["word", "count"]);
}
});
}
async index(id: any, corpus: string)
{
const rawTokens = this.tokenizer(corpus);
const processedTokens = this.processors.reduce( (t,cb) => cb(t), rawTokens);
const tokenCount = processedTokens.reduce( (o, v) => {
o[v] = (o[v] ?? 0) + 1;
return o;
}, {}) as Record<string, number>;
// console.log({
// rawTokens,
// processedTokens,
// tokenCount
// });
const txn = this.db.transaction(TBL_ENTRY,"readwrite");
const oldKeys = await txn.store.index(IDX_BY_DOC_ID).getAllKeys(IDBKeyRange.only(id));
for(let oldKey of oldKeys)
{
txn.store.delete(oldKey);
}
for(let [word, count] of Object.entries(tokenCount)){
txn.store.put({
id,
word,
count
})
}
await txn.done;
}
async search(terms: string)
{
const rawTokens = this.tokenizer(terms);
const processedTokens = Array.from(new Set(this.processors.reduce( (t,cb) => cb(t), rawTokens)));
const txn = this.db.transaction(TBL_ENTRY, "readonly");
const idx = txn.store.index(IDX_BY_SCORE);
const results = {};
const [
totalDocCount,
cursors
] = await Promise.all([
(async () => {
let c = 0;
const it = txn.store.index(IDX_BY_DOC_ID).iterate(null, "nextunique")
for await(let i of it)
{
c++;
}
return c;
})(),
Promise.all(
processedTokens.map(
async tkn => {
const key = IDBKeyRange.bound([tkn,1], [tkn, Number.MAX_SAFE_INTEGER]);
return {
cursor: idx.iterate(key, "prev"),
total: await idx.count(key)
};
}
)
)
]);
for(let {cursor, total} of cursors)
{
for await (let entry of cursor)
{
results[entry.value.id] = {
...(results[entry.value.id] ?? {}),
[entry.value.word]: entry.value.count * Math.log( totalDocCount / total )
}
}
}
const ranked = Object.entries( results ).map( ([ id, scores ]) => {
const totalScore = Object.values(scores).reduce( (a,b) => a+b, 0);
return { id, scores, totalScore };
}).sort( ({totalScore:a},{totalScore: b}) => a == b ? 0 : Math.sign(b-a) );
return ranked;
}
close()
{
this.db.close();
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment