Skip to content

Instantly share code, notes, and snippets.

@mingodad
Created September 24, 2018 13:49
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mingodad/7fdec8eebdde70ee388db60855760c72 to your computer and use it in GitHub Desktop.
Save mingodad/7fdec8eebdde70ee388db60855760c72 to your computer and use it in GitHub Desktop.
Implementation of a "compressed" column option (with the accompanying "compress="/"uncompress=" table options) and a "min_word_size" option for the unicode61 tokenizer in FTS5 for SQLite3
Index: ext/fts5/fts5Int.h
==================================================================
--- ext/fts5/fts5Int.h
+++ ext/fts5/fts5Int.h
@@ -160,10 +160,11 @@
char *zDb; /* Database holding FTS index (e.g. "main") */
char *zName; /* Name of FTS index */
int nCol; /* Number of columns */
char **azCol; /* Column names */
u8 *abUnindexed; /* True for unindexed columns */
+ u8 *abCompressed; /* True for compressed columns */
int nPrefix; /* Number of prefix indexes */
int *aPrefix; /* Sizes in bytes of nPrefix prefix indexes */
int eContent; /* An FTS5_CONTENT value */
char *zContent; /* content table */
char *zContentRowid; /* "content_rowid=" option value */
@@ -183,10 +184,14 @@
char *zRank; /* Name of rank function */
char *zRankArgs; /* Arguments to rank function */
/* If non-NULL, points to sqlite3_vtab.base.zErrmsg. Often NULL. */
char **pzErrmsg;
+
+ /* Optional registered sqlite function for de/compression */
+ char *zCompressFunc;
+ char *zUnCompressFunc;
#ifdef SQLITE_DEBUG
int bPrefixIndex; /* True to use prefix-indexes */
#endif
};
Index: ext/fts5/fts5_config.c
==================================================================
--- ext/fts5/fts5_config.c
+++ ext/fts5/fts5_config.c
@@ -383,10 +383,30 @@
if( (rc = fts5ConfigSetEnum(aDetail, zArg, &pConfig->eDetail)) ){
*pzErr = sqlite3_mprintf("malformed detail=... directive");
}
return rc;
}
+
+ if( sqlite3_strnicmp("compress", zCmd, nCmd)==0 ){
+ if( pConfig->zCompressFunc ){
+ *pzErr = sqlite3_mprintf("multiple compress=... directives");
+ rc = SQLITE_ERROR;
+ }else{
+ pConfig->zCompressFunc = sqlite3Fts5Strndup(&rc, zArg, -1);
+ }
+ return rc;
+ }
+
+ if( sqlite3_strnicmp("uncompress", zCmd, nCmd)==0 ){
+ if( pConfig->zUnCompressFunc ){
+ *pzErr = sqlite3_mprintf("multiple uncompress=... directives");
+ rc = SQLITE_ERROR;
+ }else{
+ pConfig->zUnCompressFunc = sqlite3Fts5Strndup(&rc, zArg, -1);
+ }
+ return rc;
+ }
*pzErr = sqlite3_mprintf("unrecognized option: \"%.*s\"", nCmd, zCmd);
return SQLITE_ERROR;
}
@@ -470,10 +490,12 @@
*pzErr = sqlite3_mprintf("reserved fts5 column name: %s", zCol);
rc = SQLITE_ERROR;
}else if( zArg ){
if( 0==sqlite3_stricmp(zArg, "unindexed") ){
p->abUnindexed[p->nCol] = 1;
+ }else if( 0==sqlite3_stricmp(zArg, "compressed") ){
+ p->abCompressed[p->nCol] = 1;
}else{
*pzErr = sqlite3_mprintf("unrecognized column option: %s", zArg);
rc = SQLITE_ERROR;
}
}
@@ -486,19 +508,21 @@
** Populate the Fts5Config.zContentExprlist string.
*/
static int fts5ConfigMakeExprlist(Fts5Config *p){
int i;
int rc = SQLITE_OK;
+ const char *zFunc;
Fts5Buffer buf = {0, 0, 0};
sqlite3Fts5BufferAppendPrintf(&rc, &buf, "T.%Q", p->zContentRowid);
if( p->eContent!=FTS5_CONTENT_NONE ){
for(i=0; i<p->nCol; i++){
+ zFunc = p->abCompressed[i] ? p->zUnCompressFunc : "";
if( p->eContent==FTS5_CONTENT_EXTERNAL ){
- sqlite3Fts5BufferAppendPrintf(&rc, &buf, ", T.%Q", p->azCol[i]);
+ sqlite3Fts5BufferAppendPrintf(&rc, &buf, ", %s(T.%Q)", zFunc, p->azCol[i]);
}else{
- sqlite3Fts5BufferAppendPrintf(&rc, &buf, ", T.c%d", i);
+ sqlite3Fts5BufferAppendPrintf(&rc, &buf, ", %s(T.c%d)", zFunc, i);
}
}
}
assert( p->zContentExprlist==0 );
@@ -535,13 +559,15 @@
if( pRet==0 ) return SQLITE_NOMEM;
memset(pRet, 0, sizeof(Fts5Config));
pRet->db = db;
pRet->iCookie = -1;
- nByte = nArg * (sizeof(char*) + sizeof(u8));
+ nByte = nArg * (sizeof(char*) + (sizeof(u8)*2));
pRet->azCol = (char**)sqlite3Fts5MallocZero(&rc, nByte);
pRet->abUnindexed = (u8*)&pRet->azCol[nArg];
+ pRet->abCompressed = (u8*)&pRet->abUnindexed[nArg];
+ pRet->zCompressFunc = pRet->zUnCompressFunc = NULL;
pRet->zDb = sqlite3Fts5Strndup(&rc, azArg[1], -1);
pRet->zName = sqlite3Fts5Strndup(&rc, azArg[2], -1);
pRet->bColumnsize = 1;
pRet->eDetail = FTS5_DETAIL_FULL;
#ifdef SQLITE_DEBUG
@@ -589,10 +615,26 @@
}
sqlite3_free(zOne);
sqlite3_free(zTwo);
}
+
+ if( rc==SQLITE_OK ){
+ int i;
+ for(i=0; i<pRet->nCol; i++){
+ if( pRet->abCompressed[i] ){
+ /* A "compressed" column needs both compress= and uncompress= options */
+ if( !pRet->zCompressFunc || !pRet->zUnCompressFunc ) {
+ char const *zMiss = (pRet->zCompressFunc==0 ? "compress" : "uncompress");
+ rc = SQLITE_ERROR;
+ *pzErr = sqlite3_mprintf("missing %s parameter in fts5 constructor", zMiss);
+ }
+ break; /* the first compressed column decides the outcome; stop scanning */
+ }
+ }
+ }
+
/* If a tokenizer= option was successfully parsed, the tokenizer has
** already been allocated. Otherwise, allocate an instance of the default
** tokenizer (unicode61) now. */
if( rc==SQLITE_OK && pRet->pTok==0 ){
@@ -653,10 +695,16 @@
sqlite3_free(pConfig->zRank);
sqlite3_free(pConfig->zRankArgs);
sqlite3_free(pConfig->zContent);
sqlite3_free(pConfig->zContentRowid);
sqlite3_free(pConfig->zContentExprlist);
+ if(pConfig->zCompressFunc) {
+ sqlite3_free(pConfig->zCompressFunc);
+ }
+ if(pConfig->zUnCompressFunc) {
+ sqlite3_free(pConfig->zUnCompressFunc);
+ }
sqlite3_free(pConfig);
}
}
/*
Index: ext/fts5/fts5_storage.c
==================================================================
--- ext/fts5/fts5_storage.c
+++ ext/fts5/fts5_storage.c
@@ -111,19 +111,25 @@
case FTS5_STMT_INSERT_CONTENT:
case FTS5_STMT_REPLACE_CONTENT: {
int nCol = pC->nCol + 1;
char *zBind;
- int i;
+ const char *zFunc;
+ int i, zFuncSize, zBindSize, bSizeUsed;
- zBind = sqlite3_malloc(1 + nCol*2);
+ /* Add 4 to take in account the extra '(?),' */
+ zFuncSize = (int)(pC->zCompressFunc ? strlen(pC->zCompressFunc) : 0)+4;
+ zBindSize = 1 + nCol*zFuncSize;
+ zBind = sqlite3_malloc(zBindSize);
if( zBind ){
+ bSizeUsed = 0;
for(i=0; i<nCol; i++){
- zBind[i*2] = '?';
- zBind[i*2 + 1] = ',';
+ zFunc = (i && pC->abCompressed[i-1]) ? pC->zCompressFunc : "";
+ sqlite3_snprintf(zBindSize-bSizeUsed, zBind+bSizeUsed, "%s(?),", zFunc);
+ bSizeUsed = (int)strlen(zBind);
}
- zBind[i*2-1] = '\0';
+ zBind[bSizeUsed-1] = '\0'; /* remove the last comma */
zSql = sqlite3_mprintf(azStmt[eStmt], pC->zDb, pC->zName, zBind);
sqlite3_free(zBind);
}
break;
}
Index: ext/fts5/fts5_tokenize.c
==================================================================
--- ext/fts5/fts5_tokenize.c
+++ ext/fts5/fts5_tokenize.c
@@ -233,10 +233,11 @@
struct Unicode61Tokenizer {
unsigned char aTokenChar[128]; /* ASCII range token characters */
char *aFold; /* Buffer to fold text into */
int nFold; /* Size of aFold[] in bytes */
int bRemoveDiacritic; /* True if remove_diacritics=1 is set */
+ int nMinWordSize; /* Min size of a word to be indexed */
int nException;
int *aiException;
unsigned char aCategory[32]; /* True for token char categories */
};
@@ -360,10 +361,11 @@
const char *zCat = "L* N* Co";
int i;
memset(p, 0, sizeof(Unicode61Tokenizer));
p->bRemoveDiacritic = 1;
+ p->nMinWordSize = 0;
p->nFold = 64;
p->aFold = sqlite3_malloc(p->nFold * sizeof(char));
if( p->aFold==0 ){
rc = SQLITE_NOMEM;
}
@@ -393,10 +395,14 @@
if( 0==sqlite3_stricmp(azArg[i], "separators") ){
rc = fts5UnicodeAddExceptions(p, zArg, 0);
}else
if( 0==sqlite3_stricmp(azArg[i], "categories") ){
/* no-op */
+ }else
+ if( 0==sqlite3_stricmp(azArg[i], "min_word_size") ){
+ int mwsz;
+ if( sqlite3GetInt32(zArg, &mwsz) ) p->nMinWordSize = mwsz;
}else{
rc = SQLITE_ERROR;
}
}
@@ -450,10 +456,11 @@
while( rc==SQLITE_OK ){
int iCode; /* non-ASCII codepoint read from input */
char *zOut = aFold;
int is;
int ie;
+ int wsz;
/* Skip any separator characters. */
while( 1 ){
if( zCsr>=zTerm ) goto tokenize_done;
if( *zCsr & 0x80 ) {
@@ -517,12 +524,15 @@
zCsr++;
}
ie = zCsr - (unsigned char*)pText;
}
+ wsz = zOut-aFold;
+ /* Check min word size */
+ if(p->nMinWordSize && p->nMinWordSize > wsz) continue;
/* Invoke the token callback */
- rc = xToken(pCtx, 0, aFold, zOut-aFold, is, ie);
+ rc = xToken(pCtx, 0, aFold, wsz, is, ie);
}
tokenize_done:
if( rc==SQLITE_DONE ) rc = SQLITE_OK;
return rc;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment