mingodad/fts5-compress-column.diff

## fts5-compress-column.diff
Index: ext/fts5/fts5Int.h
==================================================================
--- ext/fts5/fts5Int.h
+++ ext/fts5/fts5Int.h
@@ -160,10 +160,11 @@
   char *zDb;                      /* Database holding FTS index (e.g. "main") */
   char *zName;                    /* Name of FTS index */
   int nCol;                       /* Number of columns */
   char **azCol;                   /* Column names */
   u8 *abUnindexed;                /* True for unindexed columns */
+  u8 *abCompressed;                /* True for compressed columns */
   int nPrefix;                    /* Number of prefix indexes */
   int *aPrefix;                   /* Sizes in bytes of nPrefix prefix indexes */
   int eContent;                   /* An FTS5_CONTENT value */
   char *zContent;                 /* content table */
   char *zContentRowid;            /* "content_rowid=" option value */
@@ -183,10 +184,14 @@
   char *zRank;                    /* Name of rank function */
   char *zRankArgs;                /* Arguments to rank function */

   /* If non-NULL, points to sqlite3_vtab.base.zErrmsg. Often NULL. */
   char **pzErrmsg;
+
+  /* Optional registered sqlite function for de/compression */
+  char *zCompressFunc;
+  char *zUnCompressFunc;

 #ifdef SQLITE_DEBUG
   int bPrefixIndex;               /* True to use prefix-indexes */
 #endif
 };

Index: ext/fts5/fts5_config.c
==================================================================
--- ext/fts5/fts5_config.c
+++ ext/fts5/fts5_config.c
@@ -383,10 +383,30 @@
     if( (rc = fts5ConfigSetEnum(aDetail, zArg, &pConfig->eDetail)) ){
       *pzErr = sqlite3_mprintf("malformed detail=... directive");
     }
     return rc;
   }
+
+  if( sqlite3_strnicmp("compress", zCmd, nCmd)==0 ){
+    if( pConfig->zCompressFunc ){
+      *pzErr = sqlite3_mprintf("multiple compress=... directives");
+      rc = SQLITE_ERROR;
+    }else{
+      pConfig->zCompressFunc = sqlite3Fts5Strndup(&rc, zArg, -1);
+    }
+    return rc;
+  }
+
+  if( sqlite3_strnicmp("uncompress", zCmd, nCmd)==0 ){
+    if( pConfig->zUnCompressFunc ){
+      *pzErr = sqlite3_mprintf("multiple uncompress=... directives");
+      rc = SQLITE_ERROR;
+    }else{
+      pConfig->zUnCompressFunc = sqlite3Fts5Strndup(&rc, zArg, -1);
+    }
+    return rc;
+  }

   *pzErr = sqlite3_mprintf("unrecognized option: \"%.*s\"", nCmd, zCmd);
   return SQLITE_ERROR;
 }

@@ -470,10 +490,12 @@
     *pzErr = sqlite3_mprintf("reserved fts5 column name: %s", zCol);
     rc = SQLITE_ERROR;
   }else if( zArg ){
     if( 0==sqlite3_stricmp(zArg, "unindexed") ){
       p->abUnindexed[p->nCol] = 1;
+    }else if( 0==sqlite3_stricmp(zArg, "compressed") ){
+      p->abCompressed[p->nCol] = 1;
     }else{
       *pzErr = sqlite3_mprintf("unrecognized column option: %s", zArg);
       rc = SQLITE_ERROR;
     }
   }
@@ -486,19 +508,21 @@
 ** Populate the Fts5Config.zContentExprlist string.
 */
 static int fts5ConfigMakeExprlist(Fts5Config *p){
   int i;
   int rc = SQLITE_OK;
+  const char *zFunc;
   Fts5Buffer buf = {0, 0, 0};

   sqlite3Fts5BufferAppendPrintf(&rc, &buf, "T.%Q", p->zContentRowid);
   if( p->eContent!=FTS5_CONTENT_NONE ){
     for(i=0; i<p->nCol; i++){
+      zFunc = p->abCompressed[i] ? p->zUnCompressFunc : "";
       if( p->eContent==FTS5_CONTENT_EXTERNAL ){
-        sqlite3Fts5BufferAppendPrintf(&rc, &buf, ", T.%Q", p->azCol[i]);
+        sqlite3Fts5BufferAppendPrintf(&rc, &buf, ", %s(T.%Q)", zFunc, p->azCol[i]);
       }else{
-        sqlite3Fts5BufferAppendPrintf(&rc, &buf, ", T.c%d", i);
+        sqlite3Fts5BufferAppendPrintf(&rc, &buf, ", %s(T.c%d)", zFunc, i);
       }
     }
   }

   assert( p->zContentExprlist==0 );
@@ -535,13 +559,15 @@
   if( pRet==0 ) return SQLITE_NOMEM;
   memset(pRet, 0, sizeof(Fts5Config));
   pRet->db = db;
   pRet->iCookie = -1;

-  nByte = nArg * (sizeof(char*) + sizeof(u8));
+  nByte = nArg * (sizeof(char*) + (sizeof(u8)*2));
   pRet->azCol = (char**)sqlite3Fts5MallocZero(&rc, nByte);
   pRet->abUnindexed = (u8*)&pRet->azCol[nArg];
+  pRet->abCompressed = (u8*)&pRet->abUnindexed[nArg];
+  pRet->zCompressFunc = pRet->zUnCompressFunc = NULL;
   pRet->zDb = sqlite3Fts5Strndup(&rc, azArg[1], -1);
   pRet->zName = sqlite3Fts5Strndup(&rc, azArg[2], -1);
   pRet->bColumnsize = 1;
   pRet->eDetail = FTS5_DETAIL_FULL;
 #ifdef SQLITE_DEBUG
@@ -589,10 +615,26 @@
     }

     sqlite3_free(zOne);
     sqlite3_free(zTwo);
   }
+
+  /* A "compressed" column is only usable when both the compress= and
+  ** uncompress= functions were named; report whichever one is missing. */
+  if( rc==SQLITE_OK ){
+    int i;
+    for(i=0; i<pRet->nCol; i++){
+      if( pRet->abCompressed[i]==0 ) continue;
+      if( pRet->zCompressFunc==0 || pRet->zUnCompressFunc==0 ){
+        const char *zMiss = pRet->zCompressFunc==0 ? "compress" : "uncompress";
+        *pzErr = sqlite3_mprintf(
+            "missing %s parameter in fts5 constructor", zMiss);
+        rc = SQLITE_ERROR;
+      }
+      break;      /* one compressed column is enough to decide */
+    }
+  }

   /* If a tokenizer= option was successfully parsed, the tokenizer has
   ** already been allocated. Otherwise, allocate an instance of the default
   ** tokenizer (unicode61) now.  */
   if( rc==SQLITE_OK && pRet->pTok==0 ){
@@ -653,10 +695,12 @@
     sqlite3_free(pConfig->zRank);
     sqlite3_free(pConfig->zRankArgs);
     sqlite3_free(pConfig->zContent);
     sqlite3_free(pConfig->zContentRowid);
     sqlite3_free(pConfig->zContentExprlist);
+    sqlite3_free(pConfig->zCompressFunc);   /* sqlite3_free(NULL) is a no-op */
+    sqlite3_free(pConfig->zUnCompressFunc);
     sqlite3_free(pConfig);
   }
 }

 /*

Index: ext/fts5/fts5_storage.c
==================================================================
--- ext/fts5/fts5_storage.c
+++ ext/fts5/fts5_storage.c
@@ -111,19 +111,25 @@

       case FTS5_STMT_INSERT_CONTENT:
       case FTS5_STMT_REPLACE_CONTENT: {
         int nCol = pC->nCol + 1;
         char *zBind;
-        int i;
+        const char *zFunc;
+        int i, zFuncSize, zBindSize, bSizeUsed;

-        zBind = sqlite3_malloc(1 + nCol*2);
+        /* Each bind slot is "<func>(?),": the function name plus 4 bytes */
+        zFuncSize = (int)(pC->zCompressFunc ? strlen(pC->zCompressFunc) : 0)+4;
+        zBindSize = 1 + nCol*zFuncSize;
+        zBind = sqlite3_malloc(zBindSize);
         if( zBind ){
+          bSizeUsed = 0;
           for(i=0; i<nCol; i++){
-            zBind[i*2] = '?';
-            zBind[i*2 + 1] = ',';
+            zFunc = (i && pC->abCompressed[i-1]) ? pC->zCompressFunc : "";
+            sqlite3_snprintf(zBindSize-bSizeUsed, zBind+bSizeUsed, "%s(?),", zFunc);
+            bSizeUsed += (int)strlen(&zBind[bSizeUsed]); /* only the new tail */
           }
-          zBind[i*2-1] = '\0';
+          zBind[bSizeUsed-1] = '\0'; /* remove the last comma */
           zSql = sqlite3_mprintf(azStmt[eStmt], pC->zDb, pC->zName, zBind);
           sqlite3_free(zBind);
         }
         break;
       }

Index: ext/fts5/fts5_tokenize.c
==================================================================
--- ext/fts5/fts5_tokenize.c
+++ ext/fts5/fts5_tokenize.c
@@ -233,10 +233,11 @@
 struct Unicode61Tokenizer {
   unsigned char aTokenChar[128];  /* ASCII range token characters */
   char *aFold;                    /* Buffer to fold text into */
   int nFold;                      /* Size of aFold[] in bytes */
   int bRemoveDiacritic;           /* True if remove_diacritics=1 is set */
+  int nMinWordSize;           /* Min size of a word to be indexed */
   int nException;
   int *aiException;

   unsigned char aCategory[32];    /* True for token char categories */
 };
@@ -360,10 +361,11 @@
       const char *zCat = "L* N* Co";
       int i;
       memset(p, 0, sizeof(Unicode61Tokenizer));

       p->bRemoveDiacritic = 1;
+      p->nMinWordSize = 0;
       p->nFold = 64;
       p->aFold = sqlite3_malloc(p->nFold * sizeof(char));
       if( p->aFold==0 ){
         rc = SQLITE_NOMEM;
       }
@@ -393,10 +395,25 @@
         if( 0==sqlite3_stricmp(azArg[i], "separators") ){
           rc = fts5UnicodeAddExceptions(p, zArg, 0);
         }else
         if( 0==sqlite3_stricmp(azArg[i], "categories") ){
           /* no-op */
+        }else
+        if( 0==sqlite3_stricmp(azArg[i], "min_word_size") ){
+          /* Parse locally instead of calling a core-internal helper.
+          ** Accept only 1..9 decimal digits so the value cannot overflow
+          ** an int; anything else is rejected as a malformed argument. */
+          int j;
+          int mwsz = 0;
+          for(j=0; j<9 && zArg[j]>='0' && zArg[j]<='9'; j++){
+            mwsz = mwsz*10 + (zArg[j]-'0');
+          }
+          if( j==0 || zArg[j]!='\0' ){
+            rc = SQLITE_ERROR;
+          }else{
+            p->nMinWordSize = mwsz;
+          }
         }else{
           rc = SQLITE_ERROR;
         }
       }

@@ -450,10 +467,11 @@
   while( rc==SQLITE_OK ){
     int iCode;                    /* non-ASCII codepoint read from input */
     char *zOut = aFold;
     int is;
     int ie;
+    int wsz;

     /* Skip any separator characters. */
     while( 1 ){
       if( zCsr>=zTerm ) goto tokenize_done;
       if( *zCsr & 0x80 ) {
@@ -517,12 +535,15 @@
         zCsr++;
       }
       ie = zCsr - (unsigned char*)pText;
     }

+    wsz = zOut-aFold;
+    /* Skip tokens whose folded text is shorter than nMinWordSize bytes */
+    if(p->nMinWordSize && p->nMinWordSize > wsz) continue;
     /* Invoke the token callback */
-    rc = xToken(pCtx, 0, aFold, zOut-aFold, is, ie);
+    rc = xToken(pCtx, 0, aFold, wsz, is, ie);
   }

  tokenize_done:
   if( rc==SQLITE_DONE ) rc = SQLITE_OK;
   return rc;
	Index: ext/fts5/fts5Int.h
	==================================================================
	--- ext/fts5/fts5Int.h
	+++ ext/fts5/fts5Int.h
	@@ -160,10 +160,11 @@
	char *zDb; /* Database holding FTS index (e.g. "main") */
	char *zName; /* Name of FTS index */
	int nCol; /* Number of columns */
	char **azCol; /* Column names */
	u8 *abUnindexed; /* True for unindexed columns */
	+ u8 *abCompressed; /* True for compressed columns */
	int nPrefix; /* Number of prefix indexes */
	int *aPrefix; /* Sizes in bytes of nPrefix prefix indexes */
	int eContent; /* An FTS5_CONTENT value */
	char *zContent; /* content table */
	char *zContentRowid; /* "content_rowid=" option value */
	@@ -183,10 +184,14 @@
	char *zRank; /* Name of rank function */
	char *zRankArgs; /* Arguments to rank function */

	/* If non-NULL, points to sqlite3_vtab.base.zErrmsg. Often NULL. */
	char **pzErrmsg;
	+
	+ /* Optional registered sqlite function for de/compression */
	+ char *zCompressFunc;
	+ char *zUnCompressFunc;

	#ifdef SQLITE_DEBUG
	int bPrefixIndex; /* True to use prefix-indexes */
	#endif
	};

	Index: ext/fts5/fts5_config.c
	==================================================================
	--- ext/fts5/fts5_config.c
	+++ ext/fts5/fts5_config.c
	@@ -383,10 +383,30 @@
	if( (rc = fts5ConfigSetEnum(aDetail, zArg, &pConfig->eDetail)) ){
	*pzErr = sqlite3_mprintf("malformed detail=... directive");
	}
	return rc;
	}
	+
	+ if( sqlite3_strnicmp("compress", zCmd, nCmd)==0 ){
	+ if( pConfig->zCompressFunc ){
	+ *pzErr = sqlite3_mprintf("multiple compress=... directives");
	+ rc = SQLITE_ERROR;
	+ }else{
	+ pConfig->zCompressFunc = sqlite3Fts5Strndup(&rc, zArg, -1);
	+ }
	+ return rc;
	+ }
	+
	+ if( sqlite3_strnicmp("uncompress", zCmd, nCmd)==0 ){
	+ if( pConfig->zUnCompressFunc ){
	+ *pzErr = sqlite3_mprintf("multiple uncompress=... directives");
	+ rc = SQLITE_ERROR;
	+ }else{
	+ pConfig->zUnCompressFunc = sqlite3Fts5Strndup(&rc, zArg, -1);
	+ }
	+ return rc;
	+ }

	*pzErr = sqlite3_mprintf("unrecognized option: \"%.*s\"", nCmd, zCmd);
	return SQLITE_ERROR;
	}

	@@ -470,10 +490,12 @@
	*pzErr = sqlite3_mprintf("reserved fts5 column name: %s", zCol);
	rc = SQLITE_ERROR;
	}else if( zArg ){
	if( 0==sqlite3_stricmp(zArg, "unindexed") ){
	p->abUnindexed[p->nCol] = 1;
	+ }else if( 0==sqlite3_stricmp(zArg, "compressed") ){
	+ p->abCompressed[p->nCol] = 1;
	}else{
	*pzErr = sqlite3_mprintf("unrecognized column option: %s", zArg);
	rc = SQLITE_ERROR;
	}
	}
	@@ -486,19 +508,21 @@
	** Populate the Fts5Config.zContentExprlist string.
	*/
	static int fts5ConfigMakeExprlist(Fts5Config *p){
	int i;
	int rc = SQLITE_OK;
	+ const char *zFunc;
	Fts5Buffer buf = {0, 0, 0};

	sqlite3Fts5BufferAppendPrintf(&rc, &buf, "T.%Q", p->zContentRowid);
	if( p->eContent!=FTS5_CONTENT_NONE ){
	for(i=0; i<p->nCol; i++){
	+ zFunc = p->abCompressed[i] ? p->zUnCompressFunc : "";
	if( p->eContent==FTS5_CONTENT_EXTERNAL ){
	- sqlite3Fts5BufferAppendPrintf(&rc, &buf, ", T.%Q", p->azCol[i]);
	+ sqlite3Fts5BufferAppendPrintf(&rc, &buf, ", %s(T.%Q)", zFunc, p->azCol[i]);
	}else{
	- sqlite3Fts5BufferAppendPrintf(&rc, &buf, ", T.c%d", i);
	+ sqlite3Fts5BufferAppendPrintf(&rc, &buf, ", %s(T.c%d)", zFunc, i);
	}
	}
	}

	assert( p->zContentExprlist==0 );
	@@ -535,13 +559,15 @@
	if( pRet==0 ) return SQLITE_NOMEM;
	memset(pRet, 0, sizeof(Fts5Config));
	pRet->db = db;
	pRet->iCookie = -1;

	- nByte = nArg * (sizeof(char*) + sizeof(u8));
	+ nByte = nArg * (sizeof(char*) + (sizeof(u8)*2));
	pRet->azCol = (char**)sqlite3Fts5MallocZero(&rc, nByte);
	pRet->abUnindexed = (u8*)&pRet->azCol[nArg];
	+ pRet->abCompressed = (u8*)&pRet->abUnindexed[nArg];
	+ pRet->zCompressFunc = pRet->zUnCompressFunc = NULL;
	pRet->zDb = sqlite3Fts5Strndup(&rc, azArg[1], -1);
	pRet->zName = sqlite3Fts5Strndup(&rc, azArg[2], -1);
	pRet->bColumnsize = 1;
	pRet->eDetail = FTS5_DETAIL_FULL;
	#ifdef SQLITE_DEBUG
	@@ -589,10 +615,26 @@
	}

	sqlite3_free(zOne);
	sqlite3_free(zTwo);
	}
	+
	+ if( rc==SQLITE_OK ){
	+ int i;
	+ for(i=0; i<pRet->nCol; i++){
	+ if( pRet->abCompressed[i] ){
	+ /* if at least one column was declared compressed */
	+ if( !pRet->zCompressFunc || !pRet->zUnCompressFunc ) {
	+ char const *zMiss = (pRet->zCompressFunc==0 ? "compress" : "uncompress");
	+ rc = SQLITE_ERROR;
	+ *pzErr = sqlite3_mprintf("missing %s parameter in fts5 constructor", zMiss);
	+ }
	+ break;
	+ }
	+ }
	+ }
	+

	/* If a tokenizer= option was successfully parsed, the tokenizer has
	** already been allocated. Otherwise, allocate an instance of the default
	** tokenizer (unicode61) now. */
	if( rc==SQLITE_OK && pRet->pTok==0 ){
	@@ -653,10 +695,16 @@
	sqlite3_free(pConfig->zRank);
	sqlite3_free(pConfig->zRankArgs);
	sqlite3_free(pConfig->zContent);
	sqlite3_free(pConfig->zContentRowid);
	sqlite3_free(pConfig->zContentExprlist);
	+ if(pConfig->zCompressFunc) {
	+ sqlite3_free(pConfig->zCompressFunc);
	+ }
	+ if(pConfig->zUnCompressFunc) {
	+ sqlite3_free(pConfig->zUnCompressFunc);
	+ }
	sqlite3_free(pConfig);
	}
	}

	/*

	Index: ext/fts5/fts5_storage.c
	==================================================================
	--- ext/fts5/fts5_storage.c
	+++ ext/fts5/fts5_storage.c
	@@ -111,19 +111,25 @@

	case FTS5_STMT_INSERT_CONTENT:
	case FTS5_STMT_REPLACE_CONTENT: {
	int nCol = pC->nCol + 1;
	char *zBind;
	- int i;
	+ const char *zFunc;
	+ int i, zFuncSize, zBindSize, bSizeUsed;

	- zBind = sqlite3_malloc(1 + nCol*2);
	+ /* Add 4 to take in account the extra '(?),' */
	+ zFuncSize = (int)(pC->zCompressFunc ? strlen(pC->zCompressFunc) : 0)+4;
	+ zBindSize = 1 + nCol*zFuncSize;
	+ zBind = sqlite3_malloc(zBindSize);
	if( zBind ){
	+ bSizeUsed = 0;
	for(i=0; i<nCol; i++){
	- zBind[i*2] = '?';
	- zBind[i*2 + 1] = ',';
	+ zFunc = (i && pC->abCompressed[i-1]) ? pC->zCompressFunc : "";
	+ sqlite3_snprintf(zBindSize-bSizeUsed, zBind+bSizeUsed, "%s(?),", zFunc);
	+ bSizeUsed = (int)strlen(zBind);
	}
	- zBind[i*2-1] = '\0';
	+ zBind[bSizeUsed-1] = '\0'; /* remove the last comma */
	zSql = sqlite3_mprintf(azStmt[eStmt], pC->zDb, pC->zName, zBind);
	sqlite3_free(zBind);
	}
	break;
	}

	Index: ext/fts5/fts5_tokenize.c
	==================================================================
	--- ext/fts5/fts5_tokenize.c
	+++ ext/fts5/fts5_tokenize.c
	@@ -233,10 +233,11 @@
	struct Unicode61Tokenizer {
	unsigned char aTokenChar[128]; /* ASCII range token characters */
	char *aFold; /* Buffer to fold text into */
	int nFold; /* Size of aFold[] in bytes */
	int bRemoveDiacritic; /* True if remove_diacritics=1 is set */
	+ int nMinWordSize; /* Min size of a word to be indexed */
	int nException;
	int *aiException;

	unsigned char aCategory[32]; /* True for token char categories */
	};
	@@ -360,10 +361,11 @@
	const char *zCat = "L* N* Co";
	int i;
	memset(p, 0, sizeof(Unicode61Tokenizer));

	p->bRemoveDiacritic = 1;
	+ p->nMinWordSize = 0;
	p->nFold = 64;
	p->aFold = sqlite3_malloc(p->nFold * sizeof(char));
	if( p->aFold==0 ){
	rc = SQLITE_NOMEM;
	}
	@@ -393,10 +395,14 @@
	if( 0==sqlite3_stricmp(azArg[i], "separators") ){
	rc = fts5UnicodeAddExceptions(p, zArg, 0);
	}else
	if( 0==sqlite3_stricmp(azArg[i], "categories") ){
	/* no-op */
	+ }else
	+ if( 0==sqlite3_stricmp(azArg[i], "min_word_size") ){
	+ int mwsz;
	+ if( sqlite3GetInt32(zArg, &mwsz) ) p->nMinWordSize = mwsz;
	}else{
	rc = SQLITE_ERROR;
	}
	}

	@@ -450,10 +456,11 @@
	while( rc==SQLITE_OK ){
	int iCode; /* non-ASCII codepoint read from input */
	char *zOut = aFold;
	int is;
	int ie;
	+ int wsz;

	/* Skip any separator characters. */
	while( 1 ){
	if( zCsr>=zTerm ) goto tokenize_done;
	if( *zCsr & 0x80 ) {
	@@ -517,12 +524,15 @@
	zCsr++;
	}
	ie = zCsr - (unsigned char*)pText;
	}

	+ wsz = zOut-aFold;
	+ /* Check min word size */
	+ if(p->nMinWordSize && p->nMinWordSize > wsz) continue;
	/* Invoke the token callback */
	- rc = xToken(pCtx, 0, aFold, zOut-aFold, is, ie);
	+ rc = xToken(pCtx, 0, aFold, wsz, is, ie);
	}

	tokenize_done:
	if( rc==SQLITE_DONE ) rc = SQLITE_OK;
	return rc;