Skip to content

Instantly share code, notes, and snippets.

@hinrik
Created February 16, 2010 17:57
Show Gist options
  • Save hinrik/305741 to your computer and use it in GitHub Desktop.
Save hinrik/305741 to your computer and use it in GitHub Desktop.
Supplying a custom FTS3 tokenizer to SQLite from Perl
use strict;
use warnings;
use Inline C => Config => LIBS => '-lsqlite3';
use Inline C => <<'END';
#include <sqlite3.h>
/* Not included in sqlite.h */
#define SQLITE_PRIVATE static
#define SQLITE_ENABLE_FTS3 1
#define UNUSED_PARAMETER(x) (void)(x)
#define UNUSED_PARAMETER2(x,y) UNUSED_PARAMETER(x),UNUSED_PARAMETER(y)
/************** Begin file fts3_tokenizer.h **********************************/
/*
** 2006 July 10
**
** The author disclaims copyright to this source code.
**
*************************************************************************
** Defines the interface to tokenizers used by fulltext-search. There
** are three basic components:
**
** sqlite3_tokenizer_module is a singleton defining the tokenizer
** interface functions. This is essentially the class structure for
** tokenizers.
**
** sqlite3_tokenizer is used to define a particular tokenizer, perhaps
** including customization information defined at creation time.
**
** sqlite3_tokenizer_cursor is generated by a tokenizer to generate
** tokens from a particular input.
*/
#ifndef _FTS3_TOKENIZER_H_
#define _FTS3_TOKENIZER_H_
/* TODO(shess) Only used for SQLITE_OK and SQLITE_DONE at this time.
** If tokenizers are to be allowed to call sqlite3_*() functions, then
** we will need a way to register the API consistently.
*/
/*
** Structures used by the tokenizer interface. When a new tokenizer
** implementation is registered, the caller provides a pointer to
** an sqlite3_tokenizer_module containing pointers to the callback
** functions that make up an implementation.
**
** When an fts3 table is created, it passes any arguments passed to
** the tokenizer clause of the CREATE VIRTUAL TABLE statement to the
** sqlite3_tokenizer_module.xCreate() function of the requested tokenizer
** implementation. The xCreate() function in turn returns an
** sqlite3_tokenizer structure representing the specific tokenizer to
** be used for the fts3 table (customized by the tokenizer clause arguments).
**
** To tokenize an input buffer, the sqlite3_tokenizer_module.xOpen()
** method is called. It returns an sqlite3_tokenizer_cursor object
** that may be used to tokenize a specific input buffer based on
** the tokenization rules supplied by a specific sqlite3_tokenizer
** object.
*/
typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module;
typedef struct sqlite3_tokenizer sqlite3_tokenizer;
typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor;
struct sqlite3_tokenizer_module {
/*
** Structure version. Should always be set to 0.
*/
int iVersion;
/*
** Create a new tokenizer. The values in the argv[] array are the
** arguments passed to the "tokenizer" clause of the CREATE VIRTUAL
** TABLE statement that created the fts3 table. For example, if
** the following SQL is executed:
**
** CREATE .. USING fts3( ... , tokenizer <tokenizer-name> arg1 arg2)
**
** then argc is set to 2, and the argv[] array contains pointers
** to the strings "arg1" and "arg2".
**
** This method should return either SQLITE_OK (0), or an SQLite error
** code. If SQLITE_OK is returned, then *ppTokenizer should be set
** to point at the newly created tokenizer structure. The generic
** sqlite3_tokenizer.pModule variable should not be initialised by
** this callback. The caller will do so.
*/
int (*xCreate)(
int argc, /* Size of argv array */
const char *const*argv, /* Tokenizer argument strings */
sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */
);
/*
** Destroy an existing tokenizer. The fts3 module calls this method
** exactly once for each successful call to xCreate().
*/
int (*xDestroy)(sqlite3_tokenizer *pTokenizer);
/*
** Create a tokenizer cursor to tokenize an input buffer. The caller
** is responsible for ensuring that the input buffer remains valid
** until the cursor is closed (using the xClose() method).
*/
int (*xOpen)(
sqlite3_tokenizer *pTokenizer, /* Tokenizer object */
const char *pInput, int nBytes, /* Input buffer */
sqlite3_tokenizer_cursor **ppCursor /* OUT: Created tokenizer cursor */
);
/*
** Destroy an existing tokenizer cursor. The fts3 module calls this
** method exactly once for each successful call to xOpen().
*/
int (*xClose)(sqlite3_tokenizer_cursor *pCursor);
/*
** Retrieve the next token from the tokenizer cursor pCursor. This
** method should either return SQLITE_OK and set the values of the
** "OUT" variables identified below, or SQLITE_DONE to indicate that
** the end of the buffer has been reached, or an SQLite error code.
**
** *ppToken should be set to point at a buffer containing the
** normalized version of the token (i.e. after any case-folding and/or
** stemming has been performed). *pnBytes should be set to the length
** of this buffer in bytes. The input text that generated the token is
** identified by the byte offsets returned in *piStartOffset and
** *piEndOffset. *piStartOffset should be set to the index of the first
** byte of the token in the input buffer. *piEndOffset should be set
** to the index of the first byte just past the end of the token in
** the input buffer.
**
** The buffer *ppToken is set to point at is managed by the tokenizer
** implementation. It is only required to be valid until the next call
** to xNext() or xClose().
*/
/* TODO(shess) current implementation requires pInput to be
** nul-terminated. This should either be fixed, or pInput/nBytes
** should be converted to zInput.
*/
int (*xNext)(
sqlite3_tokenizer_cursor *pCursor, /* Tokenizer cursor */
const char **ppToken, int *pnBytes, /* OUT: Normalized text for token */
int *piStartOffset, /* OUT: Byte offset of token in input buffer */
int *piEndOffset, /* OUT: Byte offset of end of token in input buffer */
int *piPosition /* OUT: Number of tokens returned before this one */
);
};
struct sqlite3_tokenizer {
const sqlite3_tokenizer_module *pModule; /* The module for this tokenizer */
/* Tokenizer implementations will typically add additional fields */
};
struct sqlite3_tokenizer_cursor {
sqlite3_tokenizer *pTokenizer; /* Tokenizer for this cursor. */
/* Tokenizer implementations will typically add additional fields */
};
int fts3_global_term_cnt(int iTerm, int iCol);
int fts3_term_cnt(int iTerm, int iCol);
#endif /* _FTS3_TOKENIZER_H_ */
/************** End of fts3_tokenizer.h **************************************/
/************** Begin file fts3_tokenizer1.c *********************************/
/*
** 2006 Oct 10
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
******************************************************************************
**
** Implementation of the "simple" full-text-search tokenizer.
*/
/*
** The code in this file is only compiled if:
**
** * The FTS3 module is being built as an extension
** (in which case SQLITE_CORE is not defined), or
**
** * The FTS3 module is being built into the core of
** SQLite (in which case SQLITE_ENABLE_FTS3 is defined).
*/
#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
typedef struct simple_tokenizer {
sqlite3_tokenizer base;
char delim[128]; /* flag ASCII delimiters */
} simple_tokenizer;
typedef struct simple_tokenizer_cursor {
sqlite3_tokenizer_cursor base;
const char *pInput; /* input we are tokenizing */
int nBytes; /* size of the input */
int iOffset; /* current position in pInput */
int iToken; /* index of next token to be returned */
char *pToken; /* storage for current token */
int nTokenAllocated; /* space allocated to zToken buffer */
} simple_tokenizer_cursor;
static int simpleDelim(simple_tokenizer *t, unsigned char c){
return c<0x80 && t->delim[c];
}
/*
** Create a new tokenizer instance.
*/
static int simpleCreate(
int argc, const char * const *argv,
sqlite3_tokenizer **ppTokenizer
){
simple_tokenizer *t;
t = (simple_tokenizer *) sqlite3_malloc(sizeof(*t));
if( t==NULL ) return SQLITE_NOMEM;
memset(t, 0, sizeof(*t));
/* TODO(shess) Delimiters need to remain the same from run to run,
** else we need to reindex. One solution would be a meta-table to
** track such information in the database, then we'd only want this
** information on the initial create.
*/
if( argc>1 ){
int i, n = (int)strlen(argv[1]);
for(i=0; i<n; i++){
unsigned char ch = argv[1][i];
/* We explicitly don't support UTF-8 delimiters for now. */
if( ch>=0x80 ){
sqlite3_free(t);
return SQLITE_ERROR;
}
t->delim[ch] = 1;
}
} else {
/* Mark whitespace as a delimiter */
int i;
for(i=1; i<0x80; i++){
t->delim[i] = !isgraph(i);
}
t->delim[39] = 0; // apostrophe
t->delim[42] = 1; // asterisk
}
*ppTokenizer = &t->base;
return SQLITE_OK;
}
/*
** Destroy a tokenizer
*/
static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
sqlite3_free(pTokenizer);
return SQLITE_OK;
}
/*
** Prepare to begin tokenizing a particular string. The input
** string to be tokenized is pInput[0..nBytes-1]. A cursor
** used to incrementally tokenize this string is returned in
** *ppCursor.
*/
static int simpleOpen(
sqlite3_tokenizer *pTokenizer, /* The tokenizer */
const char *pInput, int nBytes, /* String to be tokenized */
sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
){
simple_tokenizer_cursor *c;
UNUSED_PARAMETER(pTokenizer);
c = (simple_tokenizer_cursor *) sqlite3_malloc(sizeof(*c));
if( c==NULL ) return SQLITE_NOMEM;
c->pInput = pInput;
if( pInput==0 ){
c->nBytes = 0;
}else if( nBytes<0 ){
c->nBytes = (int)strlen(pInput);
}else{
c->nBytes = nBytes;
}
c->iOffset = 0; /* start tokenizing at the beginning */
c->iToken = 0;
c->pToken = NULL; /* no space allocated, yet. */
c->nTokenAllocated = 0;
*ppCursor = &c->base;
return SQLITE_OK;
}
/*
** Close a tokenization cursor previously opened by a call to
** simpleOpen() above.
*/
static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
sqlite3_free(c->pToken);
sqlite3_free(c);
return SQLITE_OK;
}
/*
** Extract the next token from a tokenization cursor. The cursor must
** have been opened by a prior call to simpleOpen().
*/
static int simpleNext(
sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */
const char **ppToken, /* OUT: *ppToken is the token text */
int *pnBytes, /* OUT: Number of bytes in token */
int *piStartOffset, /* OUT: Starting offset of token */
int *piEndOffset, /* OUT: Ending offset of token */
int *piPosition /* OUT: Position integer of token */
){
simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
unsigned char *p = (unsigned char *)c->pInput;
while( c->iOffset<c->nBytes ){
int iStartOffset;
/* Scan past delimiter characters */
while( c->iOffset<c->nBytes && simpleDelim(t, p[c->iOffset]) ){
c->iOffset++;
}
/* Count non-delimiter characters. */
iStartOffset = c->iOffset;
while( c->iOffset<c->nBytes && !simpleDelim(t, p[c->iOffset]) ){
c->iOffset++;
}
if( c->iOffset>iStartOffset ){
int i, n = c->iOffset-iStartOffset;
if( n>c->nTokenAllocated ){
c->nTokenAllocated = n+20;
c->pToken = sqlite3_realloc(c->pToken, c->nTokenAllocated);
if( c->pToken==NULL ) return SQLITE_NOMEM;
}
for(i=0; i<n; i++){
/* TODO(shess) This needs expansion to handle UTF-8
** case-insensitivity.
*/
unsigned char ch = p[iStartOffset+i];
c->pToken[i] = (char)(ch<0x80 ? tolower(ch) : ch);
}
*ppToken = c->pToken;
*pnBytes = n;
*piStartOffset = iStartOffset;
*piEndOffset = c->iOffset;
*piPosition = c->iToken++;
return SQLITE_OK;
}
}
return SQLITE_DONE;
}
/*
** The set of routines that implement the simple tokenizer
*/
static const sqlite3_tokenizer_module simpleTokenizerModule = {
0,
simpleCreate,
simpleDestroy,
simpleOpen,
simpleClose,
simpleNext,
};
/*
** Allocate a new simple tokenizer. Return a pointer to the new
** tokenizer in *ppModule
*/
SQLITE_PRIVATE void sqlite3Fts3SimpleTokenizerModule(
sqlite3_tokenizer_module const**ppModule
){
*ppModule = &simpleTokenizerModule;
}
#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */
/************** End of fts3_tokenizer1.c *************************************/
/* Our own code */
void* get_tokenizer_ptr() {
sqlite3_tokenizer_module *p;
sqlite3Fts3SimpleTokenizerModule(&p);
return (void *)p;
}
END
use Data::Dump 'dump';
use DBD::SQLite;
use DBI ':sql_types';
use Test::More tests => 4;
my $dbh = DBI->connect(
'dbi:SQLite:dbname=:memory:',
'',
'',
{
sqlite_unicode => 1,
RaiseError => 1,
},
);
my $ptr = get_tokenizer_ptr();
my $pptr = pack('L!', $ptr);
my $sth = $dbh->prepare('SELECT fts3_tokenizer(?, ?)');
$sth->bind_param(1, 'nonwhitespace');
$sth->bind_param(2, $pptr, SQL_BLOB);
$sth->execute();
# simple
$dbh->do('CREATE VIRTUAL TABLE foo1 USING fts3()');
$dbh->do("INSERT INTO foo1 VALUES('bar baz: : quux')");
my $first = $dbh->selectrow_array(qq{SELECT * FROM foo1 WHERE content MATCH '"baz*"'});
is($first, 'bar baz: : quux', 'simple tokenizer finds alphanumeric token');
my $second = $dbh->selectrow_array(qq{SELECT * FROM foo1 WHERE content MATCH '":*"'});
is($second, undef, "simple tokenizer doesn't find non-alphanumeric token");
# nonwhitespace
$dbh->do('CREATE VIRTUAL TABLE foo2 USING fts3(tokenize=nonwhitespace)');
$dbh->do("INSERT INTO foo2 VALUES('bar baz: : quux')");
$first = $dbh->selectrow_array(qq{SELECT * FROM foo2 WHERE content MATCH '"baz*"'});
is($first, 'bar baz: : quux', 'nonwhitespace finds alphanumeric token');
$second = $dbh->selectrow_array(qq{SELECT * FROM foo2 WHERE content MATCH '":*"'});
is($first, 'bar baz: : quux', 'nonwhitespace finds non-alphanumeric token');
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment