hinrik/gist:305741

## gistfile1.PL
use strict;
use warnings;
use Inline C => Config => LIBS => '-lsqlite3';
use Inline C => <<'END';
#include <sqlite3.h>

/* Not provided by sqlite3.h */
#define SQLITE_PRIVATE static
#define SQLITE_ENABLE_FTS3 1
#define UNUSED_PARAMETER(x) (void)(x)
#define UNUSED_PARAMETER2(x,y) UNUSED_PARAMETER(x),UNUSED_PARAMETER(y)

/************** Begin file fts3_tokenizer.h **********************************/
/*
** 2006 July 10
**
** The author disclaims copyright to this source code.
**
*************************************************************************
** Defines the interface to tokenizers used by fulltext-search.  There
** are three basic components:
**
** sqlite3_tokenizer_module is a singleton defining the tokenizer
** interface functions.  This is essentially the class structure for
** tokenizers.
**
** sqlite3_tokenizer is used to define a particular tokenizer, perhaps
** including customization information defined at creation time.
**
** sqlite3_tokenizer_cursor is generated by a tokenizer to generate
** tokens from a particular input.
*/
#ifndef _FTS3_TOKENIZER_H_
#define _FTS3_TOKENIZER_H_

/* TODO(shess) Only used for SQLITE_OK and SQLITE_DONE at this time.
** If tokenizers are to be allowed to call sqlite3_*() functions, then
** we will need a way to register the API consistently.
*/

/*
** Structures used by the tokenizer interface. When a new tokenizer
** implementation is registered, the caller provides a pointer to
** an sqlite3_tokenizer_module containing pointers to the callback
** functions that make up an implementation.
**
** When an fts3 table is created, it passes any arguments passed to
** the tokenizer clause of the CREATE VIRTUAL TABLE statement to the
** sqlite3_tokenizer_module.xCreate() function of the requested tokenizer
** implementation. The xCreate() function in turn returns an
** sqlite3_tokenizer structure representing the specific tokenizer to
** be used for the fts3 table (customized by the tokenizer clause arguments).
**
** To tokenize an input buffer, the sqlite3_tokenizer_module.xOpen()
** method is called. It returns an sqlite3_tokenizer_cursor object
** that may be used to tokenize a specific input buffer based on
** the tokenization rules supplied by a specific sqlite3_tokenizer
** object.
*/
typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module;
typedef struct sqlite3_tokenizer sqlite3_tokenizer;
typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor;

/*
** Table of callbacks that defines one tokenizer implementation. A static
** instance of this struct (simpleTokenizerModule) is filled in further
** down, and its address is what the Perl side hands to fts3_tokenizer().
**
** NOTE(review): this declaration is a local copy of SQLite's
** fts3_tokenizer.h; member order and types presumably have to match the
** library's own definition exactly — confirm before changing anything.
*/
struct sqlite3_tokenizer_module {

  /*
  ** Structure version. Should always be set to 0.
  */
  int iVersion;

  /*
  ** Create a new tokenizer. The values in the argv[] array are the
  ** arguments passed to the "tokenizer" clause of the CREATE VIRTUAL
  ** TABLE statement that created the fts3 table. For example, if
  ** the following SQL is executed:
  **
  **   CREATE .. USING fts3( ... , tokenizer <tokenizer-name> arg1 arg2)
  **
  ** then argc is set to 2, and the argv[] array contains pointers
  ** to the strings "arg1" and "arg2".
  **
  ** NOTE(review): simpleCreate() in this file reads its delimiter string
  ** from argv[1] and only when argc>1 — confirm which index carries the
  ** first user-supplied argument.
  **
  ** This method should return either SQLITE_OK (0), or an SQLite error
  ** code. If SQLITE_OK is returned, then *ppTokenizer should be set
  ** to point at the newly created tokenizer structure. The generic
  ** sqlite3_tokenizer.pModule variable should not be initialised by
  ** this callback. The caller will do so.
  */
  int (*xCreate)(
    int argc,                           /* Size of argv array */
    const char *const*argv,             /* Tokenizer argument strings */
    sqlite3_tokenizer **ppTokenizer     /* OUT: Created tokenizer */
  );

  /*
  ** Destroy an existing tokenizer. The fts3 module calls this method
  ** exactly once for each successful call to xCreate().
  */
  int (*xDestroy)(sqlite3_tokenizer *pTokenizer);

  /*
  ** Create a tokenizer cursor to tokenize an input buffer. The caller
  ** is responsible for ensuring that the input buffer remains valid
  ** until the cursor is closed (using the xClose() method).
  */
  int (*xOpen)(
    sqlite3_tokenizer *pTokenizer,       /* Tokenizer object */
    const char *pInput, int nBytes,      /* Input buffer */
    sqlite3_tokenizer_cursor **ppCursor  /* OUT: Created tokenizer cursor */
  );

  /*
  ** Destroy an existing tokenizer cursor. The fts3 module calls this
  ** method exactly once for each successful call to xOpen().
  */
  int (*xClose)(sqlite3_tokenizer_cursor *pCursor);

  /*
  ** Retrieve the next token from the tokenizer cursor pCursor. This
  ** method should either return SQLITE_OK and set the values of the
  ** "OUT" variables identified below, or SQLITE_DONE to indicate that
  ** the end of the buffer has been reached, or an SQLite error code.
  **
  ** *ppToken should be set to point at a buffer containing the
  ** normalized version of the token (i.e. after any case-folding and/or
  ** stemming has been performed). *pnBytes should be set to the length
  ** of this buffer in bytes. The input text that generated the token is
  ** identified by the byte offsets returned in *piStartOffset and
  ** *piEndOffset. *piStartOffset should be set to the index of the first
  ** byte of the token in the input buffer. *piEndOffset should be set
  ** to the index of the first byte just past the end of the token in
  ** the input buffer.
  **
  ** The buffer *ppToken is set to point at is managed by the tokenizer
  ** implementation. It is only required to be valid until the next call
  ** to xNext() or xClose().
  */
  /* TODO(shess) current implementation requires pInput to be
  ** nul-terminated.  This should either be fixed, or pInput/nBytes
  ** should be converted to zInput.
  */
  int (*xNext)(
    sqlite3_tokenizer_cursor *pCursor,   /* Tokenizer cursor */
    const char **ppToken, int *pnBytes,  /* OUT: Normalized text for token */
    int *piStartOffset,  /* OUT: Byte offset of token in input buffer */
    int *piEndOffset,    /* OUT: Byte offset of end of token in input buffer */
    int *piPosition      /* OUT: Number of tokens returned before this one */
  );
};

/*
** Generic part of a tokenizer instance. Implementations embed this as
** their first member (see simple_tokenizer) and append their own state.
*/
struct sqlite3_tokenizer {
  const sqlite3_tokenizer_module *pModule;  /* The module for this tokenizer */
  /* Tokenizer implementations will typically add additional fields */
};

/*
** Generic part of a tokenization cursor. Implementations embed this as
** their first member (see simple_tokenizer_cursor) and append their own
** state. simpleNext() reads pTokenizer to reach the delimiter table.
*/
struct sqlite3_tokenizer_cursor {
  sqlite3_tokenizer *pTokenizer;       /* Tokenizer for this cursor. */
  /* Tokenizer implementations will typically add additional fields */
};

int fts3_global_term_cnt(int iTerm, int iCol);
int fts3_term_cnt(int iTerm, int iCol);


#endif /* _FTS3_TOKENIZER_H_ */

/************** End of fts3_tokenizer.h **************************************/

/************** Begin file fts3_tokenizer1.c *********************************/
/*
** 2006 Oct 10
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
******************************************************************************
**
** Implementation of the "simple" full-text-search tokenizer.
*/

/*
** The code in this file is only compiled if:
**
**     * The FTS3 module is being built as an extension
**       (in which case SQLITE_CORE is not defined), or
**
**     * The FTS3 module is being built into the core of
**       SQLite (in which case SQLITE_ENABLE_FTS3 is defined).
*/
#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)

/*
** Tokenizer instance for this implementation: the generic base plus a
** lookup table indexed by ASCII code (0..127). A non-zero entry marks
** that character as a token delimiter.
*/
typedef struct simple_tokenizer {
  sqlite3_tokenizer base;      /* must stay first: pointers are cast between base and this type */
  char delim[128];             /* flag ASCII delimiters */
} simple_tokenizer;

/*
** Cursor state for one tokenization pass over an input buffer. The input
** itself is not copied (only the pointer is stored); pToken is a scratch
** buffer owned by the cursor and released in simpleClose().
*/
typedef struct simple_tokenizer_cursor {
  sqlite3_tokenizer_cursor base; /* must stay first: pointers are cast between base and this type */
  const char *pInput;          /* input we are tokenizing */
  int nBytes;                  /* size of the input */
  int iOffset;                 /* current position in pInput */
  int iToken;                  /* index of next token to be returned */
  char *pToken;                /* storage for current token */
  int nTokenAllocated;         /* space allocated to pToken buffer */
} simple_tokenizer_cursor;


/*
** Return 1 if byte c is an ASCII character flagged in t->delim[],
** otherwise 0. Bytes >= 0x80 are never treated as delimiters.
*/
static int simpleDelim(simple_tokenizer *t, unsigned char c){
  if( c>=0x80 ) return 0;
  return t->delim[c]!=0;
}

/*
** xCreate: allocate and initialise a tokenizer instance.
**
** When more than one argument is supplied, every byte of argv[1] is
** flagged as a delimiter; a byte >= 0x80 in that string makes the call
** fail with SQLITE_ERROR. Otherwise a built-in table is used: every
** code 1..127 for which isgraph() is false is a delimiter, the
** apostrophe is not a delimiter, and the asterisk is.
**
** Returns SQLITE_OK with *ppTokenizer set, or SQLITE_NOMEM if the
** allocation fails.
*/
static int simpleCreate(
  int argc, const char * const *argv,
  sqlite3_tokenizer **ppTokenizer
){
  simple_tokenizer *pNew;

  pNew = (simple_tokenizer *)sqlite3_malloc(sizeof(simple_tokenizer));
  if( !pNew ) return SQLITE_NOMEM;
  memset(pNew, 0, sizeof(simple_tokenizer));

  /* TODO(shess) Delimiters need to remain the same from run to run,
  ** else we need to reindex.  One solution would be a meta-table to
  ** track such information in the database, then we'd only want this
  ** information on the initial create.
  */
  if( argc<=1 ){
    /* Built-in table: everything that is not a printable, non-space
    ** character splits tokens. Code 0 is left unflagged. */
    int code;
    for(code=1; code<0x80; code++){
      pNew->delim[code] = isgraph(code) ? 0 : 1;
    }
    pNew->delim[39] = 0;   /* apostrophe stays part of a token */
    pNew->delim[42] = 1;   /* asterisk always splits tokens */
  }else{
    const char *zDelim = argv[1];
    int nDelim = (int)strlen(zDelim);
    int k = 0;
    while( k<nDelim ){
      unsigned char ch = (unsigned char)zDelim[k++];
      /* We explicitly don't support UTF-8 delimiters for now. */
      if( ch & 0x80 ){
        sqlite3_free(pNew);
        return SQLITE_ERROR;
      }
      pNew->delim[ch] = 1;
    }
  }

  *ppTokenizer = &pNew->base;
  return SQLITE_OK;
}

/*
** xDestroy: release a tokenizer allocated by simpleCreate().
** Always returns SQLITE_OK.
*/
static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
  simple_tokenizer *t = (simple_tokenizer *)pTokenizer;
  sqlite3_free(t);
  return SQLITE_OK;
}

/*
** xOpen: create a cursor for tokenizing pInput.
**
** The effective input length is 0 when pInput is NULL, strlen(pInput)
** when nBytes is negative, and nBytes otherwise. The input is not
** copied; the cursor keeps the caller's pointer.
**
** Only the simple_tokenizer_cursor fields are initialised here; the
** embedded base struct is left untouched (NOTE(review): presumably the
** caller fills in base.pTokenizer, which simpleNext() reads — confirm).
**
** Returns SQLITE_OK with *ppCursor set, or SQLITE_NOMEM if the
** allocation fails.
*/
static int simpleOpen(
  sqlite3_tokenizer *pTokenizer,         /* The tokenizer (unused) */
  const char *pInput, int nBytes,        /* String to be tokenized */
  sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
){
  simple_tokenizer_cursor *pCsr;
  int nInput;

  UNUSED_PARAMETER(pTokenizer);

  pCsr = (simple_tokenizer_cursor *)sqlite3_malloc(sizeof(simple_tokenizer_cursor));
  if( !pCsr ) return SQLITE_NOMEM;

  if( !pInput ){
    nInput = 0;
  }else{
    nInput = nBytes<0 ? (int)strlen(pInput) : nBytes;
  }

  pCsr->pInput = pInput;
  pCsr->nBytes = nInput;
  pCsr->iOffset = 0;           /* tokenizing starts at the first byte */
  pCsr->iToken = 0;
  pCsr->pToken = 0;            /* token buffer is allocated on demand */
  pCsr->nTokenAllocated = 0;

  *ppCursor = &pCsr->base;
  return SQLITE_OK;
}

/*
** xClose: dispose of a cursor created by simpleOpen(). Frees the token
** buffer (which may still be NULL if no token was ever produced) and
** then the cursor itself. Always returns SQLITE_OK.
*/
static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
  simple_tokenizer_cursor *pCsr = (simple_tokenizer_cursor *)pCursor;
  char *zBuf = pCsr->pToken;

  sqlite3_free(zBuf);
  sqlite3_free(pCsr);
  return SQLITE_OK;
}

/*
** xNext: extract the next token from a cursor opened by simpleOpen().
**
** Skips any run of delimiter bytes at the current offset, then consumes
** the following run of non-delimiter bytes as one token. The token is
** copied into the cursor's own buffer; bytes below 0x80 are passed
** through tolower(), bytes >= 0x80 are copied unchanged.
**
** On SQLITE_OK:
**   *ppToken        points at the token text (not nul-terminated; the
**                   buffer belongs to the cursor and is reused by the
**                   next call)
**   *pnBytes        token length in bytes
**   *piStartOffset  offset of the token's first byte in the input
**   *piEndOffset    offset one past the token's last byte
**   *piPosition     zero-based index of this token
**
** Returns SQLITE_DONE when the input is exhausted, or SQLITE_NOMEM if
** the token buffer cannot be grown. On SQLITE_NOMEM the previous buffer
** and its recorded size are left intact, so simpleClose() still frees it.
*/
static int simpleNext(
  sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by simpleOpen */
  const char **ppToken,               /* OUT: *ppToken is the token text */
  int *pnBytes,                       /* OUT: Number of bytes in token */
  int *piStartOffset,                 /* OUT: Starting offset of token */
  int *piEndOffset,                   /* OUT: Ending offset of token */
  int *piPosition                     /* OUT: Position integer of token */
){
  simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
  simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
  unsigned char *p = (unsigned char *)c->pInput;

  while( c->iOffset<c->nBytes ){
    int iStartOffset;

    /* Scan past delimiter characters */
    while( c->iOffset<c->nBytes && simpleDelim(t, p[c->iOffset]) ){
      c->iOffset++;
    }

    /* Count non-delimiter characters. */
    iStartOffset = c->iOffset;
    while( c->iOffset<c->nBytes && !simpleDelim(t, p[c->iOffset]) ){
      c->iOffset++;
    }

    if( c->iOffset>iStartOffset ){
      int i, n = c->iOffset-iStartOffset;
      if( n>c->nTokenAllocated ){
        /* Grow via a temporary: if the reallocation fails the old block
        ** is still allocated, so the cursor must keep pointing at it
        ** (and keep its old size) rather than be overwritten with NULL. */
        int nNew = n+20;
        char *pNew = (char *)sqlite3_realloc(c->pToken, nNew);
        if( pNew==0 ) return SQLITE_NOMEM;
        c->pToken = pNew;
        c->nTokenAllocated = nNew;
      }
      for(i=0; i<n; i++){
        /* TODO(shess) This needs expansion to handle UTF-8
        ** case-insensitivity.
        */
        unsigned char ch = p[iStartOffset+i];
        c->pToken[i] = (char)(ch<0x80 ? tolower(ch) : ch);
      }
      *ppToken = c->pToken;
      *pnBytes = n;
      *piStartOffset = iStartOffset;
      *piEndOffset = c->iOffset;
      *piPosition = c->iToken++;

      return SQLITE_OK;
    }
  }
  return SQLITE_DONE;
}

/*
** The set of routines that implement the simple tokenizer
*/
static const sqlite3_tokenizer_module simpleTokenizerModule = {
  0,
  simpleCreate,
  simpleDestroy,
  simpleOpen,
  simpleClose,
  simpleNext,
};

/*
** Allocate a new simple tokenizer.  Return a pointer to the new
** tokenizer in *ppModule
*/
SQLITE_PRIVATE void sqlite3Fts3SimpleTokenizerModule(
  sqlite3_tokenizer_module const**ppModule
){
  *ppModule = &simpleTokenizerModule;
}

#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */

/************** End of fts3_tokenizer1.c *************************************/

/* Our own code */

/*
** Return the address of the static tokenizer callback table as an
** opaque pointer. The Perl part of this script calls this function and
** packs the address into a blob for fts3_tokenizer().
**
** The local is declared pointer-to-const so that &p has exactly the type
** sqlite3Fts3SimpleTokenizerModule() expects; constness is dropped only
** by the explicit cast on return.
*/
void* get_tokenizer_ptr() {
    const sqlite3_tokenizer_module *p = 0;
    sqlite3Fts3SimpleTokenizerModule(&p);
    return (void *)p;
}
END


use Data::Dump 'dump';
use DBD::SQLite;
use DBI ':sql_types';
use Test::More tests => 4;

# Throwaway in-memory database. RaiseError makes every failed DBI call
# die, so the checks below never run against a half-built schema.
my $dbh = DBI->connect(
    'dbi:SQLite:dbname=:memory:',
    '',
    '',
    {
        sqlite_unicode => 1,
        RaiseError => 1,
    },
);

# The tokenizer module is registered by passing its address to
# fts3_tokenizer() as a blob holding a native pointer. 'L!' (native long)
# is only pointer-sized where long and pointers have the same width, so
# the pack template is chosen from this perl's build configuration.
use Config;
my $ptr = get_tokenizer_ptr();
my $ptr_template = $Config{ptrsize} == $Config{longsize} ? 'L!'
                 : $Config{ptrsize} == 8                  ? 'Q'
                 :                                          'L';
my $pptr = pack($ptr_template, $ptr);

# Register the C tokenizer under the name 'nonwhitespace'. The pointer is
# supplied as a bound BLOB parameter rather than interpolated into SQL.
my $sth = $dbh->prepare('SELECT fts3_tokenizer(?, ?)');
$sth->bind_param(1, 'nonwhitespace');
$sth->bind_param(2, $pptr, SQL_BLOB);
$sth->execute();
$sth->finish();    # the result row is not needed; release the statement

# simple
$dbh->do('CREATE VIRTUAL TABLE foo1 USING fts3()');
$dbh->do("INSERT INTO foo1 VALUES('bar baz: : quux')");
my $first = $dbh->selectrow_array(qq{SELECT * FROM foo1 WHERE content MATCH '"baz*"'});
is($first, 'bar baz: : quux', 'simple tokenizer finds alphanumeric token');
my $second = $dbh->selectrow_array(qq{SELECT * FROM foo1 WHERE content MATCH '":*"'});
is($second, undef, "simple tokenizer doesn't find non-alphanumeric token");

# nonwhitespace
$dbh->do('CREATE VIRTUAL TABLE foo2 USING fts3(tokenize=nonwhitespace)');
$dbh->do("INSERT INTO foo2 VALUES('bar baz: : quux')");
$first = $dbh->selectrow_array(qq{SELECT * FROM foo2 WHERE content MATCH '"baz*"'});
is($first, 'bar baz: : quux', 'nonwhitespace finds alphanumeric token');
$second = $dbh->selectrow_array(qq{SELECT * FROM foo2 WHERE content MATCH '":*"'});
# Check the result of the ':' query itself; asserting on $first here would
# merely repeat the previous test.
is($second, 'bar baz: : quux', 'nonwhitespace finds non-alphanumeric token');
	use strict;
	use warnings;
	use Inline C => Config => LIBS => '-lsqlite3';
	use Inline C => <<'END';
	#include <sqlite3.h>

	/* Not included in sqlite.h */
	#define SQLITE_PRIVATE static
	#define SQLITE_ENABLE_FTS3 1
	#define UNUSED_PARAMETER(x) (void)(x)
	#define UNUSED_PARAMETER2(x,y) UNUSED_PARAMETER(x),UNUSED_PARAMETER(y)

	/************ Begin file fts3_tokenizer.h ********************************/
	/*
	** 2006 July 10
	**
	** The author disclaims copyright to this source code.
	**
	*************************************************************************
	** Defines the interface to tokenizers used by fulltext-search. There
	** are three basic components:
	**
	** sqlite3_tokenizer_module is a singleton defining the tokenizer
	** interface functions. This is essentially the class structure for
	** tokenizers.
	**
	** sqlite3_tokenizer is used to define a particular tokenizer, perhaps
	** including customization information defined at creation time.
	**
	** sqlite3_tokenizer_cursor is generated by a tokenizer to generate
	** tokens from a particular input.
	*/
	#ifndef _FTS3_TOKENIZER_H_
	#define _FTS3_TOKENIZER_H_

	/* TODO(shess) Only used for SQLITE_OK and SQLITE_DONE at this time.
	** If tokenizers are to be allowed to call sqlite3_*() functions, then
	** we will need a way to register the API consistently.
	*/

	/*
	** Structures used by the tokenizer interface. When a new tokenizer
	** implementation is registered, the caller provides a pointer to
	** an sqlite3_tokenizer_module containing pointers to the callback
	** functions that make up an implementation.
	**
	** When an fts3 table is created, it passes any arguments passed to
	** the tokenizer clause of the CREATE VIRTUAL TABLE statement to the
	** sqlite3_tokenizer_module.xCreate() function of the requested tokenizer
	** implementation. The xCreate() function in turn returns an
	** sqlite3_tokenizer structure representing the specific tokenizer to
	** be used for the fts3 table (customized by the tokenizer clause arguments).
	**
	** To tokenize an input buffer, the sqlite3_tokenizer_module.xOpen()
	** method is called. It returns an sqlite3_tokenizer_cursor object
	** that may be used to tokenize a specific input buffer based on
	** the tokenization rules supplied by a specific sqlite3_tokenizer
	** object.
	*/
	typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module;
	typedef struct sqlite3_tokenizer sqlite3_tokenizer;
	typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor;

	/*
	** Table of callbacks that defines one tokenizer implementation. A static
	** instance of this struct (simpleTokenizerModule) is filled in further
	** down, and its address is what the Perl side hands to fts3_tokenizer().
	**
	** NOTE(review): this declaration is a local copy of SQLite's
	** fts3_tokenizer.h; member order and types presumably have to match the
	** library's own definition exactly — confirm before changing anything.
	*/
	struct sqlite3_tokenizer_module {

	  /*
	  ** Structure version. Should always be set to 0.
	  */
	  int iVersion;

	  /*
	  ** Create a new tokenizer. The values in the argv[] array are the
	  ** arguments passed to the "tokenizer" clause of the CREATE VIRTUAL
	  ** TABLE statement that created the fts3 table. For example, if
	  ** the following SQL is executed:
	  **
	  **   CREATE .. USING fts3( ... , tokenizer <tokenizer-name> arg1 arg2)
	  **
	  ** then argc is set to 2, and the argv[] array contains pointers
	  ** to the strings "arg1" and "arg2".
	  **
	  ** This method should return either SQLITE_OK (0), or an SQLite error
	  ** code. If SQLITE_OK is returned, then *ppTokenizer should be set
	  ** to point at the newly created tokenizer structure. The generic
	  ** sqlite3_tokenizer.pModule variable should not be initialised by
	  ** this callback. The caller will do so.
	  */
	  int (*xCreate)(
	    int argc,                           /* Size of argv array */
	    const char *const*argv,             /* Tokenizer argument strings */
	    sqlite3_tokenizer **ppTokenizer     /* OUT: Created tokenizer */
	  );

	  /*
	  ** Destroy an existing tokenizer. The fts3 module calls this method
	  ** exactly once for each successful call to xCreate().
	  */
	  int (*xDestroy)(sqlite3_tokenizer *pTokenizer);

	  /*
	  ** Create a tokenizer cursor to tokenize an input buffer. The caller
	  ** is responsible for ensuring that the input buffer remains valid
	  ** until the cursor is closed (using the xClose() method).
	  */
	  int (*xOpen)(
	    sqlite3_tokenizer *pTokenizer,       /* Tokenizer object */
	    const char *pInput, int nBytes,      /* Input buffer */
	    sqlite3_tokenizer_cursor **ppCursor  /* OUT: Created tokenizer cursor */
	  );

	  /*
	  ** Destroy an existing tokenizer cursor. The fts3 module calls this
	  ** method exactly once for each successful call to xOpen().
	  */
	  int (*xClose)(sqlite3_tokenizer_cursor *pCursor);

	  /*
	  ** Retrieve the next token from the tokenizer cursor pCursor. This
	  ** method should either return SQLITE_OK and set the values of the
	  ** "OUT" variables identified below, or SQLITE_DONE to indicate that
	  ** the end of the buffer has been reached, or an SQLite error code.
	  **
	  ** *ppToken should be set to point at a buffer containing the
	  ** normalized version of the token (i.e. after any case-folding and/or
	  ** stemming has been performed). *pnBytes should be set to the length
	  ** of this buffer in bytes. The input text that generated the token is
	  ** identified by the byte offsets returned in *piStartOffset and
	  ** *piEndOffset. *piStartOffset should be set to the index of the first
	  ** byte of the token in the input buffer. *piEndOffset should be set
	  ** to the index of the first byte just past the end of the token in
	  ** the input buffer.
	  **
	  ** The buffer *ppToken is set to point at is managed by the tokenizer
	  ** implementation. It is only required to be valid until the next call
	  ** to xNext() or xClose().
	  */
	  /* TODO(shess) current implementation requires pInput to be
	  ** nul-terminated. This should either be fixed, or pInput/nBytes
	  ** should be converted to zInput.
	  */
	  int (*xNext)(
	    sqlite3_tokenizer_cursor *pCursor,   /* Tokenizer cursor */
	    const char **ppToken, int *pnBytes,  /* OUT: Normalized text for token */
	    int *piStartOffset,  /* OUT: Byte offset of token in input buffer */
	    int *piEndOffset,    /* OUT: Byte offset of end of token in input buffer */
	    int *piPosition      /* OUT: Number of tokens returned before this one */
	  );
	};

	/*
	** Generic part of a tokenizer instance. Implementations embed this as
	** their first member (see simple_tokenizer) and append their own state.
	*/
	struct sqlite3_tokenizer {
	  const sqlite3_tokenizer_module *pModule;  /* The module for this tokenizer */
	  /* Tokenizer implementations will typically add additional fields */
	};

	/*
	** Generic part of a tokenization cursor. Implementations embed this as
	** their first member (see simple_tokenizer_cursor) and append their own
	** state. simpleNext() reads pTokenizer to reach the delimiter table.
	*/
	struct sqlite3_tokenizer_cursor {
	  sqlite3_tokenizer *pTokenizer;       /* Tokenizer for this cursor. */
	  /* Tokenizer implementations will typically add additional fields */
	};

	int fts3_global_term_cnt(int iTerm, int iCol);
	int fts3_term_cnt(int iTerm, int iCol);


	#endif /* _FTS3_TOKENIZER_H_ */

	/************ End of fts3_tokenizer.h ************************************/

	/************ Begin file fts3_tokenizer1.c *******************************/
	/*
	** 2006 Oct 10
	**
	** The author disclaims copyright to this source code. In place of
	** a legal notice, here is a blessing:
	**
	** May you do good and not evil.
	** May you find forgiveness for yourself and forgive others.
	** May you share freely, never taking more than you give.
	**
	******************************************************************************
	**
	** Implementation of the "simple" full-text-search tokenizer.
	*/

	/*
	** The code in this file is only compiled if:
	**
	** * The FTS3 module is being built as an extension
	** (in which case SQLITE_CORE is not defined), or
	**
	** * The FTS3 module is being built into the core of
	** SQLite (in which case SQLITE_ENABLE_FTS3 is defined).
	*/
	#if !defined(SQLITE_CORE) \|\| defined(SQLITE_ENABLE_FTS3)

	typedef struct simple_tokenizer {
	sqlite3_tokenizer base;
	char delim[128]; /* flag ASCII delimiters */
	} simple_tokenizer;

	/*
	** Cursor state for one tokenization pass over an input buffer. The input
	** itself is not copied (only the pointer is stored); pToken is a scratch
	** buffer owned by the cursor and released in simpleClose().
	*/
	typedef struct simple_tokenizer_cursor {
	  sqlite3_tokenizer_cursor base; /* must stay first: pointers are cast between base and this type */
	  const char *pInput;          /* input we are tokenizing */
	  int nBytes;                  /* size of the input */
	  int iOffset;                 /* current position in pInput */
	  int iToken;                  /* index of next token to be returned */
	  char *pToken;                /* storage for current token */
	  int nTokenAllocated;         /* space allocated to pToken buffer */
	} simple_tokenizer_cursor;


	/*
	** Return 1 if byte c is an ASCII character flagged in t->delim[],
	** otherwise 0. Bytes >= 0x80 are never treated as delimiters.
	*/
	static int simpleDelim(simple_tokenizer *t, unsigned char c){
	  if( c>=0x80 ) return 0;
	  return t->delim[c]!=0;
	}

	/*
	** xCreate: allocate and initialise a tokenizer instance.
	**
	** When more than one argument is supplied, every byte of argv[1] is
	** flagged as a delimiter; a byte >= 0x80 in that string makes the call
	** fail with SQLITE_ERROR. Otherwise a built-in table is used: every
	** code 1..127 for which isgraph() is false is a delimiter, the
	** apostrophe is not a delimiter, and the asterisk is.
	**
	** Returns SQLITE_OK with *ppTokenizer set, or SQLITE_NOMEM if the
	** allocation fails.
	*/
	static int simpleCreate(
	  int argc, const char * const *argv,
	  sqlite3_tokenizer **ppTokenizer
	){
	  simple_tokenizer *t;

	  /* Allocate the whole struct (sizeof(*t)), not just a pointer's worth:
	  ** the memset and the delim[] writes below cover the full object. */
	  t = (simple_tokenizer *) sqlite3_malloc(sizeof(*t));
	  if( t==NULL ) return SQLITE_NOMEM;
	  memset(t, 0, sizeof(*t));

	  /* TODO(shess) Delimiters need to remain the same from run to run,
	  ** else we need to reindex. One solution would be a meta-table to
	  ** track such information in the database, then we'd only want this
	  ** information on the initial create.
	  */
	  if( argc>1 ){
	    int i, n = (int)strlen(argv[1]);
	    for(i=0; i<n; i++){
	      unsigned char ch = argv[1][i];
	      /* We explicitly don't support UTF-8 delimiters for now. */
	      if( ch>=0x80 ){
	        sqlite3_free(t);
	        return SQLITE_ERROR;
	      }
	      t->delim[ch] = 1;
	    }
	  } else {
	    /* Built-in table: everything that is not a printable, non-space
	    ** character splits tokens. Code 0 is left unflagged. */
	    int i;
	    for(i=1; i<0x80; i++){
	      t->delim[i] = !isgraph(i);
	    }
	    t->delim[39] = 0;   /* apostrophe stays part of a token */
	    t->delim[42] = 1;   /* asterisk always splits tokens */
	  }

	  *ppTokenizer = &t->base;
	  return SQLITE_OK;
	}

	/*
	** xDestroy: release a tokenizer allocated by simpleCreate().
	** Always returns SQLITE_OK.
	*/
	static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
	  simple_tokenizer *t = (simple_tokenizer *)pTokenizer;
	  sqlite3_free(t);
	  return SQLITE_OK;
	}

	/*
	** Prepare to begin tokenizing a particular string. The input
	** string to be tokenized is pInput[0..nBytes-1]. A cursor
	** used to incrementally tokenize this string is returned in
	** *ppCursor.
	*/
	static int simpleOpen(
	sqlite3_tokenizer pTokenizer, / The tokenizer */
	const char pInput, int nBytes, / String to be tokenized */
	sqlite3_tokenizer_cursor *ppCursor / OUT: Tokenization cursor */
	){
	simple_tokenizer_cursor *c;

	UNUSED_PARAMETER(pTokenizer);

	c = (simple_tokenizer_cursor ) sqlite3_malloc(sizeof(c));
	if( c==NULL ) return SQLITE_NOMEM;

	c->pInput = pInput;
	if( pInput==0 ){
	c->nBytes = 0;
	}else if( nBytes<0 ){
	c->nBytes = (int)strlen(pInput);
	}else{
	c->nBytes = nBytes;
	}
	c->iOffset = 0; /* start tokenizing at the beginning */
	c->iToken = 0;
	c->pToken = NULL; /* no space allocated, yet. */
	c->nTokenAllocated = 0;

	*ppCursor = &c->base;
	return SQLITE_OK;
	}

	/*
	** xClose: dispose of a cursor created by simpleOpen(). Frees the token
	** buffer (which may still be NULL if no token was ever produced) and
	** then the cursor itself. Always returns SQLITE_OK.
	*/
	static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
	  simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
	  sqlite3_free(c->pToken);
	  sqlite3_free(c);
	  return SQLITE_OK;
	}

	/*
	** xNext: extract the next token from a cursor opened by simpleOpen().
	**
	** Skips any run of delimiter bytes at the current offset, then consumes
	** the following run of non-delimiter bytes as one token. The token is
	** copied into the cursor's own buffer; bytes below 0x80 are passed
	** through tolower(), bytes >= 0x80 are copied unchanged.
	**
	** On SQLITE_OK:
	**   *ppToken        points at the token text (not nul-terminated; the
	**                   buffer belongs to the cursor and is reused by the
	**                   next call)
	**   *pnBytes        token length in bytes
	**   *piStartOffset  offset of the token's first byte in the input
	**   *piEndOffset    offset one past the token's last byte
	**   *piPosition     zero-based index of this token
	**
	** Returns SQLITE_DONE when the input is exhausted, or SQLITE_NOMEM if
	** the token buffer cannot be grown. On SQLITE_NOMEM the previous buffer
	** and its recorded size are left intact, so simpleClose() still frees it.
	*/
	static int simpleNext(
	  sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by simpleOpen */
	  const char **ppToken,               /* OUT: *ppToken is the token text */
	  int *pnBytes,                       /* OUT: Number of bytes in token */
	  int *piStartOffset,                 /* OUT: Starting offset of token */
	  int *piEndOffset,                   /* OUT: Ending offset of token */
	  int *piPosition                     /* OUT: Position integer of token */
	){
	  simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
	  simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
	  unsigned char *p = (unsigned char *)c->pInput;

	  while( c->iOffset<c->nBytes ){
	    int iStartOffset;

	    /* Scan past delimiter characters */
	    while( c->iOffset<c->nBytes && simpleDelim(t, p[c->iOffset]) ){
	      c->iOffset++;
	    }

	    /* Count non-delimiter characters. */
	    iStartOffset = c->iOffset;
	    while( c->iOffset<c->nBytes && !simpleDelim(t, p[c->iOffset]) ){
	      c->iOffset++;
	    }

	    if( c->iOffset>iStartOffset ){
	      int i, n = c->iOffset-iStartOffset;
	      if( n>c->nTokenAllocated ){
	        /* Grow via a temporary: if the reallocation fails the old block
	        ** is still allocated, so the cursor must keep pointing at it
	        ** (and keep its old size) rather than be overwritten with NULL. */
	        int nNew = n+20;
	        char *pNew = (char *)sqlite3_realloc(c->pToken, nNew);
	        if( pNew==0 ) return SQLITE_NOMEM;
	        c->pToken = pNew;
	        c->nTokenAllocated = nNew;
	      }
	      for(i=0; i<n; i++){
	        /* TODO(shess) This needs expansion to handle UTF-8
	        ** case-insensitivity.
	        */
	        unsigned char ch = p[iStartOffset+i];
	        c->pToken[i] = (char)(ch<0x80 ? tolower(ch) : ch);
	      }
	      *ppToken = c->pToken;
	      *pnBytes = n;
	      *piStartOffset = iStartOffset;
	      *piEndOffset = c->iOffset;
	      *piPosition = c->iToken++;

	      return SQLITE_OK;
	    }
	  }
	  return SQLITE_DONE;
	}

	/*
	** The set of routines that implement the simple tokenizer
	*/
	static const sqlite3_tokenizer_module simpleTokenizerModule = {
	0,
	simpleCreate,
	simpleDestroy,
	simpleOpen,
	simpleClose,
	simpleNext,
	};

	/*
	** Allocate a new simple tokenizer. Return a pointer to the new
	** tokenizer in *ppModule
	*/
	SQLITE_PRIVATE void sqlite3Fts3SimpleTokenizerModule(
	sqlite3_tokenizer_module const**ppModule
	){
	*ppModule = &simpleTokenizerModule;
	}

	#endif /* !defined(SQLITE_CORE) \|\| defined(SQLITE_ENABLE_FTS3) */

	/************ End of fts3_tokenizer1.c ***********************************/

	/* Our own code */

	/*
	** Return the address of the static tokenizer callback table as an
	** opaque pointer. The Perl part of this script calls this function and
	** packs the address into a blob for fts3_tokenizer().
	**
	** The local is declared pointer-to-const so that &p has exactly the type
	** sqlite3Fts3SimpleTokenizerModule() expects; constness is dropped only
	** by the explicit cast on return.
	*/
	void* get_tokenizer_ptr() {
	    const sqlite3_tokenizer_module *p = 0;
	    sqlite3Fts3SimpleTokenizerModule(&p);
	    return (void *)p;
	}
END


	use Data::Dump 'dump';
	use DBD::SQLite;
	use DBI ':sql_types';
	use Test::More tests => 4;

	# Throwaway in-memory database. RaiseError makes every failed DBI call
	# die, so the checks below never run against a half-built schema.
	my $dbh = DBI->connect(
	    'dbi:SQLite:dbname=:memory:',
	    '',
	    '',
	    {
	        sqlite_unicode => 1,
	        RaiseError => 1,
	    },
	);

	# The tokenizer module is registered by passing its address to
	# fts3_tokenizer() as a blob holding a native pointer. 'L!' (native long)
	# is only pointer-sized where long and pointers have the same width, so
	# the pack template is chosen from this perl's build configuration.
	use Config;
	my $ptr = get_tokenizer_ptr();
	my $ptr_template = $Config{ptrsize} == $Config{longsize} ? 'L!'
	                 : $Config{ptrsize} == 8                  ? 'Q'
	                 :                                          'L';
	my $pptr = pack($ptr_template, $ptr);

	# Register the C tokenizer under the name 'nonwhitespace'. The pointer is
	# supplied as a bound BLOB parameter rather than interpolated into SQL.
	my $sth = $dbh->prepare('SELECT fts3_tokenizer(?, ?)');
	$sth->bind_param(1, 'nonwhitespace');
	$sth->bind_param(2, $pptr, SQL_BLOB);
	$sth->execute();
	$sth->finish();    # the result row is not needed; release the statement

	# simple
	$dbh->do('CREATE VIRTUAL TABLE foo1 USING fts3()');
	$dbh->do("INSERT INTO foo1 VALUES('bar baz: : quux')");
	my $first = $dbh->selectrow_array(qq{SELECT * FROM foo1 WHERE content MATCH '"baz*"'});
	is($first, 'bar baz: : quux', 'simple tokenizer finds alphanumeric token');
	my $second = $dbh->selectrow_array(qq{SELECT * FROM foo1 WHERE content MATCH '":*"'});
	is($second, undef, "simple tokenizer doesn't find non-alphanumeric token");

	# nonwhitespace
	$dbh->do('CREATE VIRTUAL TABLE foo2 USING fts3(tokenize=nonwhitespace)');
	$dbh->do("INSERT INTO foo2 VALUES('bar baz: : quux')");
	$first = $dbh->selectrow_array(qq{SELECT * FROM foo2 WHERE content MATCH '"baz*"'});
	is($first, 'bar baz: : quux', 'nonwhitespace finds alphanumeric token');
	$second = $dbh->selectrow_array(qq{SELECT * FROM foo2 WHERE content MATCH '":*"'});
	# Check the result of the ':' query itself; asserting on $first here would
	# merely repeat the previous test.
	is($second, 'bar baz: : quux', 'nonwhitespace finds non-alphanumeric token');