@fffonion
Last active September 22, 2017 03:11
Sphinx 2.2 and 2.3 with the mmseg (coreseek) patch: https://yooooo.us/2017/sphinx-with-mmseg-based-on-coreseek
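
This gist carries the same patch twice: the first diff targets Sphinx 2.2.11, the second Sphinx 2.3.1. In both cases it adds libmmseg-based Chinese word segmentation in the coreseek style: an AC_CHECK_MMSEG autoconf macro with a --with-mmseg configure switch, a new TOKENIZER_ZHCN_UTF8 tokenizer type backed by CSphTokenizer_UTF8MMSeg, recognition of pre-tokenized input buffers, thesaurus (synonym) expansion while building hits, the charset_dictpath / charset_debug index options, and csft.conf replacing sphinx.conf as the default config file name.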
diff --git a/.gitignore b/.gitignore
index f5be264..518fcc3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -64,3 +64,10 @@
/test/ql/data/*.lock
/test/ql/*.class
/test/ql/*.exe
+
+# for qt-creator
+/*.user
+
+# for patch
+*.rej
+*.orig
diff --git a/acinclude.m4 b/acinclude.m4
index e09697e..3ae78b0 100644
--- a/acinclude.m4
+++ b/acinclude.m4
@@ -297,6 +297,95 @@ ERROR: cannot find PostgreSQL libraries. If you want to compile with PosgregSQL
fi
])
+dnl ---------------------------------------------------------------------------
+dnl Macro: AC_CHECK_MMSEG
+dnl ---------------------------------------------------------------------------
+
+AC_DEFUN([AC_CHECK_MMSEG],[
+
+if test [ -z "$MMSEG_CFLAGS" ]
+then
+ for CANDIDATE in "$user_mmseg_includes" "/usr/local/include/mmseg" "/usr/include/mmseg"
+ do
+ if test [ -n "$CANDIDATE" -a -r "$CANDIDATE/Segmenter.h" ]
+ then
+ MMSEG_CFLAGS="-I$CANDIDATE"
+ break
+ fi
+ done
+fi
+
+# explicit overrides will be applied later
+if test [ -z "$MMSEG_LIBS" ]
+then
+ for CANDIDATE in "$user_mmseg_libs" "/usr/lib64" \
+ "/usr/local/lib" "/usr/local/mmseg/lib" \
+ "/usr/local/lib/mmseg" "/usr/lib" \
+ "/opt/mmseg/lib"
+ do
+ if test [ -n "$CANDIDATE" -a -d "$CANDIDATE" ]
+ then
+ MMSEG_LIBS="-L$CANDIDATE -lmmseg"
+ break
+ fi
+ done
+fi
+
+# apply explicit include path overrides
+AC_ARG_WITH([mmseg-includes],
+ AC_HELP_STRING([--with-mmseg-includes], [path to libmmseg header files]),
+ [ac_cv_mmseg_includes=$withval])
+if test [ -n "$ac_cv_mmseg_includes" ]
+then
+ MMSEG_CFLAGS="-I$ac_cv_mmseg_includes"
+fi
+
+
+# apply explicit lib path overrides
+AC_ARG_WITH([mmseg-libs],
+ AC_HELP_STRING([--with-mmseg-libs], [path to libmmseg libraries]),
+ [ac_cv_mmseg_libs=$withval])
+if test [ -n "$ac_cv_mmseg_libs" ]
+then
+ # Trim trailing '.libs' if the user passed it in the --with-mmseg-libs option
+ ac_cv_mmseg_libs=`echo ${ac_cv_mmseg_libs} | sed -e 's/.libs$//' \
+ -e 's+.libs/$++'`
+ MMSEG_LIBS="-L$ac_cv_mmseg_libs -lmmseg"
+fi
+
+# now that we did all we could, perform final checks
+AC_MSG_CHECKING([libmmseg include files])
+if test [ -z "$MMSEG_CFLAGS" ]
+then
+ AC_MSG_ERROR([missing include files.
+
+******************************************************************************
+ERROR: cannot find libmmseg include files.
+
+To disable libmmseg support, use --without-mmseg option.
+******************************************************************************
+])
+else
+ AC_MSG_RESULT([$MMSEG_CFLAGS])
+fi
+
+AC_MSG_CHECKING([libmmseg libraries])
+if test [ -z "$MMSEG_LIBS" ]
+then
+ AC_MSG_ERROR([missing libraries.
+
+******************************************************************************
+ERROR: cannot find libmmseg libraries.
+
+To disable libmmseg support, use --without-mmseg option.
+******************************************************************************
+])
+else
+ AC_MSG_RESULT([$MMSEG_LIBS])
+fi
+
+])
+
dnl ---------------------------------------------------------------------------
dnl Macro: AC_CHECK_LIBSTEMMER
dnl Check the libstemmer first in custom include path in --with-libstemmer=*
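
The AC_CHECK_MMSEG macro above mirrors the existing MySQL/PostgreSQL detection: it probes a few conventional locations for Segmenter.h and libmmseg, then applies explicit overrides, e.g. ./configure --with-mmseg-includes=/usr/local/include/mmseg --with-mmseg-libs=/usr/local/lib. Passing --without-mmseg skips the checks entirely.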
diff --git a/configure.ac b/configure.ac
index 643f5ca..e9a961b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -66,6 +66,7 @@ fi
AC_PROG_CC
AC_PROG_CXX
+AM_PROG_AR
AC_PROG_RANLIB
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
@@ -426,6 +427,24 @@ else
fi
AM_CONDITIONAL(USE_PGSQL, test x$ac_cv_use_pgsql != xno -o x$ac_cv_use_satic_pgsql != xno )
+dnl ---
+# check if we should compile with libmmseg (mmseg Chinese segmenter) support
+AC_ARG_WITH([mmseg],
+ AC_HELP_STRING([--with-mmseg], [compile with libmmseg, a mmseg Chinese Segmenter support (default is enabled)]),
+ [ac_cv_use_mmseg=$withval], [ac_cv_use_mmseg=yes]
+)
+AC_MSG_CHECKING([whether to compile with libmmseg support])
+if test x$ac_cv_use_mmseg != xno; then
+ AC_MSG_RESULT([yes])
+ AC_CHECK_MMSEG([$ac_cv_use_mmseg])
+ AC_DEFINE(USE_MMSEG,1,[Define to 1 if you want to compile with libmmseg support])
+ AC_SUBST([MMSEG_LIBS])
+ AC_SUBST([MMSEG_CFLAGS])
+else
+ AC_MSG_RESULT([no])
+fi
+AM_CONDITIONAL(USE_MMSEG, test x$ac_cv_use_mmseg != xno)
+
# add macports include directory
if (echo $MYSQL_LIBS | grep -q -- -L/opt/local/lib); then
MYSQL_CFLAGS="$MYSQL_CFLAGS -I/opt/local/include"
@@ -480,7 +499,7 @@ AM_CONDITIONAL(USE_INTERNAL_LIBSTEMMER, test x$ac_cv_use_internal_libstemmer !=
dnl ---
# we can now set preprocessor flags for both C and C++ compilers
-CPPFLAGS="$CPPFLAGS $MYSQL_CFLAGS $PGSQL_CFLAGS $LIBSTEMMER_CFLAGS"
+CPPFLAGS="$CPPFLAGS $MYSQL_CFLAGS $PGSQL_CFLAGS $LIBSTEMMER_CFLAGS $MMSEG_CFLAGS"
AC_ARG_WITH([libexpat],
diff --git a/libstemmer_c/Makefile.am b/libstemmer_c/Makefile.am
index a973921..fb93b5f 100644
--- a/libstemmer_c/Makefile.am
+++ b/libstemmer_c/Makefile.am
@@ -1,3 +1,4 @@
+AUTOMAKE_OPTIONS = subdir-objects
if USE_LIBSTEMMER
noinst_LIBRARIES = libstemmer.a
include $(srcdir)/mkinc.mak
diff --git a/src/Makefile.am b/src/Makefile.am
index 048a112..9197000 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -30,5 +30,9 @@ RLP_INC =
endif
AM_CPPFLAGS = $(LIBRE2_CFLAGS) $(RLP_INC) -DSYSCONFDIR="\"$(sysconfdir)\"" -DDATADIR="\"$(localstatedir)/data\""
+if USE_MMSEG
+COMMON_LIBS = libsphinx.a $(LIBSTEMMER_LIBS) $(MYSQL_LIBS) $(PGSQL_LIBS) $(LIBRE2_LIBS) $(RLP_LIBS) $(MMSEG_LIBS)
+else
COMMON_LIBS = libsphinx.a $(LIBSTEMMER_LIBS) $(MYSQL_LIBS) $(PGSQL_LIBS) $(LIBRE2_LIBS) $(RLP_LIBS)
+endif
LDADD = $(COMMON_LIBS)
diff --git a/src/indexer.cpp b/src/indexer.cpp
index 7f294f6..7ba641e 100644
--- a/src/indexer.cpp
+++ b/src/indexer.cpp
@@ -1768,7 +1768,7 @@ int main ( int argc, char ** argv )
"\n"
"Options are:\n"
"--config <file>\t\tread configuration from specified file\n"
- "\t\t\t(default is sphinx.conf)\n"
+ "\t\t\t(default is csft.conf)\n"
"--all\t\t\treindex all configured indexes\n"
"--quiet\t\t\tbe quiet, only print errors\n"
"--verbose\t\tverbose indexing issues report\n"
@@ -1797,8 +1797,8 @@ int main ( int argc, char ** argv )
"--keep-attrs\t\tretain attributes from the old index"
"\n"
"Examples:\n"
- "indexer --quiet myidx1\treindex 'myidx1' defined in 'sphinx.conf'\n"
- "indexer --all\t\treindex all indexes defined in 'sphinx.conf'\n" );
+ "indexer --quiet myidx1\treindex 'myidx1' defined in 'csft.conf'\n"
+ "indexer --all\t\treindex all indexes defined in 'csft.conf'\n" );
}
return 1;
diff --git a/src/searchd.cpp b/src/searchd.cpp
index 85b1cd6..28ef919 100644
--- a/src/searchd.cpp
+++ b/src/searchd.cpp
@@ -22030,7 +22030,7 @@ void ShowHelp ()
"Options are:\n"
"-h, --help\t\tdisplay this help message\n"
"-c, --config <file>\tread configuration from specified file\n"
- "\t\t\t(default is sphinx.conf)\n"
+ "\t\t\t(default is csft.conf)\n"
"--stop\t\t\tsend SIGTERM to currently running searchd\n"
"--stopwait\t\tsend SIGTERM and wait until actual exit\n"
"--status\t\tget ant print status variables\n"
@@ -22067,9 +22067,9 @@ void ShowHelp ()
"--safetrace\t\tonly use system backtrace() call in crash reports\n"
"\n"
"Examples:\n"
- "searchd --config /usr/local/sphinx/etc/sphinx.conf\n"
+ "searchd --config /usr/local/sphinx/etc/csft.conf\n"
#if USE_WINDOWS
- "searchd --install --config c:\\sphinx\\sphinx.conf\n"
+ "searchd --install --config c:\\sphinx\\csft.conf\n"
#endif
);
}
@@ -23338,12 +23338,12 @@ int WINAPI ServiceMain ( int argc, char **argv )
while ( !g_sConfigFile.cstr() )
{
#ifdef SYSCONFDIR
- g_sConfigFile = SYSCONFDIR "/sphinx.conf";
+ g_sConfigFile = SYSCONFDIR "/csft.conf";
if ( sphIsReadable ( g_sConfigFile.cstr () ) )
break;
#endif
- g_sConfigFile = "./sphinx.conf";
+ g_sConfigFile = "./";
if ( sphIsReadable ( g_sConfigFile.cstr () ) )
break;
@@ -23354,9 +23354,9 @@ int WINAPI ServiceMain ( int argc, char **argv )
if ( !g_sConfigFile.cstr () )
sphFatal ( "no readable config file (looked in "
#ifdef SYSCONFDIR
- SYSCONFDIR "/sphinx.conf, "
+ SYSCONFDIR "/csft.conf, "
#endif
- "./sphinx.conf)." );
+ "./csft.conf)." );
sphInfo ( "using config file '%s'...", g_sConfigFile.cstr () );
diff --git a/src/sphinx.cpp b/src/sphinx.cpp
index d6a7b9d..73e92b0 100644
--- a/src/sphinx.cpp
+++ b/src/sphinx.cpp
@@ -143,6 +143,16 @@
#pragma message("Automatically linking with btutils.lib")
#endif
+#if ( USE_WINDOWS && USE_MMSEG )
+ #if _DEBUG
+ #pragma comment(linker, "/defaultlib:libcss_d.lib")
+ #else
+ #pragma comment(linker, "/defaultlib:libcss.lib")
+ #endif
+ #pragma message("Automatically linking with libcss.lib")
+ #pragma warning(disable:4530) // for ugly mmseg
+#endif
+
/////////////////////////////////////////////////////////////////////////////
// logf() is not there sometimes (eg. Solaris 9)
@@ -2550,10 +2560,14 @@ class CSphTokenizer_UTF8 : public CSphTokenizerBase2
public:
CSphTokenizer_UTF8 ();
virtual void SetBuffer ( const BYTE * sBuffer, int iLength );
+ virtual bool IsPreTokenized() { return m_bPreTokenized; }
virtual BYTE * GetToken ();
virtual ISphTokenizer * Clone ( ESphTokenizerClone eMode ) const;
virtual int GetCodepointLength ( int iCode ) const;
virtual int GetMaxCodepointLength () const { return m_tLC.GetMaxCodepointLength(); }
+
+protected:
+ bool m_bPreTokenized;
};
@@ -2574,6 +2588,78 @@ protected:
CSphString m_sNgramCharsStr;
};
+#if USE_MMSEG
+
+#include "SegmenterManager.h"
+#include "Segmenter.h"
+
+typedef CSR_Singleton<css::SegmenterManager> SegmenterManagerSingleInstance;
+
+template < bool IS_QUERY >
+class CSphTokenizer_UTF8MMSeg : public CSphTokenizer_UTF8<IS_QUERY>
+{
+public:
+ CSphTokenizer_UTF8MMSeg ();
+ ~CSphTokenizer_UTF8MMSeg() {
+ if(m_seg){
+ SafeDelete ( m_seg );
+ }
+ }
+
+ virtual void SetBuffer ( const BYTE * sBuffer, int iLength );
+ virtual BYTE * GetToken ();
+ virtual ISphTokenizer * Clone ( ESphTokenizerClone eMode ) const;
+ virtual const BYTE* GetThesaurus(BYTE * sBuffer, int iLength );
+ bool IsSegment(const BYTE * pCur);
+
+ CSphTokenizerBase* SetDictPath(const char* path) { m_dictpath = path; return this; }
+
+ virtual const char * GetBufferPtr () const { return (const char *) CSphTokenizer_UTF8MMSeg<IS_QUERY>::m_pCur; }
+
+ virtual const char * GetTokenStart () const { return m_segToken; }
+
+ virtual int GetLastTokenLen () const { return m_iLastTokenLenMMSeg; }
+
+ virtual void ReloadSegDictionary() {
+ if(m_seg){
+ SafeDelete ( m_seg );
+ }
+ m_seg = NULL;
+
+ if(m_mgr) {
+ SegmenterManagerSingleInstance::Free(); // free preexist instance.
+ m_mgr = NULL;
+ }
+ }
+protected:
+ char* m_segToken;
+ size_t m_segoffset;
+ int m_iLastTokenLenMMSeg;
+ BYTE m_sAccumSeg [ 3*SPH_MAX_WORD_LEN+3 ]; ///< folded token accumulator
+ BYTE * m_pAccumSeg; ///< current accumulator position
+ CSphVector<u2> m_tokenlens;
+ int m_tokenpos;
+protected:
+ // virtual bool IsSegment(const BYTE * pCur);
+ CSphString m_dictpath;
+
+ // mmseg related
+ css::Segmenter* m_seg;
+ css::SegmenterManager* m_mgr;
+ css::Segmenter* GetSegmenter(const char* dict_path){
+ int nRet = 0;
+ if(!m_mgr) {
+ m_mgr = SegmenterManagerSingleInstance::Get();
+ if(dict_path)
+ nRet = m_mgr->init(dict_path);
+ }
+ if(nRet == 0 && !m_seg)
+ m_seg = m_mgr->getSegmenter(false);
+ return m_seg;
+ }
+};
+
+#endif
struct CSphNormalForm
{
@@ -3794,6 +3880,15 @@ ISphTokenizer * sphCreateUTF8NgramTokenizer ()
return new CSphTokenizer_UTF8Ngram<false> ();
}
+#if USE_MMSEG
+ISphTokenizer * sphCreateUTF8ChineseTokenizer ( const char* dict_path )
+{
+ CSphTokenizer_UTF8MMSeg<false>* tokenizer = new CSphTokenizer_UTF8MMSeg<false> ();
+ tokenizer->SetDictPath(dict_path);
+ return tokenizer;
+}
+#endif
+
/////////////////////////////////////////////////////////////////////////////
enum
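
For orientation, a minimal sketch of driving the factory added above (the dictionary path and sample text are illustrative, not part of the patch; assumes sphinx.h, <cstdio> and <cstring> are available and that the factory is declared where callers can see it):

    // Hedged sketch: exercise the mmseg tokenizer through the public ISphTokenizer API.
    ISphTokenizer * pTok = sphCreateUTF8ChineseTokenizer ( "/usr/local/mmseg3/etc" ); // hypothetical dict dir
    const char * sText = "..."; // some UTF-8 Chinese text
    pTok->SetBuffer ( (const BYTE *)sText, (int)strlen ( sText ) );
    for ( BYTE * sTok = pTok->GetToken(); sTok!=NULL; sTok = pTok->GetToken() )
        printf ( "%s\n", sTok ); // one mmseg segment per line
    SafeDelete ( pTok );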
@@ -4379,6 +4474,7 @@ CSphTokenizerSettings::CSphTokenizerSettings ()
: m_iType ( TOKENIZER_UTF8 )
, m_iMinWordLen ( 1 )
, m_iNgramLen ( 0 )
+ , m_iDebug ( 0 )
{
}
@@ -4390,7 +4486,11 @@ bool LoadTokenizerSettings ( CSphReader & tReader, CSphTokenizerSettings & tSett
return true;
tSettings.m_iType = tReader.GetByte ();
- if ( tSettings.m_iType!=TOKENIZER_UTF8 && tSettings.m_iType!=TOKENIZER_NGRAM )
+#if USE_MMSEG
+ if ( tSettings.m_iType!=TOKENIZER_UTF8 && tSettings.m_iType!=TOKENIZER_NGRAM && tSettings.m_iType!=TOKENIZER_ZHCN_UTF8)
+#else
+ if ( tSettings.m_iType!=TOKENIZER_UTF8 && tSettings.m_iType!=TOKENIZER_NGRAM)
+#endif
{
sWarning = "can't load an old index with SBCS tokenizer";
return false;
@@ -4418,7 +4518,11 @@ bool LoadTokenizerSettings ( CSphReader & tReader, CSphTokenizerSettings & tSett
tSettings.m_sIgnoreChars = tReader.GetString ();
tSettings.m_iNgramLen = tReader.GetDword ();
tSettings.m_sNgramChars = tReader.GetString ();
- if ( uVersion>=15 )
+#if USE_MMSEG
+ //mmseg --coreseek: the mmseg option makes coreseek and sphinx index formats incompatible.
+ tSettings.m_sDictPath = tReader.GetString ();
+#endif
+ if ( uVersion>=15 )
tSettings.m_sBlendChars = tReader.GetString ();
if ( uVersion>=24 )
tSettings.m_sBlendMode = tReader.GetString();
@@ -4449,6 +4553,10 @@ void SaveTokenizerSettings ( CSphWriter & tWriter, ISphTokenizer * pTokenizer, i
tWriter.PutString ( tSettings.m_sIgnoreChars.cstr () );
tWriter.PutDword ( tSettings.m_iNgramLen );
tWriter.PutString ( tSettings.m_sNgramChars.cstr () );
+#if USE_MMSEG
+ // if mmseg is turned off, the index format is compatible again.
+ tWriter.PutString ( tSettings.m_sDictPath.cstr () );
+#endif
tWriter.PutString ( tSettings.m_sBlendChars.cstr () );
tWriter.PutString ( tSettings.m_sBlendMode.cstr () );
}
@@ -4723,6 +4831,9 @@ ISphTokenizer * ISphTokenizer::Create ( const CSphTokenizerSettings & tSettings,
{
case TOKENIZER_UTF8: pTokenizer = sphCreateUTF8Tokenizer (); break;
case TOKENIZER_NGRAM: pTokenizer = sphCreateUTF8NgramTokenizer (); break;
+#if USE_MMSEG
+ case TOKENIZER_ZHCN_UTF8: pTokenizer = sphCreateUTF8ChineseTokenizer(tSettings.m_sDictPath.cstr()); break;
+#endif
default:
sError.SetSprintf ( "failed to create tokenizer (unknown charset type '%d')", tSettings.m_iType );
return NULL;
@@ -5968,7 +6079,24 @@ CSphTokenizer_UTF8<IS_QUERY>::CSphTokenizer_UTF8 ()
{
CSphString sTmp;
SetCaseFolding ( SPHINX_DEFAULT_UTF8_TABLE, sTmp );
- m_bHasBlend = false;
+
+ // BEGIN CJK: no case folding is applied here; that should be done in the remote tokenizer.
+ // Here we just make sure CJK characters are retained. --coreseek
+ // 4e00 - 9fff CJK unified ideographs
+ // 3000 - 303f CJK symbols and punctuation
+ // 3040 - 30ff Hiragana/Katakana
+ // ff00 - ffff half/fullwidth forms
+ CSphVector<CSphRemapRange> dRemaps;
+ dRemaps.Add ( CSphRemapRange ( 0x4E00, 0x9FFF, 0x4E00 ) );
+ dRemaps.Add ( CSphRemapRange ( 0xFF10, 0xFFFF, 0xFF10 ) );
+ dRemaps.Add ( CSphRemapRange ( 0x3040, 0x30FF, 0x3040 ) );
+
+ m_tLC.AddRemaps ( dRemaps,
+ FLAG_CODEPOINT_NGRAM ); // !COMMIT support other n-gram lengths than 1
+ // ENDCJK
+ m_bPreTokenized = false; // by default use original route.
+
+ m_bHasBlend = false;
}
@@ -5978,10 +6106,29 @@ void CSphTokenizer_UTF8<IS_QUERY>::SetBuffer ( const BYTE * sBuffer, int iLength
// check that old one is over and that new length is sane
assert ( iLength>=0 );
- // set buffer
+ // set buffer
m_pBuffer = sBuffer;
+ // check whether this is a pre-segmented buffer, marked by the prefix U+FFFA
+ // if so, the buffer starts with 0xFFFA, 0x41, then the content --coreseek
+ m_bPreTokenized = false;
+ if(iLength > 4)
+ {
+ // there is a ' ' (space, 32) as padding. might not true
+ unsigned char mask[] = {32, 239, 191, 186, 65};
+ unsigned char mask_bare[] = {239, 191, 186, 65};
+ if(strncmp( (const char *)mask, (const char *)sBuffer, 5) == 0) {
+ // 0xFFFA is a magic marker; if it heads the buffer, mark the buffer pre-tokenized.
+ m_bPreTokenized = true;
+ m_pBuffer += 5;
+ }else
+ if(strncmp( (const char *)mask_bare, (const char *)sBuffer, 4) == 0) {
+ m_bPreTokenized = true;
+ m_pBuffer += 4;
+ }
+ }
+
m_pBufferMax = sBuffer + iLength;
- m_pCur = sBuffer;
+ m_pCur = m_pBuffer;
m_pTokenStart = m_pTokenEnd = NULL;
m_pBlendStart = m_pBlendEnd = NULL;
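
The hunk above teaches SetBuffer to recognize pre-segmented input: the marker is the UTF-8 encoding of U+FFFA (bytes EF BF BA, i.e. 239 191 186) followed by 0x41 ('A'), optionally preceded by a single space. A hedged sketch of producing such a buffer (token text is invented; pTokenizer is an assumed pointer to any tokenizer derived from CSphTokenizer_UTF8; the trailing "_x" suffix is what BuildRegularHits strips later in this patch):

    // Sketch only; assumes <string>. The string split after "\xBA" keeps the
    // hex escape from swallowing the following 'A'.
    std::string sDoc = "\xEF\xBF\xBA" "A";      // U+FFFA magic + 'A', per mask_bare above
    sDoc += "token1_x token2_x";                // pre-segmented tokens, each tagged _x
    pTokenizer->SetBuffer ( (const BYTE *)sDoc.c_str(), (int)sDoc.size() );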
@@ -5999,7 +6146,7 @@ BYTE * CSphTokenizer_UTF8<IS_QUERY>::GetToken ()
m_bTokenBoundary = false;
m_bWasSynonym = false;
- return m_bHasBlend
+ return m_bHasBlend
? DoGetToken<IS_QUERY,true>()
: DoGetToken<IS_QUERY,false>();
}
@@ -6414,6 +6561,152 @@ BYTE * CSphTokenizer_UTF8Ngram<IS_QUERY>::GetToken ()
assert ( m_iNgramLen==1 );
return CSphTokenizer_UTF8<IS_QUERY>::GetToken ();
}
+//////////////////////////////////////////////////////////////////////////
+#if USE_MMSEG
+//////////////////////////////////////////////////////////////////////////
+template < bool IS_QUERY >
+CSphTokenizer_UTF8MMSeg<IS_QUERY>::CSphTokenizer_UTF8MMSeg ()
+ :CSphTokenizer_UTF8<IS_QUERY>()
+ , m_segoffset(0)
+{
+ //override charmap
+ CSphVector<CSphRemapRange> dRemaps;
+ dRemaps.Add ( CSphRemapRange ( 0x4E00, 0x9FFF, 0x4E00 ) );
+ dRemaps.Add ( CSphRemapRange ( 0xFF10, 0xFFFF, 0xFF10 ) );
+ dRemaps.Add ( CSphRemapRange ( 0x3040, 0x30FF, 0x3040 ) );
+
+ CSphTokenizer_UTF8<IS_QUERY>::m_tLC.AddRemaps ( dRemaps,
+ FLAG_CODEPOINT_NGRAM | FLAG_CODEPOINT_SPECIAL); // !COMMIT support other n-gram lengths than 1
+ m_pAccumSeg = m_sAccumSeg;
+ //m_iLastTokenBufferLen = 0;
+ m_iLastTokenLenMMSeg = 0;
+
+ m_mgr = NULL;
+ m_seg = NULL;
+ m_tokenlens.Reserve(1024*512); // reserve room for 512K token lengths
+}
+
+template < bool IS_QUERY >
+void CSphTokenizer_UTF8MMSeg<IS_QUERY>::SetBuffer ( const BYTE * sBuffer, int iLength )
+{
+ CSphTokenizer_UTF8<IS_QUERY>::SetBuffer(sBuffer, iLength);
+ css::Segmenter* seg = GetSegmenter(m_dictpath.cstr());
+ if(seg)
+ seg->setBuffer((u1*)CSphTokenizer_UTF8MMSeg<IS_QUERY>::m_pBuffer, iLength);
+ else
+ sphDie ( " Tokenizer initialization failure. " );
+ m_segoffset = 0;
+ m_segToken = (char*)CSphTokenizer_UTF8MMSeg<IS_QUERY>::m_pCur;
+
+ m_tokenlens.Reset();
+ m_tokenpos = 0;
+ {
+ u2 len = 0, symlen = 0;
+ while(1){
+ len = 0;
+ char* tok = (char*)seg->peekToken(len,symlen);
+ if(!tok || !*tok || !len)
+ break;
+ seg->popToken(len);
+
+ m_tokenlens.Add(len);
+ //printf("%*.*s/p ",symlen,symlen,tok);
+ }
+ }
+}
+
+template < bool IS_QUERY >
+bool CSphTokenizer_UTF8MMSeg<IS_QUERY>::IsSegment(const BYTE * pCur)
+{
+ // this code might have a bug, but it will be removed in the next release...
+ size_t offset = pCur - CSphTokenizer_UTF8<IS_QUERY>::m_pBuffer;
+ //if(offset == 0) return false;
+ //printf("pcur: %s\n", pCur);
+
+ //css::Segmenter* seg = GetSegmenter(m_dictpath.cstr()); //TODO fill blank here
+ {
+ u2 len = 0, symlen = 0;
+ while(m_segoffset < offset) {
+ //tok = (const char*)seg->peekToken(len, symlen);
+ //seg->popToken(len);
+ len = m_tokenlens[m_tokenpos];
+ m_tokenpos ++;
+ m_segoffset += len;
+ //printf("tok: %*.*s, len=%d\t ",len,len,tok, len);
+ if(m_tokenpos >= m_tokenlens.GetLength() || len==0){
+ //break?
+ break;
+ }
+ }
+ /*
+ printf("\n");
+ printf("seg_off %d vs off %d\n", m_segoffset, offset);
+ if(m_segoffset != offset)
+ printf("seg_pcur: %s\n", pCur);
+ */
+ return (m_segoffset == offset);
+ } //end if seg
+ return true;
+}
+
+template < bool IS_QUERY >
+BYTE * CSphTokenizer_UTF8MMSeg<IS_QUERY>::GetToken ()
+{
+ //return CSphTokenizer_UTF8<IS_QUERY>::GetToken();
+ m_iLastTokenLenMMSeg = 0;
+ //BYTE* tok = CSphTokenizer_UTF8::GetToken();
+ while(!IsSegment(CSphTokenizer_UTF8<IS_QUERY>::m_pCur) || m_pAccumSeg == m_sAccumSeg)
+ {
+ BYTE* tok = CSphTokenizer_UTF8<IS_QUERY>::GetToken();
+ //printf("utf8_token: %s \t ", tok);
+ if(!tok){
+ m_iLastTokenLenMMSeg = 0;
+ return NULL;
+ }
+
+ int token_buf_len = strlen((const char*)tok);
+
+ if(m_pAccumSeg == m_sAccumSeg)
+ m_segToken = (char*)CSphTokenizer_UTF8<IS_QUERY>::m_pTokenStart;
+
+ if ( (m_pAccumSeg - m_sAccumSeg)<SPH_MAX_WORD_LEN ) {
+ ::memcpy(m_pAccumSeg, tok, token_buf_len);
+ m_pAccumSeg += token_buf_len;
+ m_iLastTokenLenMMSeg += CSphTokenizer_UTF8<IS_QUERY>::GetLastTokenLen();
+ }
+ }
+ {
+ *m_pAccumSeg = 0;
+ //m_iLastTokenBufferLen = m_pAccumSeg - m_sAccumSeg;
+ m_pAccumSeg = m_sAccumSeg;
+
+ return m_sAccumSeg;
+ }
+}
+
+template < bool IS_QUERY >
+ISphTokenizer * CSphTokenizer_UTF8MMSeg<IS_QUERY>::Clone ( ESphTokenizerClone eMode ) const
+{
+ CSphTokenizerBase * pClone;
+ if ( eMode!=SPH_CLONE_INDEX ) {
+ pClone = (new CSphTokenizer_UTF8MMSeg<true>())->SetDictPath(m_dictpath.cstr());
+ }else{
+ pClone = (new CSphTokenizer_UTF8MMSeg<false>())->SetDictPath(m_dictpath.cstr());
+ }
+ pClone->CloneBase ( this, eMode );
+ return pClone;
+}
+
+template < bool IS_QUERY >
+const BYTE* CSphTokenizer_UTF8MMSeg<IS_QUERY>::GetThesaurus(BYTE * sBuffer, int iLength )
+{
+ css::Segmenter* seg = GetSegmenter(m_dictpath.cstr());
+ if(seg)
+ return (const BYTE*)seg->thesaurus((const char*)sBuffer, iLength);
+ return NULL;
+}
+
+#endif
//////////////////////////////////////////////////////////////////////////
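
Reading the implementation above as a whole: SetBuffer runs mmseg once over the entire buffer and caches every segment length in m_tokenlens; GetToken then keeps pulling plain UTF-8 tokens from the base class and appending them to the m_sAccumSeg accumulator until IsSegment() reports that the read cursor has reached an mmseg boundary, so a single emitted token may glue several base tokens into one multi-character Chinese word.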
@@ -9683,6 +9976,7 @@ void CSphIndex::SetupQueryTokenizer()
// create and setup a master copy of query time tokenizer
// that we can then use to create lightweight clones
SafeDelete ( m_pQueryTokenizer );
+ m_pTokenizer->ReloadSegDictionary();
m_pQueryTokenizer = m_pTokenizer->Clone ( SPH_CLONE_QUERY );
if ( IsStarDict() )
{
@@ -25994,6 +26288,7 @@ void CSphSource::Setup ( const CSphSourceSettings & tSettings )
m_bIndexExactWords = tSettings.m_bIndexExactWords;
m_iOvershortStep = Min ( Max ( tSettings.m_iOvershortStep, 0 ), 1 );
m_iStopwordStep = Min ( Max ( tSettings.m_iStopwordStep, 0 ), 1 );
+ m_bDebugDump = tSettings.m_bDebugDump; //coreseek: assign debug charset setting
m_bIndexSP = tSettings.m_bIndexSP;
m_dPrefixFields = tSettings.m_dPrefixFields;
m_dInfixFields = tSettings.m_dInfixFields;
@@ -26599,11 +26894,28 @@ void CSphSource_Document::BuildRegularHits ( SphDocID_t uDocid, bool bPayload, b
while ( ( m_iMaxHits==0 || m_tHits.m_dData.GetLength()+BUILD_REGULAR_HITS_COUNT<m_iMaxHits )
&& ( sWord = m_pTokenizer->GetToken() )!=NULL )
{
+ //FIXME: dump token to console --coreseek
+ //debug dump
+ if(m_pTokenizer->DumpToken()) {
+ printf("%s_x ", sWord); // make the same as pre-tokenized text.
+ }
+
+ // fix sWord if in pre-tokenized mode.
+ int iBytes = strlen ( (const char*)sWord );
+ bool bAdvancePos = true;
+ if(m_pTokenizer->IsPreTokenized()) {
+ // m_tState.m_iHitPos should not be 0; guards against a token without the _x suffix appearing at the very beginning.
+ if(sWord[iBytes-1] != 'x' && m_tState.m_iHitPos)
+ bAdvancePos = false; // not an advance token.
+ sWord[iBytes-2] = '\0'; // change token_x -> token\0x
+ iBytes -= 2; // decrease length
+ }
+
m_pDict->SetApplyMorph ( m_pTokenizer->GetMorphFlag() );
int iLastBlendedStart = TrackBlendedStart ( m_pTokenizer, iBlendedHitsStart, m_tHits.Length() );
- if ( !bPayload )
+ if ( !bPayload && bAdvancePos)
{
HITMAN::AddPos ( &m_tState.m_iHitPos, m_tState.m_iBuildLastStep + m_pTokenizer->GetOvershortCount()*m_iOvershortStep );
if ( m_pTokenizer->GetBoundary() )
@@ -26615,7 +26927,7 @@ void CSphSource_Document::BuildRegularHits ( SphDocID_t uDocid, bool bPayload, b
if ( bGlobalPartialMatch )
{
- int iBytes = strlen ( (const char*)sWord );
+ //int iBytes = strlen ( (const char*)sWord );
memcpy ( sBuf + 1, sWord, iBytes );
sBuf[0] = MAGIC_WORD_HEAD;
sBuf[iBytes+1] = '\0';
@@ -26625,7 +26937,7 @@ void CSphSource_Document::BuildRegularHits ( SphDocID_t uDocid, bool bPayload, b
ESphTokenMorph eMorph = m_pTokenizer->GetTokenMorph();
if ( m_bIndexExactWords && eMorph!=SPH_TOKEN_MORPH_GUESS )
{
- int iBytes = strlen ( (const char*)sWord );
+ //int iBytes = strlen ( (const char*)sWord );
memcpy ( sBuf + 1, sWord, iBytes );
sBuf[0] = MAGIC_WORD_HEAD_NONSTEMMED;
sBuf[iBytes+1] = '\0';
@@ -26661,6 +26973,27 @@ void CSphSource_Document::BuildRegularHits ( SphDocID_t uDocid, bool bPayload, b
m_tHits.AddHit ( uDocid, m_pDict->GetWordIDNonStemmed ( sBuf ), m_tState.m_iHitPos );
} else
m_tState.m_iBuildLastStep = m_iStopwordStep;
+#if USE_MMSEG
+ // works only when mmseg is on.
+ // zh_cn only GetThesaurus
+ {
+ int iBytes = strlen ( (const char*)sWord );
+ const BYTE* tbuf_ptr = m_pTokenizer->GetThesaurus(sWord, iBytes);
+ if(tbuf_ptr) {
+ while(*tbuf_ptr) {
+ size_t len = strlen((const char*)tbuf_ptr);
+ SphWordID_t iWord = m_pDict->GetWordID ( tbuf_ptr ,len , true);
+ if ( iWord ) {
+ m_tHits.AddHit ( uDocid, iWord, m_tState.m_iHitPos );
+ // mmseg; do not inc step for we are in 'one' hit.
+ //m_tState.m_iBuildLastStep = m_pTokenizer->TokenIsBlended() ? 0 : 1;
+ }
+ tbuf_ptr += len + 1; //move next
+ }
+ }
+ //end if buf
+ }//end GetThesaurus
+#endif
}
m_tState.m_bProcessingHits = ( sWord!=NULL );
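
The thesaurus block above assumes GetThesaurus() returns a buffer of concatenated NUL-terminated strings ending with an empty string (for example "syn1\0syn2\0\0"); each entry is added as an extra hit at the unchanged m_tState.m_iHitPos, so synonyms match at the same position as the original token.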
diff --git a/src/sphinx.h b/src/sphinx.h
index a8f16ca..07453bc 100644
--- a/src/sphinx.h
+++ b/src/sphinx.h
@@ -28,6 +28,7 @@
#define USE_RE2 0 /// whether to compile RE2 support
#define USE_RLP 0 /// whether to compile RLP support
#define USE_WINDOWS 1 /// whether to compile for Windows
+ #define USE_MMSEG 1 /// enable mmseg
#define USE_SYSLOG 0 /// whether to use syslog for logging
#define HAVE_STRNLEN 1
@@ -208,7 +209,7 @@ inline const DWORD * STATIC2DOCINFO ( const DWORD * pAttrs ) { return STATIC2DOC
#define SPHINX_VERSION_NUMBERS "2.2.11"
#define SPHINX_VERSION SPHINX_VERSION_NUMBERS SPHINX_BITS_TAG SPHINX_TAG " (" SPH_GIT_COMMIT_ID ")"
-#define SPHINX_BANNER "Sphinx " SPHINX_VERSION "\nCopyright (c) 2001-2016, Andrew Aksyonoff\nCopyright (c) 2008-2016, Sphinx Technologies Inc (http://sphinxsearch.com)\n\n"
+#define SPHINX_BANNER_ORIG "Sphinx " SPHINX_VERSION "\nCopyright (c) 2001-2016, Andrew Aksyonoff\nCopyright (c) 2008-2016, Sphinx Technologies Inc (http://sphinxsearch.com)\n\n"
#define SPHINX_SEARCHD_PROTO 1
#define SPHINX_CLIENT_VERSION 1
@@ -216,6 +217,10 @@ inline const DWORD * STATIC2DOCINFO ( const DWORD * pAttrs ) { return STATIC2DOC
#define SPH_MAX_FILENAME_LEN 512
#define SPH_MAX_FIELDS 256
+#define CORESEEK_BANNER "Coreseek FullText Search 5.1 \nCopyright (c) 2008-2015, Beijing Choice Software Technologies Inc (http://www.coreseek.com)\n\n"
+#define SPHINX_BANNER2 "" CORESEEK_BANNER "" SPHINX_BANNER_ORIG
+#define SPHINX_BANNER SPHINX_BANNER2
+
/////////////////////////////////////////////////////////////////////////////
extern int64_t g_iIndexerCurrentDocID;
@@ -499,7 +504,10 @@ struct CSphTokenizerSettings
CSphString m_sBlendChars;
CSphString m_sBlendMode;
CSphString m_sIndexingPlugin; ///< this tokenizer wants an external plugin to process its raw output
-
+ int m_iDebug; ///< is in tokenizer debug mode.
+#if USE_MMSEG
+ CSphString m_sDictPath; ///coreseek: where to find the segmenter's dict.
+#endif
CSphTokenizerSettings ();
};
@@ -610,11 +618,16 @@ public:
/// get synonym file info
virtual const CSphSavedFile & GetSynFileInfo () const { return m_tSynFileInfo; }
+ /// mark as debug tokenizer's output --coreseek -mmseg
+ virtual int DumpToken () { return m_tSettings.m_iDebug; }
public:
/// pass next buffer
virtual void SetBuffer ( const BYTE * sBuffer, int iLength ) = 0;
+ /// is pre-tokenized --coreseek
+ virtual bool IsPreTokenized() { return false; }
+
/// set current index schema (only intended for the token filter plugins)
virtual bool SetFilterSchema ( const CSphSchema &, CSphString & ) { return true; }
@@ -697,6 +710,10 @@ public:
/// set new buffer ptr (must be within current bounds)
virtual void SetBufferPtr ( const char * sNewPtr ) = 0;
+#if USE_MMSEG
+ virtual const BYTE* GetThesaurus(BYTE * , int ) { return NULL; }
+ virtual void ReloadSegDictionary() { return; } // reload mmseg's dictionary.
+#endif
/// get settings hash
virtual uint64_t GetSettingsFNV () const;
@@ -721,6 +738,9 @@ protected:
CSphLowercaser m_tLC; ///< my lowercaser
int m_iLastTokenLen; ///< last token length, in codepoints
bool m_bTokenBoundary; ///< last token boundary flag (true after boundary codepoint followed by separator)
+#if USE_MMSEG
+ int m_iLastTokenBufferLen; ///< the buffer length -- coreseek; use in mmseg patch.
+#endif
bool m_bBoundary; ///< boundary flag (true immediately after boundary codepoint)
int m_iBoundaryOffset; ///< boundary character offset (in bytes)
bool m_bWasSpecial; ///< special token flag
@@ -1826,6 +1846,7 @@ struct CSphSourceSettings
int m_iStopwordStep; ///< position step on stopword token (default is 1)
bool m_bIndexSP; ///< whether to index sentence and paragraph delimiters
bool m_bIndexFieldLens; ///< whether to index field lengths
+ int m_bDebugDump; ///< mmseg charset debug output feature
CSphVector<CSphString> m_dPrefixFields; ///< list of prefix fields
CSphVector<CSphString> m_dInfixFields; ///< list of infix fields
diff --git a/src/sphinxutils.cpp b/src/sphinxutils.cpp
index 13ed63a..05e7b2e 100644
--- a/src/sphinxutils.cpp
+++ b/src/sphinxutils.cpp
@@ -530,6 +530,8 @@ static KeyDesc_t g_dKeysIndex[] =
{ "min_word_len", 0, NULL },
{ "charset_type", KEY_REMOVED, NULL },
{ "charset_table", 0, NULL },
+ { "charset_dictpath", 0, NULL }, //coreseek: mmseg's dictionary path
+ { "charset_debug", 0, NULL }, //coreseek: debug output tokens
{ "ignore_chars", 0, NULL },
{ "min_prefix_len", 0, NULL },
{ "min_infix_len", 0, NULL },
@@ -1256,7 +1258,10 @@ void sphConfTokenizer ( const CSphConfigSection & hIndex, CSphTokenizerSettings
{
tSettings.m_iNgramLen = Max ( hIndex.GetInt ( "ngram_len" ), 0 );
- if ( hIndex ( "ngram_chars" ) )
+ if(hIndex("charset_debug"))
+ tSettings.m_iDebug = hIndex["charset_debug"].intval();
+
+ if ( hIndex ( "ngram_chars" ) )
{
if ( tSettings.m_iNgramLen )
tSettings.m_iType = TOKENIZER_NGRAM;
@@ -1264,6 +1269,15 @@ void sphConfTokenizer ( const CSphConfigSection & hIndex, CSphTokenizerSettings
sphWarning ( "ngram_chars specified, but ngram_len=0; IGNORED" );
}
+#if USE_MMSEG
+ //XXX:fixme : sphinx changes tokenizer create process
+ if (hIndex("charset_dictpath") && CSphString(hIndex.GetStr("charset_type")) =="zh_cn.utf-8" )
+ {
+ tSettings.m_sDictPath = hIndex.GetStr("charset_dictpath");
+ tSettings.m_iType = TOKENIZER_ZHCN_UTF8;
+ }
+#endif
+
tSettings.m_sCaseFolding = hIndex.GetStr ( "charset_table" );
tSettings.m_iMinWordLen = Max ( hIndex.GetInt ( "min_word_len", 1 ), 1 );
tSettings.m_sNgramChars = hIndex.GetStr ( "ngram_chars" );
@@ -1397,6 +1411,7 @@ bool sphConfIndex ( const CSphConfigSection & hIndex, CSphIndexSettings & tSetti
tSettings.m_iEmbeddedLimit = hIndex.GetSize ( "embedded_limit", 16384 );
tSettings.m_bIndexFieldLens = hIndex.GetInt ( "index_field_lengths" )!=0;
tSettings.m_sIndexTokenFilter = hIndex.GetStr ( "index_token_filter" );
+ tSettings.m_bDebugDump = hIndex.GetInt ( "charset_debug" )!=0;
// prefix/infix fields
CSphString sFields;
@@ -1697,12 +1712,12 @@ const char * sphLoadConfig ( const char * sOptConfig, bool bQuiet, CSphConfigPar
while ( !sOptConfig )
{
#ifdef SYSCONFDIR
- sOptConfig = SYSCONFDIR "/sphinx.conf";
+ sOptConfig = SYSCONFDIR "/csft.conf";
if ( sphIsReadable ( sOptConfig ) )
break;
#endif
- sOptConfig = "./sphinx.conf";
+ sOptConfig = "./csft.conf";
if ( sphIsReadable ( sOptConfig ) )
break;
@@ -1713,9 +1728,9 @@ const char * sphLoadConfig ( const char * sOptConfig, bool bQuiet, CSphConfigPar
if ( !sOptConfig )
sphDie ( "no readable config file (looked in "
#ifdef SYSCONFDIR
- SYSCONFDIR "/sphinx.conf, "
+ SYSCONFDIR "/csft.conf, "
#endif
- "./sphinx.conf)" );
+ "./csft.conf)" );
if ( !bQuiet )
fprintf ( stdout, "using config file '%s'...\n", sOptConfig );
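
Putting the new index keys together, a minimal index section for this patch might look like the following (paths are illustrative; charset_type must be exactly "zh_cn.utf-8" for the mmseg tokenizer to be selected, even though g_dKeysIndex still lists the key as removed upstream):

    index cn_example
    {
        source           = cn_source
        path             = /var/data/cn_example
        charset_type     = zh_cn.utf-8            # selects TOKENIZER_ZHCN_UTF8 (this patch)
        charset_dictpath = /usr/local/mmseg3/etc  # directory holding the mmseg dictionary
        charset_debug    = 0                      # 1 = dump tokens to stdout while indexing
    }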
diff --git a/src/sphinxutils.h b/src/sphinxutils.h
index 9542afd..9a8a625 100644
--- a/src/sphinxutils.h
+++ b/src/sphinxutils.h
@@ -147,6 +147,7 @@ enum
// where was TOKENIZER_SBCS=1 once
TOKENIZER_UTF8 = 2,
TOKENIZER_NGRAM = 3
+ , TOKENIZER_ZHCN_UTF8 = 4
};
/// load config file
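
Everything below is the same change set rebased onto Sphinx 2.3.1; only blob hashes, hunk offsets, and surrounding context differ from the 2.2.11 patch above.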
diff --git a/.gitignore b/.gitignore
index 9c2c126..5ff0e50 100644
--- a/.gitignore
+++ b/.gitignore
@@ -63,3 +63,10 @@
/test/ql/data/*.lock
/test/ql/*.class
/test/ql/*.exe
+
+# for qt-creator
+/*.user
+
+# for patch
+*.rej
+*.orig
diff --git a/acinclude.m4 b/acinclude.m4
index e09697e..3ae78b0 100644
--- a/acinclude.m4
+++ b/acinclude.m4
@@ -297,6 +297,95 @@ ERROR: cannot find PostgreSQL libraries. If you want to compile with PosgregSQL
fi
])
+dnl ---------------------------------------------------------------------------
+dnl Macro: AC_CHECK_MMSEG
+dnl ---------------------------------------------------------------------------
+
+AC_DEFUN([AC_CHECK_MMSEG],[
+
+if test [ -z "$MMSEG_CFLAGS" ]
+then
+ for CANDIDATE in "$user_mmseg_includes" "/usr/local/include/mmseg" "/usr/include/mmseg"
+ do
+ if test [ -n "$CANDIDATE" -a -r "$CANDIDATE/Segmenter.h" ]
+ then
+ MMSEG_CFLAGS="-I$CANDIDATE"
+ break
+ fi
+ done
+fi
+
+# explicit overrides will be applied later
+if test [ -z "$MMSEG_LIBS" ]
+then
+ for CANDIDATE in "$user_mmseg_libs" "/usr/lib64" \
+ "/usr/local/lib" "/usr/local/mmseg/lib" \
+ "/usr/local/lib/mmseg" "/usr/lib" \
+ "/opt/mmseg/lib"
+ do
+ if test [ -n "$CANDIDATE" -a -d "$CANDIDATE" ]
+ then
+ MMSEG_LIBS="-L$CANDIDATE -lmmseg"
+ break
+ fi
+ done
+fi
+
+# apply explicit include path overrides
+AC_ARG_WITH([mmseg-includes],
+ AC_HELP_STRING([--with-mmseg-includes], [path to libmmseg header files]),
+ [ac_cv_mmseg_includes=$withval])
+if test [ -n "$ac_cv_mmseg_includes" ]
+then
+ MMSEG_CFLAGS="-I$ac_cv_mmseg_includes"
+fi
+
+
+# apply explicit lib path overrides
+AC_ARG_WITH([mmseg-libs],
+ AC_HELP_STRING([--with-mmseg-libs], [path to libmmseg libraries]),
+ [ac_cv_mmseg_libs=$withval])
+if test [ -n "$ac_cv_mmseg_libs" ]
+then
+ # Trim trailing '.libs' if the user passed it in the --with-mmseg-libs option
+ ac_cv_mmseg_libs=`echo ${ac_cv_mmseg_libs} | sed -e 's/.libs$//' \
+ -e 's+.libs/$++'`
+ MMSEG_LIBS="-L$ac_cv_mmseg_libs -lmmseg"
+fi
+
+# now that we did all we could, perform final checks
+AC_MSG_CHECKING([libmmseg include files])
+if test [ -z "$MMSEG_CFLAGS" ]
+then
+ AC_MSG_ERROR([missing include files.
+
+******************************************************************************
+ERROR: cannot find libmmseg include files.
+
+To disable libmmseg support, use --without-mmseg option.
+******************************************************************************
+])
+else
+ AC_MSG_RESULT([$MMSEG_CFLAGS])
+fi
+
+AC_MSG_CHECKING([libmmseg libraries])
+if test [ -z "$MMSEG_LIBS" ]
+then
+ AC_MSG_ERROR([missing libraries.
+
+******************************************************************************
+ERROR: cannot find libmmseg libraries.
+
+To disable libmmseg support, use --without-mmseg option.
+******************************************************************************
+])
+else
+ AC_MSG_RESULT([$MMSEG_LIBS])
+fi
+
+])
+
dnl ---------------------------------------------------------------------------
dnl Macro: AC_CHECK_LIBSTEMMER
dnl Check the libstemmer first in custom include path in --with-libstemmer=*
diff --git a/configure.ac b/configure.ac
index 96fa3b4..f614a10 100644
--- a/configure.ac
+++ b/configure.ac
@@ -66,6 +66,7 @@ fi
AC_PROG_CC
AC_PROG_CXX
+AM_PROG_AR
AC_PROG_RANLIB
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
@@ -426,6 +427,24 @@ else
fi
AM_CONDITIONAL(USE_PGSQL, test x$ac_cv_use_pgsql != xno -o x$ac_cv_use_satic_pgsql != xno )
+dnl ---
+# check if we should compile with libmmseg (mmseg Chinese segmenter) support
+AC_ARG_WITH([mmseg],
+ AC_HELP_STRING([--with-mmseg], [compile with libmmseg, a mmseg Chinese Segmenter support (default is enabled)]),
+ [ac_cv_use_mmseg=$withval], [ac_cv_use_mmseg=yes]
+)
+AC_MSG_CHECKING([whether to compile with libmmseg support])
+if test x$ac_cv_use_mmseg != xno; then
+ AC_MSG_RESULT([yes])
+ AC_CHECK_MMSEG([$ac_cv_use_mmseg])
+ AC_DEFINE(USE_MMSEG,1,[Define to 1 if you want to compile with libmmseg support])
+ AC_SUBST([MMSEG_LIBS])
+ AC_SUBST([MMSEG_CFLAGS])
+else
+ AC_MSG_RESULT([no])
+fi
+AM_CONDITIONAL(USE_MMSEG, test x$ac_cv_use_mmseg != xno)
+
# add macports include directory
if (echo $MYSQL_LIBS | grep -q -- -L/opt/local/lib); then
MYSQL_CFLAGS="$MYSQL_CFLAGS -I/opt/local/include"
@@ -480,7 +499,7 @@ AM_CONDITIONAL(USE_INTERNAL_LIBSTEMMER, test x$ac_cv_use_internal_libstemmer !=
dnl ---
# we can now set preprocessor flags for both C and C++ compilers
-CPPFLAGS="$CPPFLAGS $MYSQL_CFLAGS $PGSQL_CFLAGS $LIBSTEMMER_CFLAGS"
+CPPFLAGS="$CPPFLAGS $MYSQL_CFLAGS $PGSQL_CFLAGS $LIBSTEMMER_CFLAGS $MMSEG_CFLAGS"
AC_ARG_WITH([libexpat],
diff --git a/libstemmer_c/Makefile.am b/libstemmer_c/Makefile.am
index a973921..fb93b5f 100644
--- a/libstemmer_c/Makefile.am
+++ b/libstemmer_c/Makefile.am
@@ -1,3 +1,4 @@
+AUTOMAKE_OPTIONS = subdir-objects
if USE_LIBSTEMMER
noinst_LIBRARIES = libstemmer.a
include $(srcdir)/mkinc.mak
diff --git a/src/Makefile.am b/src/Makefile.am
index 63b7d8f..3a1ba55 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -30,5 +30,9 @@ RLP_INC =
endif
AM_CPPFLAGS = $(LIBRE2_CFLAGS) $(RLP_INC) -DSYSCONFDIR="\"$(sysconfdir)\"" -DDATADIR="\"$(localstatedir)/data\""
+if USE_MMSEG
+COMMON_LIBS = libsphinx.a $(LIBSTEMMER_LIBS) $(MYSQL_LIBS) $(PGSQL_LIBS) $(LIBRE2_LIBS) $(RLP_LIBS) $(MMSEG_LIBS)
+else
COMMON_LIBS = libsphinx.a $(LIBSTEMMER_LIBS) $(MYSQL_LIBS) $(PGSQL_LIBS) $(LIBRE2_LIBS) $(RLP_LIBS)
+endif
LDADD = $(COMMON_LIBS)
diff --git a/src/indexer.cpp b/src/indexer.cpp
index fa2296d..00a2996 100644
--- a/src/indexer.cpp
+++ b/src/indexer.cpp
@@ -1766,7 +1766,7 @@ int main ( int argc, char ** argv )
"\n"
"Options are:\n"
"--config <file>\t\tread configuration from specified file\n"
- "\t\t\t(default is sphinx.conf)\n"
+ "\t\t\t(default is csft.conf)\n"
"--all\t\t\treindex all configured indexes\n"
"--quiet\t\t\tbe quiet, only print errors\n"
"--verbose\t\tverbose indexing issues report\n"
@@ -1795,8 +1795,8 @@ int main ( int argc, char ** argv )
"--keep-attrs\t\tretain attributes from the old index"
"\n"
"Examples:\n"
- "indexer --quiet myidx1\treindex 'myidx1' defined in 'sphinx.conf'\n"
- "indexer --all\t\treindex all indexes defined in 'sphinx.conf'\n" );
+ "indexer --quiet myidx1\treindex 'myidx1' defined in 'csft.conf'\n"
+ "indexer --all\t\treindex all indexes defined in 'csft.conf'\n" );
}
return 1;
diff --git a/src/searchd.cpp b/src/searchd.cpp
index 43d2ab8..6619e69 100644
--- a/src/searchd.cpp
+++ b/src/searchd.cpp
@@ -20772,7 +20772,7 @@ void ShowHelp ()
"Options are:\n"
"-h, --help\t\tdisplay this help message\n"
"-c, --config <file>\tread configuration from specified file\n"
- "\t\t\t(default is sphinx.conf)\n"
+ "\t\t\t(default is csft.conf)\n"
"--stop\t\t\tsend SIGTERM to currently running searchd\n"
"--stopwait\t\tsend SIGTERM and wait until actual exit\n"
"--status\t\tget ant print status variables\n"
@@ -20809,9 +20809,9 @@ void ShowHelp ()
"--safetrace\t\tonly use system backtrace() call in crash reports\n"
"\n"
"Examples:\n"
- "searchd --config /usr/local/sphinx/etc/sphinx.conf\n"
+ "searchd --config /usr/local/sphinx/etc/csft.conf\n"
#if USE_WINDOWS
- "searchd --install --config c:\\sphinx\\sphinx.conf\n"
+ "searchd --install --config c:\\sphinx\\csft.conf\n"
#endif
);
}
@@ -23833,12 +23833,12 @@ int WINAPI ServiceMain ( int argc, char **argv )
while ( !g_sConfigFile.cstr() )
{
#ifdef SYSCONFDIR
- g_sConfigFile = SYSCONFDIR "/sphinx.conf";
+ g_sConfigFile = SYSCONFDIR "/csft.conf";
if ( sphIsReadable ( g_sConfigFile.cstr () ) )
break;
#endif
- g_sConfigFile = "./sphinx.conf";
+ g_sConfigFile = "./";
if ( sphIsReadable ( g_sConfigFile.cstr () ) )
break;
@@ -23849,9 +23849,9 @@ int WINAPI ServiceMain ( int argc, char **argv )
if ( !g_sConfigFile.cstr () )
sphFatal ( "no readable config file (looked in "
#ifdef SYSCONFDIR
- SYSCONFDIR "/sphinx.conf, "
+ SYSCONFDIR "/csft.conf, "
#endif
- "./sphinx.conf)." );
+ "./csft.conf)." );
sphInfo ( "using config file '%s'...", g_sConfigFile.cstr () );
diff --git a/src/sphinx.cpp b/src/sphinx.cpp
index c63293f..3df0d2f 100644
--- a/src/sphinx.cpp
+++ b/src/sphinx.cpp
@@ -142,6 +142,16 @@
#pragma message("Automatically linking with btutils.lib")
#endif
+#if ( USE_WINDOWS && USE_MMSEG )
+ #if _DEBUG
+ #pragma comment(linker, "/defaultlib:libcss_d.lib")
+ #else
+ #pragma comment(linker, "/defaultlib:libcss.lib")
+ #endif
+ #pragma message("Automatically linking with libcss.lib")
+ #pragma warning(disable:4530) // for ugly mmseg
+#endif
+
/////////////////////////////////////////////////////////////////////////////
// logf() is not there sometimes (eg. Solaris 9)
@@ -2556,10 +2566,14 @@ class CSphTokenizer_UTF8 : public CSphTokenizerBase2
public:
CSphTokenizer_UTF8 ();
virtual void SetBuffer ( const BYTE * sBuffer, int iLength );
+ virtual bool IsPreTokenized() { return m_bPreTokenized; }
virtual BYTE * GetToken ();
virtual ISphTokenizer * Clone ( ESphTokenizerClone eMode ) const;
virtual int GetCodepointLength ( int iCode ) const;
virtual int GetMaxCodepointLength () const { return m_tLC.GetMaxCodepointLength(); }
+
+protected:
+ bool m_bPreTokenized;
};
@@ -2580,6 +2594,78 @@ protected:
CSphString m_sNgramCharsStr;
};
+#if USE_MMSEG
+
+#include "SegmenterManager.h"
+#include "Segmenter.h"
+
+typedef CSR_Singleton<css::SegmenterManager> SegmenterManagerSingleInstance;
+
+template < bool IS_QUERY >
+class CSphTokenizer_UTF8MMSeg : public CSphTokenizer_UTF8<IS_QUERY>
+{
+public:
+ CSphTokenizer_UTF8MMSeg ();
+ ~CSphTokenizer_UTF8MMSeg() {
+ if(m_seg){
+ SafeDelete ( m_seg );
+ }
+ }
+
+ virtual void SetBuffer ( const BYTE * sBuffer, int iLength );
+ virtual BYTE * GetToken ();
+ virtual ISphTokenizer * Clone ( ESphTokenizerClone eMode ) const;
+ virtual const BYTE* GetThesaurus(BYTE * sBuffer, int iLength );
+ bool IsSegment(const BYTE * pCur);
+
+ CSphTokenizerBase* SetDictPath(const char* path) { m_dictpath = path; return this; }
+
+ virtual const char * GetBufferPtr () const { return (const char *) CSphTokenizer_UTF8MMSeg<IS_QUERY>::m_pCur; }
+
+ virtual const char * GetTokenStart () const { return m_segToken; }
+
+ virtual int GetLastTokenLen () const { return m_iLastTokenLenMMSeg; }
+
+ virtual void ReloadSegDictionary() {
+ if(m_seg){
+ SafeDelete ( m_seg );
+ }
+ m_seg = NULL;
+
+ if(m_mgr) {
+ SegmenterManagerSingleInstance::Free(); // free preexist instance.
+ m_mgr = NULL;
+ }
+ }
+protected:
+ char* m_segToken;
+ size_t m_segoffset;
+ int m_iLastTokenLenMMSeg;
+ BYTE m_sAccumSeg [ 3*SPH_MAX_WORD_LEN+3 ]; ///< folded token accumulator
+ BYTE * m_pAccumSeg; ///< current accumulator position
+ CSphVector<u2> m_tokenlens;
+ int m_tokenpos;
+protected:
+ // virtual bool IsSegment(const BYTE * pCur);
+ CSphString m_dictpath;
+
+ // mmseg related
+ css::Segmenter* m_seg;
+ css::SegmenterManager* m_mgr;
+ css::Segmenter* GetSegmenter(const char* dict_path){
+ int nRet = 0;
+ if(!m_mgr) {
+ m_mgr = SegmenterManagerSingleInstance::Get();
+ if(dict_path)
+ nRet = m_mgr->init(dict_path);
+ }
+ if(nRet == 0 && !m_seg)
+ m_seg = m_mgr->getSegmenter(false);
+ return m_seg;
+ }
+};
+
+#endif
struct CSphNormalForm
{
@@ -3798,6 +3884,15 @@ ISphTokenizer * sphCreateUTF8NgramTokenizer ()
return new CSphTokenizer_UTF8Ngram<false> ();
}
+#if USE_MMSEG
+ISphTokenizer * sphCreateUTF8ChineseTokenizer ( const char* dict_path )
+{
+ CSphTokenizer_UTF8MMSeg<false>* tokenizer = new CSphTokenizer_UTF8MMSeg<false> ();
+ tokenizer->SetDictPath(dict_path);
+ return tokenizer;
+}
+#endif
+
/////////////////////////////////////////////////////////////////////////////
enum
@@ -4383,6 +4478,7 @@ CSphTokenizerSettings::CSphTokenizerSettings ()
: m_iType ( TOKENIZER_UTF8 )
, m_iMinWordLen ( 1 )
, m_iNgramLen ( 0 )
+ , m_iDebug ( 0 )
{
}
@@ -4394,7 +4490,11 @@ bool LoadTokenizerSettings ( CSphReader & tReader, CSphTokenizerSettings & tSett
return true;
tSettings.m_iType = tReader.GetByte ();
- if ( tSettings.m_iType!=TOKENIZER_UTF8 && tSettings.m_iType!=TOKENIZER_NGRAM )
+#if USE_MMSEG
+ if ( tSettings.m_iType!=TOKENIZER_UTF8 && tSettings.m_iType!=TOKENIZER_NGRAM && tSettings.m_iType!=TOKENIZER_ZHCN_UTF8)
+#else
+ if ( tSettings.m_iType!=TOKENIZER_UTF8 && tSettings.m_iType!=TOKENIZER_NGRAM)
+#endif
{
sWarning = "can't load an old index with SBCS tokenizer";
return false;
@@ -4422,7 +4522,11 @@ bool LoadTokenizerSettings ( CSphReader & tReader, CSphTokenizerSettings & tSett
tSettings.m_sIgnoreChars = tReader.GetString ();
tSettings.m_iNgramLen = tReader.GetDword ();
tSettings.m_sNgramChars = tReader.GetString ();
- if ( uVersion>=15 )
+#if USE_MMSEG
+ //mmseg --coreseek: the mmseg option makes coreseek and sphinx index formats incompatible.
+ tSettings.m_sDictPath = tReader.GetString ();
+#endif
+ if ( uVersion>=15 )
tSettings.m_sBlendChars = tReader.GetString ();
if ( uVersion>=24 )
tSettings.m_sBlendMode = tReader.GetString();
@@ -4453,6 +4557,10 @@ void SaveTokenizerSettings ( CSphWriter & tWriter, ISphTokenizer * pTokenizer, i
tWriter.PutString ( tSettings.m_sIgnoreChars.cstr () );
tWriter.PutDword ( tSettings.m_iNgramLen );
tWriter.PutString ( tSettings.m_sNgramChars.cstr () );
+#if USE_MMSEG
+ // if mmseg is turned off, the index format is compatible again.
+ tWriter.PutString ( tSettings.m_sDictPath.cstr () );
+#endif
tWriter.PutString ( tSettings.m_sBlendChars.cstr () );
tWriter.PutString ( tSettings.m_sBlendMode.cstr () );
}
@@ -4727,6 +4835,9 @@ ISphTokenizer * ISphTokenizer::Create ( const CSphTokenizerSettings & tSettings,
{
case TOKENIZER_UTF8: pTokenizer = sphCreateUTF8Tokenizer (); break;
case TOKENIZER_NGRAM: pTokenizer = sphCreateUTF8NgramTokenizer (); break;
+#if USE_MMSEG
+ case TOKENIZER_ZHCN_UTF8: pTokenizer = sphCreateUTF8ChineseTokenizer(tSettings.m_sDictPath.cstr()); break;
+#endif
default:
sError.SetSprintf ( "failed to create tokenizer (unknown charset type '%d')", tSettings.m_iType );
return NULL;
@@ -5966,7 +6077,24 @@ CSphTokenizer_UTF8<IS_QUERY>::CSphTokenizer_UTF8 ()
{
CSphString sTmp;
SetCaseFolding ( SPHINX_DEFAULT_UTF8_TABLE, sTmp );
- m_bHasBlend = false;
+
+ // BEGIN CJK: no case folding is applied here; that should be done in the remote tokenizer.
+ // Here we just make sure CJK characters are retained. --coreseek
+ // 4e00 - 9fff CJK unified ideographs
+ // 3000 - 303f CJK symbols and punctuation
+ // 3040 - 30ff Hiragana/Katakana
+ // ff00 - ffff half/fullwidth forms
+ CSphVector<CSphRemapRange> dRemaps;
+ dRemaps.Add ( CSphRemapRange ( 0x4E00, 0x9FFF, 0x4E00 ) );
+ dRemaps.Add ( CSphRemapRange ( 0xFF10, 0xFFFF, 0xFF10 ) );
+ dRemaps.Add ( CSphRemapRange ( 0x3040, 0x30FF, 0x3040 ) );
+
+ m_tLC.AddRemaps ( dRemaps,
+ FLAG_CODEPOINT_NGRAM ); // !COMMIT support other n-gram lengths than 1
+ // ENDCJK
+ m_bPreTokenized = false; // by default use original route.
+
+ m_bHasBlend = false;
}
@@ -5976,10 +6104,29 @@ void CSphTokenizer_UTF8<IS_QUERY>::SetBuffer ( const BYTE * sBuffer, int iLength
// check that old one is over and that new length is sane
assert ( iLength>=0 );
- // set buffer
+ // set buffer
m_pBuffer = sBuffer;
+ // check whether this is a pre-segmented buffer, marked by the prefix U+FFFA
+ // if so, the buffer starts with 0xFFFA, 0x41, then the content --coreseek
+ m_bPreTokenized = false;
+ if(iLength > 4)
+ {
+ // there may be a ' ' (space, 32) as padding; this is not guaranteed
+ unsigned char mask[] = {32, 239, 191, 186, 65};
+ unsigned char mask_bare[] = {239, 191, 186, 65};
+ if(strncmp( (const char *)mask, (const char *)sBuffer, 5) == 0) {
+ // 0xFFFA is a magic marker; if it heads the buffer, mark the buffer pre-tokenized.
+ m_bPreTokenized = true;
+ m_pBuffer += 5;
+ }else
+ if(strncmp( (const char *)mask_bare, (const char *)sBuffer, 4) == 0) {
+ m_bPreTokenized = true;
+ m_pBuffer += 4;
+ }
+ }
+
m_pBufferMax = sBuffer + iLength;
- m_pCur = sBuffer;
+ m_pCur = m_pBuffer;
m_pTokenStart = m_pTokenEnd = NULL;
m_pBlendStart = m_pBlendEnd = NULL;
@@ -5997,7 +6144,7 @@ BYTE * CSphTokenizer_UTF8<IS_QUERY>::GetToken ()
m_bTokenBoundary = false;
m_bWasSynonym = false;
- return m_bHasBlend
+ return m_bHasBlend
? DoGetToken<IS_QUERY,true>()
: DoGetToken<IS_QUERY,false>();
}
@@ -6412,6 +6559,152 @@ BYTE * CSphTokenizer_UTF8Ngram<IS_QUERY>::GetToken ()
assert ( m_iNgramLen==1 );
return CSphTokenizer_UTF8<IS_QUERY>::GetToken ();
}
+//////////////////////////////////////////////////////////////////////////
+#if USE_MMSEG
+//////////////////////////////////////////////////////////////////////////
+template < bool IS_QUERY >
+CSphTokenizer_UTF8MMSeg<IS_QUERY>::CSphTokenizer_UTF8MMSeg ()
+ :CSphTokenizer_UTF8<IS_QUERY>()
+ , m_segoffset(0)
+{
+ //override charmap
+ CSphVector<CSphRemapRange> dRemaps;
+ dRemaps.Add ( CSphRemapRange ( 0x4E00, 0x9FFF, 0x4E00 ) );
+ dRemaps.Add ( CSphRemapRange ( 0xFF10, 0xFFFF, 0xFF10 ) );
+ dRemaps.Add ( CSphRemapRange ( 0x3040, 0x30FF, 0x3040 ) );
+
+ CSphTokenizer_UTF8<IS_QUERY>::m_tLC.AddRemaps ( dRemaps,
+ FLAG_CODEPOINT_NGRAM | FLAG_CODEPOINT_SPECIAL); // !COMMIT support other n-gram lengths than 1
+ m_pAccumSeg = m_sAccumSeg;
+ //m_iLastTokenBufferLen = 0;
+ m_iLastTokenLenMMSeg = 0;
+
+ m_mgr = NULL;
+ m_seg = NULL;
+ m_tokenlens.Reserve(1024*512); // reserve room for 512K token lengths
+}
+
+template < bool IS_QUERY >
+void CSphTokenizer_UTF8MMSeg<IS_QUERY>::SetBuffer ( const BYTE * sBuffer, int iLength )
+{
+ CSphTokenizer_UTF8<IS_QUERY>::SetBuffer(sBuffer, iLength);
+ css::Segmenter* seg = GetSegmenter(m_dictpath.cstr());
+ if(seg)
+ seg->setBuffer((u1*)CSphTokenizer_UTF8MMSeg<IS_QUERY>::m_pBuffer, iLength);
+ else
+ sphDie ( " Tokenizer initialization failure. " );
+ m_segoffset = 0;
+ m_segToken = (char*)CSphTokenizer_UTF8MMSeg<IS_QUERY>::m_pCur;
+
+ m_tokenlens.Reset();
+ m_tokenpos = 0;
+ {
+ u2 len = 0, symlen = 0;
+ while(1){
+ len = 0;
+ char* tok = (char*)seg->peekToken(len,symlen);
+ if(!tok || !*tok || !len)
+ break;
+ seg->popToken(len);
+
+ m_tokenlens.Add(len);
+ //printf("%*.*s/p ",symlen,symlen,tok);
+ }
+ }
+}
+
+template < bool IS_QUERY >
+bool CSphTokenizer_UTF8MMSeg<IS_QUERY>::IsSegment(const BYTE * pCur)
+{
+ // this code might have a bug, but it will be removed in the next release...
+ size_t offset = pCur - CSphTokenizer_UTF8<IS_QUERY>::m_pBuffer;
+ //if(offset == 0) return false;
+ //printf("pcur: %s\n", pCur);
+
+ //css::Segmenter* seg = GetSegmenter(m_dictpath.cstr()); //TODO fill blank here
+ {
+ u2 len = 0, symlen = 0;
+ while(m_segoffset < offset) {
+ //tok = (const char*)seg->peekToken(len, symlen);
+ //seg->popToken(len);
+ len = m_tokenlens[m_tokenpos];
+ m_tokenpos ++;
+ m_segoffset += len;
+ //printf("tok: %*.*s, len=%d\t ",len,len,tok, len);
+ if(m_tokenpos >= m_tokenlens.GetLength() || len==0){
+ //break?
+ break;
+ }
+ }
+ /*
+ printf("\n");
+ printf("seg_off %d vs off %d\n", m_segoffset, offset);
+ if(m_segoffset != offset)
+ printf("seg_pcur: %s\n", pCur);
+ */
+ return (m_segoffset == offset);
+ } //end if seg
+ return true;
+}
+
+template < bool IS_QUERY >
+BYTE * CSphTokenizer_UTF8MMSeg<IS_QUERY>::GetToken ()
+{
+ //return CSphTokenizer_UTF8<IS_QUERY>::GetToken();
+ m_iLastTokenLenMMSeg = 0;
+ //BYTE* tok = CSphTokenizer_UTF8::GetToken();
+ while(!IsSegment(CSphTokenizer_UTF8<IS_QUERY>::m_pCur) || m_pAccumSeg == m_sAccumSeg)
+ {
+ BYTE* tok = CSphTokenizer_UTF8<IS_QUERY>::GetToken();
+ //printf("utf8_token: %s \t ", tok);
+ if(!tok){
+ m_iLastTokenLenMMSeg = 0;
+ return NULL;
+ }
+
+ int token_buf_len = strlen((const char*)tok);
+
+ if(m_pAccumSeg == m_sAccumSeg)
+ m_segToken = (char*)CSphTokenizer_UTF8<IS_QUERY>::m_pTokenStart;
+
+ if ( (m_pAccumSeg - m_sAccumSeg)<SPH_MAX_WORD_LEN ) {
+ ::memcpy(m_pAccumSeg, tok, token_buf_len);
+ m_pAccumSeg += token_buf_len;
+ m_iLastTokenLenMMSeg += CSphTokenizer_UTF8<IS_QUERY>::GetLastTokenLen();
+ }
+ }
+ {
+ *m_pAccumSeg = 0;
+ //m_iLastTokenBufferLen = m_pAccumSeg - m_sAccumSeg;
+ m_pAccumSeg = m_sAccumSeg;
+
+ return m_sAccumSeg;
+ }
+}
+
+template < bool IS_QUERY >
+ISphTokenizer * CSphTokenizer_UTF8MMSeg<IS_QUERY>::Clone ( ESphTokenizerClone eMode ) const
+{
+ CSphTokenizerBase * pClone;
+ if ( eMode!=SPH_CLONE_INDEX ) {
+ pClone = (new CSphTokenizer_UTF8MMSeg<true>())->SetDictPath(m_dictpath.cstr());
+ }else{
+ pClone = (new CSphTokenizer_UTF8MMSeg<false>())->SetDictPath(m_dictpath.cstr());
+ }
+ pClone->CloneBase ( this, eMode );
+ return pClone;
+}
+
+template < bool IS_QUERY >
+const BYTE* CSphTokenizer_UTF8MMSeg<IS_QUERY>::GetThesaurus(BYTE * sBuffer, int iLength )
+{
+ css::Segmenter* seg = GetSegmenter(m_dictpath.cstr());
+ if(seg)
+ return (const BYTE*)seg->thesaurus((const char*)sBuffer, iLength);
+ return NULL;
+}
+
+#endif
//////////////////////////////////////////////////////////////////////////
@@ -9678,6 +9971,7 @@ void CSphIndex::SetupQueryTokenizer()
// create and setup a master copy of query time tokenizer
// that we can then use to create lightweight clones
SafeDelete ( m_pQueryTokenizer );
+ m_pTokenizer->ReloadSegDictionary();
m_pQueryTokenizer = m_pTokenizer->Clone ( SPH_CLONE_QUERY );
if ( IsStarDict() )
{
@@ -25691,6 +25985,7 @@ void CSphSource::Setup ( const CSphSourceSettings & tSettings )
m_bIndexExactWords = tSettings.m_bIndexExactWords;
m_iOvershortStep = Min ( Max ( tSettings.m_iOvershortStep, 0 ), 1 );
m_iStopwordStep = Min ( Max ( tSettings.m_iStopwordStep, 0 ), 1 );
+ m_bDebugDump = tSettings.m_bDebugDump; //coreseek: assign debug charset setting
m_bIndexSP = tSettings.m_bIndexSP;
m_dPrefixFields = tSettings.m_dPrefixFields;
m_dInfixFields = tSettings.m_dInfixFields;
@@ -26295,11 +26590,28 @@ void CSphSource_Document::BuildRegularHits ( SphDocID_t uDocid, bool bPayload, b
while ( ( m_iMaxHits==0 || m_tHits.m_dData.GetLength()+BUILD_REGULAR_HITS_COUNT<m_iMaxHits )
&& ( sWord = m_pTokenizer->GetToken() )!=NULL )
{
+ //FIXME: dump token to console --coreseek
+ //debug dump
+ if(m_pTokenizer->DumpToken()) {
+ printf("%s_x ", sWord); // make the same as pre-tokenized text.
+ }
+
+ // fix sWord if in pre-tokenized mode.
+ int iBytes = strlen ( (const char*)sWord );
+ bool bAdvancePos = true;
+ if(m_pTokenizer->IsPreTokenized()) {
+ // m_tState.m_iHitPos should not be 0; guards against a token without the _x suffix appearing at the very beginning.
+ if(sWord[iBytes-1] != 'x' && m_tState.m_iHitPos)
+ bAdvancePos = false; // not an advance token.
+ sWord[iBytes-2] = '\0'; // change token_x -> token\0x
+ iBytes -= 2; // decrease length
+ }
+
m_pDict->SetApplyMorph ( m_pTokenizer->GetMorphFlag() );
int iLastBlendedStart = TrackBlendedStart ( m_pTokenizer, iBlendedHitsStart, m_tHits.Length() );
- if ( !bPayload )
+ if ( !bPayload && bAdvancePos)
{
HITMAN::AddPos ( &m_tState.m_iHitPos, m_tState.m_iBuildLastStep + m_pTokenizer->GetOvershortCount()*m_iOvershortStep );
if ( m_pTokenizer->GetBoundary() )
@@ -26311,7 +26623,7 @@ void CSphSource_Document::BuildRegularHits ( SphDocID_t uDocid, bool bPayload, b
if ( bGlobalPartialMatch )
{
- int iBytes = strlen ( (const char*)sWord );
+ //int iBytes = strlen ( (const char*)sWord );
memcpy ( sBuf + 1, sWord, iBytes );
sBuf[0] = MAGIC_WORD_HEAD;
sBuf[iBytes+1] = '\0';
@@ -26321,7 +26633,7 @@ void CSphSource_Document::BuildRegularHits ( SphDocID_t uDocid, bool bPayload, b
ESphTokenMorph eMorph = m_pTokenizer->GetTokenMorph();
if ( m_bIndexExactWords && eMorph!=SPH_TOKEN_MORPH_GUESS )
{
- int iBytes = strlen ( (const char*)sWord );
+ //int iBytes = strlen ( (const char*)sWord );
memcpy ( sBuf + 1, sWord, iBytes );
sBuf[0] = MAGIC_WORD_HEAD_NONSTEMMED;
sBuf[iBytes+1] = '\0';
@@ -26357,6 +26669,27 @@ void CSphSource_Document::BuildRegularHits ( SphDocID_t uDocid, bool bPayload, b
m_tHits.AddHit ( uDocid, m_pDict->GetWordIDNonStemmed ( sBuf ), m_tState.m_iHitPos );
} else
m_tState.m_iBuildLastStep = m_iStopwordStep;
+#if USE_MMSEG
+ // works only when mmseg is on.
+ // zh_cn only GetThesaurus
+ {
+ int iBytes = strlen ( (const char*)sWord );
+ const BYTE* tbuf_ptr = m_pTokenizer->GetThesaurus(sWord, iBytes);
+ if(tbuf_ptr) {
+ while(*tbuf_ptr) {
+ size_t len = strlen((const char*)tbuf_ptr);
+ SphWordID_t iWord = m_pDict->GetWordID ( tbuf_ptr ,len , true);
+ if ( iWord ) {
+ m_tHits.AddHit ( uDocid, iWord, m_tState.m_iHitPos );
+ // mmseg; do not inc step for we are in 'one' hit.
+ //m_tState.m_iBuildLastStep = m_pTokenizer->TokenIsBlended() ? 0 : 1;
+ }
+ tbuf_ptr += len + 1; //move next
+ }
+ }
+ //end if buf
+ }//end GetThesaurus
+#endif
}
m_tState.m_bProcessingHits = ( sWord!=NULL );
diff --git a/src/sphinx.h b/src/sphinx.h
index 7550ed1..a1579dd 100644
--- a/src/sphinx.h
+++ b/src/sphinx.h
@@ -28,6 +28,7 @@
#define USE_RE2 0 /// whether to compile RE2 support
#define USE_RLP 0 /// whether to compile RLP support
#define USE_WINDOWS 1 /// whether to compile for Windows
+ #define USE_MMSEG 1 /// enable mmseg
#define USE_SYSLOG 0 /// whether to use syslog for logging
#define UNALIGNED_RAM_ACCESS 1
@@ -200,7 +201,7 @@ inline const DWORD * STATIC2DOCINFO ( const DWORD * pAttrs ) { return STATIC2DOC
#endif
#define SPHINX_VERSION "2.3.1" SPHINX_BITS_TAG SPHINX_TAG " (" SPH_SVN_TAGREV ")"
-#define SPHINX_BANNER "Sphinx " SPHINX_VERSION "\nCopyright (c) 2001-2015, Andrew Aksyonoff\nCopyright (c) 2008-2015, Sphinx Technologies Inc (http://sphinxsearch.com)\n\n"
+#define SPHINX_BANNER_ORIG "Sphinx " SPHINX_VERSION "\nCopyright (c) 2001-2015, Andrew Aksyonoff\nCopyright (c) 2008-2015, Sphinx Technologies Inc (http://sphinxsearch.com)\n\n"
#define SPHINX_SEARCHD_PROTO 1
#define SPHINX_CLIENT_VERSION 1
@@ -208,6 +209,10 @@ inline const DWORD * STATIC2DOCINFO ( const DWORD * pAttrs ) { return STATIC2DOC
#define SPH_MAX_FILENAME_LEN 512
#define SPH_MAX_FIELDS 256
+#define CORESEEK_BANNER "Coreseek FullText Search 5.1 \nCopyright (c) 2008-2015, Beijing Choice Software Technologies Inc (http://www.coreseek.com)\n\n"
+#define SPHINX_BANNER2 "" CORESEEK_BANNER "" SPHINX_BANNER_ORIG
+#define SPHINX_BANNER SPHINX_BANNER2
+
/////////////////////////////////////////////////////////////////////////////
extern int64_t g_iIndexerCurrentDocID;
@@ -491,7 +496,10 @@ struct CSphTokenizerSettings
CSphString m_sBlendChars;
CSphString m_sBlendMode;
CSphString m_sIndexingPlugin; ///< this tokenizer wants an external plugin to process its raw output
-
+ int m_iDebug; ///< whether the tokenizer is in debug mode.
+#if USE_MMSEG
+ CSphString m_sDictPath; ///< coreseek: where to find the segmenter's dictionary.
+#endif
CSphTokenizerSettings ();
};
@@ -602,11 +610,16 @@ public:
/// get synonym file info
virtual const CSphSavedFile & GetSynFileInfo () const { return m_tSynFileInfo; }
+ /// whether to dump the tokenizer's output for debugging --coreseek mmseg
+ virtual int DumpToken () { return m_tSettings.m_iDebug; }
public:
/// pass next buffer
virtual void SetBuffer ( const BYTE * sBuffer, int iLength ) = 0;
+ /// whether the current buffer is pre-tokenized --coreseek
+ virtual bool IsPreTokenized() { return false; }
+
/// set current index schema (only intended for the token filter plugins)
virtual bool SetFilterSchema ( const CSphSchema &, CSphString & ) { return true; }
@@ -685,6 +698,10 @@ public:
/// set new buffer ptr (must be within current bounds)
virtual void SetBufferPtr ( const char * sNewPtr ) = 0;
+#if USE_MMSEG
+ virtual const BYTE* GetThesaurus(BYTE * , int ) { return NULL; }
+ virtual void ReloadSegDictionary() { return; } // reload mmseg's dictionary.
+#endif
/// get settings hash
virtual uint64_t GetSettingsFNV () const;
@@ -709,6 +726,9 @@ protected:
CSphLowercaser m_tLC; ///< my lowercaser
int m_iLastTokenLen; ///< last token length, in codepoints
bool m_bTokenBoundary; ///< last token boundary flag (true after boundary codepoint followed by separator)
+#if USE_MMSEG
+ int m_iLastTokenBufferLen; ///< last token buffer length, in bytes -- coreseek; used by the mmseg patch.
+#endif
bool m_bBoundary; ///< boundary flag (true immediately after boundary codepoint)
int m_iBoundaryOffset; ///< boundary character offset (in bytes)
bool m_bWasSpecial; ///< special token flag
@@ -1814,6 +1834,7 @@ struct CSphSourceSettings
int m_iStopwordStep; ///< position step on stopword token (default is 1)
bool m_bIndexSP; ///< whether to index sentence and paragraph delimiters
bool m_bIndexFieldLens; ///< whether to index field lengths
+ int m_bDebugDump; ///< mmseg charset debug output feature
CSphVector<CSphString> m_dPrefixFields; ///< list of prefix fields
CSphVector<CSphString> m_dInfixFields; ///< list of infix fields
diff --git a/src/sphinxutils.cpp b/src/sphinxutils.cpp
index a9e5287..644a43b 100644
--- a/src/sphinxutils.cpp
+++ b/src/sphinxutils.cpp
@@ -407,6 +407,8 @@ static KeyDesc_t g_dKeysIndex[] =
{ "min_word_len", 0, NULL },
{ "charset_type", KEY_REMOVED, NULL },
{ "charset_table", 0, NULL },
+ { "charset_dictpath", 0, NULL }, //coreseek: mmseg's dictionary path
+ { "charset_debug", 0, NULL }, //coreseek: debug output tokens
{ "ignore_chars", 0, NULL },
{ "min_prefix_len", 0, NULL },
{ "min_infix_len", 0, NULL },
@@ -1142,7 +1144,10 @@ void sphConfTokenizer ( const CSphConfigSection & hIndex, CSphTokenizerSettings
{
tSettings.m_iNgramLen = Max ( hIndex.GetInt ( "ngram_len" ), 0 );
- if ( hIndex ( "ngram_chars" ) )
+ if(hIndex("charset_debug"))
+ tSettings.m_iDebug = hIndex["charset_debug"].intval();
+
+ if ( hIndex ( "ngram_chars" ) )
{
if ( tSettings.m_iNgramLen )
tSettings.m_iType = TOKENIZER_NGRAM;
@@ -1150,6 +1155,15 @@ void sphConfTokenizer ( const CSphConfigSection & hIndex, CSphTokenizerSettings
sphWarning ( "ngram_chars specified, but ngram_len=0; IGNORED" );
}
+#if USE_MMSEG
+ //XXX: FIXME: sphinx changed the tokenizer creation process
+ if (hIndex("charset_dictpath") && CSphString(hIndex.GetStr("charset_type")) =="zh_cn.utf-8" )
+ {
+ tSettings.m_sDictPath = hIndex.GetStr("charset_dictpath");
+ tSettings.m_iType = TOKENIZER_ZHCN_UTF8;
+ }
+#endif
+
tSettings.m_sCaseFolding = hIndex.GetStr ( "charset_table" );
tSettings.m_iMinWordLen = Max ( hIndex.GetInt ( "min_word_len", 1 ), 1 );
tSettings.m_sNgramChars = hIndex.GetStr ( "ngram_chars" );
@@ -1283,6 +1297,7 @@ bool sphConfIndex ( const CSphConfigSection & hIndex, CSphIndexSettings & tSetti
tSettings.m_iEmbeddedLimit = hIndex.GetSize ( "embedded_limit", 16384 );
tSettings.m_bIndexFieldLens = hIndex.GetInt ( "index_field_lengths" )!=0;
tSettings.m_sIndexTokenFilter = hIndex.GetStr ( "index_token_filter" );
+ tSettings.m_bDebugDump = hIndex.GetInt ( "charset_debug" )!=0;
// prefix/infix fields
CSphString sFields;
@@ -1583,12 +1598,12 @@ const char * sphLoadConfig ( const char * sOptConfig, bool bQuiet, CSphConfigPar
while ( !sOptConfig )
{
#ifdef SYSCONFDIR
- sOptConfig = SYSCONFDIR "/sphinx.conf";
+ sOptConfig = SYSCONFDIR "/csft.conf";
if ( sphIsReadable ( sOptConfig ) )
break;
#endif
- sOptConfig = "./sphinx.conf";
+ sOptConfig = "./csft.conf";
if ( sphIsReadable ( sOptConfig ) )
break;
@@ -1599,9 +1614,9 @@ const char * sphLoadConfig ( const char * sOptConfig, bool bQuiet, CSphConfigPar
if ( !sOptConfig )
sphDie ( "no readable config file (looked in "
#ifdef SYSCONFDIR
- SYSCONFDIR "/sphinx.conf, "
+ SYSCONFDIR "/csft.conf, "
#endif
- "./sphinx.conf)" );
+ "./csft.conf)" );
if ( !bQuiet )
fprintf ( stdout, "using config file '%s'...\n", sOptConfig );
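For reference, a minimal index section exercising the two keys registered above; paths are placeholders, and charset_dictpath conventionally points at the directory holding mmseg's uni.lib dictionary. Note that stock 2.3 marks charset_type as KEY_REMOVED, so that key may draw a deprecation warning even though this patch still reads it:

    index cn_example
    {
        source           = src_cn
        path             = /var/data/cn_example
        charset_type     = zh_cn.utf-8
        charset_dictpath = /usr/local/mmseg3/etc
        charset_debug    = 1
    }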
diff --git a/src/sphinxutils.h b/src/sphinxutils.h
index 9374980..a350b42 100644
--- a/src/sphinxutils.h
+++ b/src/sphinxutils.h
@@ -146,6 +146,7 @@ enum
// where was TOKENIZER_SBCS=1 once
TOKENIZER_UTF8 = 2,
TOKENIZER_NGRAM = 3
+ , TOKENIZER_ZHCN_UTF8 = 4
};
/// load config file
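(What follows is a second copy of the patch, rebased onto a newer Sphinx tree: SPHINX_VERSION is "2.3.1" in the src/sphinx.h hunk above, while the copy below targets SPHINX_VERSION_NUMBERS "2.3.2". The hunks largely mirror the ones above.)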
diff --git a/.gitignore b/.gitignore
index 6701f58..43d9f68 100644
--- a/.gitignore
+++ b/.gitignore
@@ -61,3 +61,10 @@
/test/ql/data/*.lock
/test/ql/*.class
/test/ql/*.exe
+
+# for qt-creator
+/*.user
+
+# for patch
+*.rej
+*.orig
diff --git a/acinclude.m4 b/acinclude.m4
index e09697e..3ae78b0 100644
--- a/acinclude.m4
+++ b/acinclude.m4
@@ -297,6 +297,95 @@ ERROR: cannot find PostgreSQL libraries. If you want to compile with PosgregSQL
fi
])
+dnl ---------------------------------------------------------------------------
+dnl Macro: AC_CHECK_MMSEG
+dnl ---------------------------------------------------------------------------
+
+AC_DEFUN([AC_CHECK_MMSEG],[
+
+if test [ -z "$MMSEG_CFLAGS" ]
+then
+ for CANDIDATE in "$user_mmseg_includes" "/usr/local/include/mmseg" "/usr/include/mmseg"
+ do
+ if test [ -n "$CANDIDATE" -a -r "$CANDIDATE/Segmenter.h" ]
+ then
+ MMSEG_CFLAGS="-I$CANDIDATE"
+ break
+ fi
+ done
+fi
+
+# explicit overrides will be applied later
+if test [ -z "$MMSEG_LIBS" ]
+then
+ for CANDIDATE in "$user_mmseg_libs" "/usr/lib64" \
+ "/usr/local/lib" "/usr/local/mmseg/lib" \
+ "/usr/local/lib/mmseg" "/usr/lib" \
+ "/opt/mmseg/lib"
+ do
+ if test [ -n "$CANDIDATE" -a -d "$CANDIDATE" ]
+ then
+ MMSEG_LIBS="-L$CANDIDATE -lmmseg"
+ break
+ fi
+ done
+fi
+
+# apply explicit include path overrides
+AC_ARG_WITH([mmseg-includes],
+ AC_HELP_STRING([--with-mmseg-includes], [path to libmmseg header files]),
+ [ac_cv_mmseg_includes=$withval])
+if test [ -n "$ac_cv_mmseg_includes" ]
+then
+ MMSEG_CFLAGS="-I$ac_cv_mmseg_includes"
+fi
+
+
+# apply explicit lib path overrides
+AC_ARG_WITH([mmseg-libs],
+ AC_HELP_STRING([--with-mmseg-libs], [path to libmmseg libraries]),
+ [ac_cv_mmseg_libs=$withval])
+if test [ -n "$ac_cv_mmseg_libs" ]
+then
+ # Trim trailing '.libs' if user passed it in --with-mmseg-libs option
+ ac_cv_mmseg_libs=`echo ${ac_cv_mmseg_libs} | sed -e 's/.libs$//' \
+ -e 's+.libs/$++'`
+ MMSEG_LIBS="-L$ac_cv_mmseg_libs -lmmseg"
+fi
+
+# now that we did all we could, perform final checks
+AC_MSG_CHECKING([libmmseg include files])
+if test [ -z "$MMSEG_CFLAGS" ]
+then
+ AC_MSG_ERROR([missing include files.
+
+******************************************************************************
+ERROR: cannot find libmmseg include files.
+
+To disable libmmseg support, use --without-mmseg option.
+******************************************************************************
+])
+else
+ AC_MSG_RESULT([$MMSEG_CFLAGS])
+fi
+
+AC_MSG_CHECKING([libmmseg libraries])
+if test [ -z "$MMSEG_LIBS" ]
+then
+ AC_MSG_ERROR([missing libraries.
+
+******************************************************************************
+ERROR: cannot find libmmseg libraries.
+
+To disable libmmseg support, use --without-mmseg option.
+******************************************************************************
+])
+else
+ AC_MSG_RESULT([$MMSEG_LIBS])
+fi
+
+])
+
dnl ---------------------------------------------------------------------------
dnl Macro: AC_CHECK_LIBSTEMMER
dnl Check the libstemmer first in custom include path in --with-libstemmer=*
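Note that the macro above only walks its hard-coded candidate paths while MMSEG_CFLAGS and MMSEG_LIBS are empty, so both can be pre-seeded from the environment instead of patched in; an example with placeholder paths:

    MMSEG_CFLAGS="-I/opt/mmseg/include" \
    MMSEG_LIBS="-L/opt/mmseg/lib -lmmseg" \
    ./configure --with-mmseg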
diff --git a/configure.ac b/configure.ac
index 3962440..97d6914 100644
--- a/configure.ac
+++ b/configure.ac
@@ -69,6 +69,7 @@ fi
AC_PROG_CC
AC_PROG_CXX
+AM_PROG_AR
AC_PROG_RANLIB
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
@@ -439,6 +440,24 @@ else
fi
AM_CONDITIONAL(USE_PGSQL, test x$ac_cv_use_pgsql != xno -o x$ac_cv_use_satic_pgsql != xno )
+dnl ---
+# check if we should compile with libmmseg (mmseg Chinese segmenter) support
+AC_ARG_WITH([mmseg],
+ AC_HELP_STRING([--with-mmseg], [compile with libmmseg Chinese segmenter support (default is enabled)]),
+ [ac_cv_use_mmseg=$withval], [ac_cv_use_mmseg=yes]
+)
+AC_MSG_CHECKING([whether to compile with libmmseg support])
+if test x$ac_cv_use_mmseg != xno; then
+ AC_MSG_RESULT([yes])
+ AC_CHECK_MMSEG([$ac_cv_use_mmseg])
+ AC_DEFINE(USE_MMSEG,1,[Define to 1 if you want to compile with libmmseg support])
+ AC_SUBST([MMSEG_LIBS])
+ AC_SUBST([MMSEG_CFLAGS])
+else
+ AC_MSG_RESULT([no])
+fi
+AM_CONDITIONAL(USE_MMSEG, test x$ac_cv_use_mmseg != xno)
+
# add macports include directory
if (echo $MYSQL_LIBS | grep -q -- -L/opt/local/lib); then
MYSQL_CFLAGS="$MYSQL_CFLAGS -I/opt/local/include"
@@ -493,7 +512,7 @@ AM_CONDITIONAL(USE_INTERNAL_LIBSTEMMER, test x$ac_cv_use_internal_libstemmer !=
dnl ---
# we can now set preprocessor flags for both C and C++ compilers
-CPPFLAGS="$CPPFLAGS $MYSQL_CFLAGS $PGSQL_CFLAGS $LIBSTEMMER_CFLAGS"
+CPPFLAGS="$CPPFLAGS $MYSQL_CFLAGS $PGSQL_CFLAGS $LIBSTEMMER_CFLAGS $MMSEG_CFLAGS"
AC_ARG_WITH([libexpat],
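With the block above in place, mmseg support is enabled by default; a typical build against a libmmseg installed under /usr/local (paths are assumptions) might look like:

    ./configure --with-mmseg \
        --with-mmseg-includes=/usr/local/include/mmseg \
        --with-mmseg-libs=/usr/local/lib
    make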
diff --git a/libstemmer_c/Makefile.am b/libstemmer_c/Makefile.am
index a973921..fb93b5f 100644
--- a/libstemmer_c/Makefile.am
+++ b/libstemmer_c/Makefile.am
@@ -1,3 +1,4 @@
+AUTOMAKE_OPTIONS = subdir-objects
if USE_LIBSTEMMER
noinst_LIBRARIES = libstemmer.a
include $(srcdir)/mkinc.mak
diff --git a/src/Makefile.am b/src/Makefile.am
index d5214c6..be187dd 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -1,3 +1,4 @@
+AUTOMAKE_OPTIONS = subdir-objects
SRC_SPHINX = sphinx.cpp sphinxexcerpt.cpp sphinxquery.cpp \
sphinxsoundex.cpp sphinxmetaphone.cpp sphinxstemen.cpp sphinxstemru.cpp sphinxstemcz.cpp sphinxstemar.cpp \
sphinxutils.cpp sphinxstd.cpp sphinxsort.cpp sphinxexpr.cpp sphinxfilter.cpp \
@@ -31,5 +32,9 @@ RLP_INC =
endif
AM_CPPFLAGS = $(LIBRE2_CFLAGS) $(RLP_INC) -DSYSCONFDIR="\"$(sysconfdir)\"" -DDATADIR="\"$(localstatedir)/data\""
+if USE_MMSEG
+COMMON_LIBS = libsphinx.a $(LIBSTEMMER_LIBS) $(MYSQL_LIBS) $(PGSQL_LIBS) $(LIBRE2_LIBS) $(RLP_LIBS) $(MMSEG_LIBS)
+else
COMMON_LIBS = libsphinx.a $(LIBSTEMMER_LIBS) $(MYSQL_LIBS) $(PGSQL_LIBS) $(LIBRE2_LIBS) $(RLP_LIBS)
+endif
LDADD = $(COMMON_LIBS)
diff --git a/src/indexer.cpp b/src/indexer.cpp
index 3d136ad..07adc4b 100644
--- a/src/indexer.cpp
+++ b/src/indexer.cpp
@@ -1722,7 +1722,7 @@ int main ( int argc, char ** argv )
"\n"
"Options are:\n"
"--config <file>\t\tread configuration from specified file\n"
- "\t\t\t(default is sphinx.conf)\n"
+ "\t\t\t(default is csft.conf)\n"
"--all\t\t\treindex all configured indexes\n"
"--quiet\t\t\tbe quiet, only print errors\n"
"--verbose\t\tverbose indexing issues report\n"
@@ -1751,8 +1751,8 @@ int main ( int argc, char ** argv )
"--keep-attrs\t\tretain attributes from the old index"
"\n"
"Examples:\n"
- "indexer --quiet myidx1\treindex 'myidx1' defined in 'sphinx.conf'\n"
- "indexer --all\t\treindex all indexes defined in 'sphinx.conf'\n" );
+ "indexer --quiet myidx1\treindex 'myidx1' defined in 'csft.conf'\n"
+ "indexer --all\t\treindex all indexes defined in 'csft.conf'\n" );
}
return 1;
diff --git a/src/searchd.cpp b/src/searchd.cpp
index a845add..6ec9d11 100644
--- a/src/searchd.cpp
+++ b/src/searchd.cpp
@@ -18917,7 +18917,7 @@ void ShowHelp ()
"Options are:\n"
"-h, --help\t\tdisplay this help message\n"
"-c, --config <file>\tread configuration from specified file\n"
- "\t\t\t(default is sphinx.conf)\n"
+ "\t\t\t(default is csft.conf)\n"
"--stop\t\t\tsend SIGTERM to currently running searchd\n"
"--stopwait\t\tsend SIGTERM and wait until actual exit\n"
"--status\t\tget ant print status variables\n"
@@ -18954,9 +18954,9 @@ void ShowHelp ()
"--safetrace\t\tonly use system backtrace() call in crash reports\n"
"\n"
"Examples:\n"
- "searchd --config /usr/local/sphinx/etc/sphinx.conf\n"
+ "searchd --config /usr/local/sphinx/etc/csft.conf\n"
#if USE_WINDOWS
- "searchd --install --config c:\\sphinx\\sphinx.conf\n"
+ "searchd --install --config c:\\sphinx\\csft.conf\n"
#endif
);
}
@@ -22508,12 +22508,12 @@ int WINAPI ServiceMain ( int argc, char **argv )
while ( !g_sConfigFile.cstr() )
{
#ifdef SYSCONFDIR
- g_sConfigFile = SYSCONFDIR "/sphinx.conf";
+ g_sConfigFile = SYSCONFDIR "/csft.conf";
if ( sphIsReadable ( g_sConfigFile.cstr () ) )
break;
#endif
- g_sConfigFile = "./sphinx.conf";
+ g_sConfigFile = "./";
if ( sphIsReadable ( g_sConfigFile.cstr () ) )
break;
@@ -22524,9 +22524,9 @@ int WINAPI ServiceMain ( int argc, char **argv )
if ( !g_sConfigFile.cstr () )
sphFatal ( "no readable config file (looked in "
#ifdef SYSCONFDIR
- SYSCONFDIR "/sphinx.conf, "
+ SYSCONFDIR "/csft.conf, "
#endif
- "./sphinx.conf)." );
+ "./csft.conf)." );
sphInfo ( "using config file '%s'...", g_sConfigFile.cstr () );
diff --git a/src/sphinx.cpp b/src/sphinx.cpp
index 97d72c3..7a666da 100644
--- a/src/sphinx.cpp
+++ b/src/sphinx.cpp
@@ -133,6 +133,16 @@
#endif
#endif
+#if ( USE_WINDOWS && USE_MMSEG )
+ #if _DEBUG
+ #pragma comment(linker, "/defaultlib:libcss_d.lib")
+ #else
+ #pragma comment(linker, "/defaultlib:libcss.lib")
+ #endif
+ #pragma message("Automatically linking with libcss.lib")
+ #pragma warning(disable:4530) // for ugly mmseg
+#endif
+
/////////////////////////////////////////////////////////////////////////////
// logf() is not there sometimes (eg. Solaris 9)
@@ -2417,10 +2427,14 @@ class CSphTokenizer_UTF8 : public CSphTokenizerBase2
public:
CSphTokenizer_UTF8 ();
virtual void SetBuffer ( const BYTE * sBuffer, int iLength );
+ virtual bool IsPreTokenized() { return m_bPreTokenized; }
virtual BYTE * GetToken ();
virtual ISphTokenizer * Clone ( ESphTokenizerClone eMode ) const;
virtual int GetCodepointLength ( int iCode ) const;
virtual int GetMaxCodepointLength () const { return m_tLC.GetMaxCodepointLength(); }
+
+protected:
+ bool m_bPreTokenized;
};
@@ -2441,6 +2455,78 @@ protected:
CSphString m_sNgramCharsStr;
};
+#if USE_MMSEG
+
+#include "SegmenterManager.h"
+#include "Segmenter.h"
+
+typedef CSR_Singleton<css::SegmenterManager> SegmenterManagerSingleInstance;
+
+template < bool IS_QUERY >
+class CSphTokenizer_UTF8MMSeg : public CSphTokenizer_UTF8<IS_QUERY>
+{
+public:
+ CSphTokenizer_UTF8MMSeg ();
+ ~CSphTokenizer_UTF8MMSeg() {
+ if(m_seg){
+ SafeDelete ( m_seg );
+ }
+ }
+
+ virtual void SetBuffer ( const BYTE * sBuffer, int iLength );
+ virtual BYTE * GetToken ();
+ virtual ISphTokenizer * Clone ( ESphTokenizerClone eMode ) const;
+ virtual const BYTE* GetThesaurus(BYTE * sBuffer, int iLength );
+ bool IsSegment(const BYTE * pCur);
+
+ CSphTokenizerBase* SetDictPath(const char* path) { m_dictpath = path; return this; }
+
+ virtual const char * GetBufferPtr () const { return (const char *) CSphTokenizer_UTF8MMSeg<IS_QUERY>::m_pCur; }
+
+ virtual const char * GetTokenStart () const { return m_segToken; }
+
+ virtual int GetLastTokenLen () const { return m_iLastTokenLenMMSeg; }
+
+ virtual void ReloadSegDictionary() {
+ if(m_seg){
+ SafeDelete ( m_seg );
+ }
+ m_seg = NULL;
+
+ if(m_mgr) {
+ SegmenterManagerSingleInstance::Free(); // free the pre-existing instance.
+ m_mgr = NULL;
+ }
+ }
+protected:
+ char* m_segToken;
+ size_t m_segoffset;
+ int m_iLastTokenLenMMSeg;
+ BYTE m_sAccumSeg [ 3*SPH_MAX_WORD_LEN+3 ]; ///< folded token accumulator
+ BYTE * m_pAccumSeg; ///< current accumulator position
+ CSphVector<u2> m_tokenlens;
+ int m_tokenpos;
+protected:
+ // virtual bool IsSegment(const BYTE * pCur);
+ CSphString m_dictpath;
+
+ // mmseg related
+ css::Segmenter* m_seg;
+ css::SegmenterManager* m_mgr;
+ css::Segmenter* GetSegmenter(const char* dict_path){
+ int nRet = 0;
+ if(!m_mgr) {
+ m_mgr = SegmenterManagerSingleInstance::Get();
+ if(dict_path)
+ nRet = m_mgr->init(dict_path);
+ }
+ if(nRet == 0 && !m_seg)
+ m_seg = m_mgr->getSegmenter(false);
+ return m_seg;
+ }
+};
+
+#endif
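Together with the sphCreateUTF8ChineseTokenizer factory added in the next hunk, the intended call sequence is roughly the following sketch (dictionary path is a placeholder, error handling omitted, usual Sphinx headers assumed):

    // sketch: push a UTF-8 buffer through the mmseg-backed tokenizer
    ISphTokenizer * pTok = sphCreateUTF8ChineseTokenizer ( "/usr/local/mmseg3/etc" );
    const char sText[] = "中文分词测试";
    pTok->SetBuffer ( (const BYTE *)sText, sizeof(sText)-1 );
    while ( BYTE * sWord = pTok->GetToken() )
        printf ( "%s\n", (const char *)sWord );   // one mmseg segment per call
    SafeDelete ( pTok );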
struct CSphNormalForm
{
@@ -2782,6 +2868,15 @@ ISphTokenizer * sphCreateUTF8NgramTokenizer ()
return new CSphTokenizer_UTF8Ngram<false> ();
}
+#if USE_MMSEG
+ISphTokenizer * sphCreateUTF8ChineseTokenizer ( const char* dict_path )
+{
+ CSphTokenizer_UTF8MMSeg<false>* tokenizer = new CSphTokenizer_UTF8MMSeg<false> ();
+ tokenizer->SetDictPath(dict_path);
+ return tokenizer;
+}
+#endif
+
/////////////////////////////////////////////////////////////////////////////
enum
@@ -3344,6 +3439,7 @@ CSphTokenizerSettings::CSphTokenizerSettings ()
: m_iType ( TOKENIZER_UTF8 )
, m_iMinWordLen ( 1 )
, m_iNgramLen ( 0 )
+ , m_iDebug ( 0 )
{
}
@@ -3355,7 +3451,11 @@ bool LoadTokenizerSettings ( CSphReader & tReader, CSphTokenizerSettings & tSett
return true;
tSettings.m_iType = tReader.GetByte ();
- if ( tSettings.m_iType!=TOKENIZER_UTF8 && tSettings.m_iType!=TOKENIZER_NGRAM )
+#if USE_MMSEG
+ if ( tSettings.m_iType!=TOKENIZER_UTF8 && tSettings.m_iType!=TOKENIZER_NGRAM && tSettings.m_iType!=TOKENIZER_ZHCN_UTF8)
+#else
+ if ( tSettings.m_iType!=TOKENIZER_UTF8 && tSettings.m_iType!=TOKENIZER_NGRAM)
+#endif
{
sWarning = "can't load an old index with SBCS tokenizer";
return false;
@@ -3383,7 +3483,11 @@ bool LoadTokenizerSettings ( CSphReader & tReader, CSphTokenizerSettings & tSett
tSettings.m_sIgnoreChars = tReader.GetString ();
tSettings.m_iNgramLen = tReader.GetDword ();
tSettings.m_sNgramChars = tReader.GetString ();
- if ( uVersion>=15 )
+#if USE_MMSEG
+ //mmseg --coreseek: the mmseg option makes coreseek and sphinx index formats incompatible.
+ tSettings.m_sDictPath = tReader.GetString ();
+#endif
+ if ( uVersion>=15 )
tSettings.m_sBlendChars = tReader.GetString ();
if ( uVersion>=24 )
tSettings.m_sBlendMode = tReader.GetString();
@@ -3414,6 +3518,10 @@ void SaveTokenizerSettings ( CSphWriter & tWriter, ISphTokenizer * pTokenizer, i
tWriter.PutString ( tSettings.m_sIgnoreChars.cstr () );
tWriter.PutDword ( tSettings.m_iNgramLen );
tWriter.PutString ( tSettings.m_sNgramChars.cstr () );
+#if USE_MMSEG
+ // if mmseg is turned off, the index format is compatible again.
+ tWriter.PutString ( tSettings.m_sDictPath.cstr () );
+#endif
tWriter.PutString ( tSettings.m_sBlendChars.cstr () );
tWriter.PutString ( tSettings.m_sBlendMode.cstr () );
}
@@ -3688,6 +3796,9 @@ ISphTokenizer * ISphTokenizer::Create ( const CSphTokenizerSettings & tSettings,
{
case TOKENIZER_UTF8: pTokenizer = sphCreateUTF8Tokenizer (); break;
case TOKENIZER_NGRAM: pTokenizer = sphCreateUTF8NgramTokenizer (); break;
+#if USE_MMSEG
+ case TOKENIZER_ZHCN_UTF8: pTokenizer = sphCreateUTF8ChineseTokenizer(tSettings.m_sDictPath.cstr()); break;
+#endif
default:
sError.SetSprintf ( "failed to create tokenizer (unknown charset type '%d')", tSettings.m_iType );
return NULL;
@@ -4760,7 +4871,24 @@ CSphTokenizer_UTF8<IS_QUERY>::CSphTokenizer_UTF8 ()
{
CSphString sTmp;
SetCaseFolding ( SPHINX_DEFAULT_UTF8_TABLE, sTmp );
- m_bHasBlend = false;
+
+ // BEGIN CJK: there is no case folding here; that should be done in the remote tokenizer.
+ // Here we only make sure CJK characters are kept. --coreseek
+ // 4e00 - 9fff CJK unified ideographs
+ // 3000 - 303f CJK symbols and punctuation
+ // 3040 - 30ff Hiragana/Katagana
+ // ff00 - ffff half/fullwidth forms
+ CSphVector<CSphRemapRange> dRemaps;
+ dRemaps.Add ( CSphRemapRange ( 0x4E00, 0x9FFF, 0x4E00 ) );
+ dRemaps.Add ( CSphRemapRange ( 0xFF10, 0xFFFF, 0xFF10 ) );
+ dRemaps.Add ( CSphRemapRange ( 0x3040, 0x30FF, 0x3040 ) );
+
+ m_tLC.AddRemaps ( dRemaps,
+ FLAG_CODEPOINT_NGRAM ); // !COMMIT support other n-gram lengths than 1
+ // ENDCJK
+ m_bPreTokenized = false; // by default use the original code path.
+
+ m_bHasBlend = false;
}
@@ -4770,10 +4898,29 @@ void CSphTokenizer_UTF8<IS_QUERY>::SetBuffer ( const BYTE * sBuffer, int iLength
// check that old one is over and that new length is sane
assert ( iLength>=0 );
- // set buffer
+ // set buffer
m_pBuffer = sBuffer;
+ // check whether this is a pre-segmented buffer, marked by the prefix 0xFFFA
+ // if true, the bytes should be 0xFFFA, 0x41, [ctx] --coreseek
+ m_bPreTokenized = false;
+ if(iLength > 4)
+ {
+ // there is a ' ' (space, 32) as padding. might not true
+ unsigned char mask[] = {32, 239, 191, 186, 65};
+ unsigned char mask_bare[] = {239, 191, 186, 65};
+ if(strncmp( (const char *)mask, (const char *)sBuffer, 5) == 0) {
+ // 0xFFFA is a magic number; if it appears at the head, mark this buffer as pre-tokenized.
+ m_bPreTokenized = true;
+ m_pBuffer += 5;
+ }else
+ if(strncmp( (const char *)mask_bare, (const char *)sBuffer, 4) == 0) {
+ m_bPreTokenized = true;
+ m_pBuffer += 4;
+ }
+ }
+
m_pBufferMax = sBuffer + iLength;
- m_pCur = sBuffer;
+ m_pCur = m_pBuffer;
m_pTokenStart = m_pTokenEnd = NULL;
m_pBlendStart = m_pBlendEnd = NULL;
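In other words, a data source can mark a field as pre-tokenized by prefixing it with U+FFFA (EF BF BA in UTF-8) followed by 'A' (0x41), optionally preceded by a single space. A hypothetical field built that way, carrying the per-token _x suffix that BuildRegularHits strips later:

    // hypothetical pre-tokenized field value: marker bytes + "_x"-suffixed tokens
    const char sField[] = "\xEF\xBF\xBA" "A" "\xE4\xB8\xAD\xE6\x96\x87_x \xE5\x88\x86\xE8\xAF\x8D_x";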
@@ -4791,7 +4938,7 @@ BYTE * CSphTokenizer_UTF8<IS_QUERY>::GetToken ()
m_bTokenBoundary = false;
m_bWasSynonym = false;
- return m_bHasBlend
+ return m_bHasBlend
? DoGetToken<IS_QUERY,true>()
: DoGetToken<IS_QUERY,false>();
}
@@ -5209,6 +5356,152 @@ BYTE * CSphTokenizer_UTF8Ngram<IS_QUERY>::GetToken ()
assert ( m_iNgramLen==1 );
return CSphTokenizer_UTF8<IS_QUERY>::GetToken ();
}
+//////////////////////////////////////////////////////////////////////////
+#if USE_MMSEG
+//////////////////////////////////////////////////////////////////////////
+template < bool IS_QUERY >
+CSphTokenizer_UTF8MMSeg<IS_QUERY>::CSphTokenizer_UTF8MMSeg ()
+ :CSphTokenizer_UTF8<IS_QUERY>()
+ , m_segoffset(0)
+{
+ // override the charmap
+ CSphVector<CSphRemapRange> dRemaps;
+ dRemaps.Add ( CSphRemapRange ( 0x4E00, 0x9FFF, 0x4E00 ) );
+ dRemaps.Add ( CSphRemapRange ( 0xFF10, 0xFFFF, 0xFF10 ) );
+ dRemaps.Add ( CSphRemapRange ( 0x3040, 0x30FF, 0x3040 ) );
+
+ CSphTokenizer_UTF8<IS_QUERY>::m_tLC.AddRemaps ( dRemaps,
+ FLAG_CODEPOINT_NGRAM | FLAG_CODEPOINT_SPECIAL); // !COMMIT support other n-gram lengths than 1
+ m_pAccumSeg = m_sAccumSeg;
+ //m_iLastTokenBufferLen = 0;
+ m_iLastTokenLenMMSeg = 0;
+
+ m_mgr = NULL;
+ m_seg = NULL;
+ m_tokenlens.Reserve(1024*512); // reserve room for 512K token lengths
+}
+
+template < bool IS_QUERY >
+void CSphTokenizer_UTF8MMSeg<IS_QUERY>::SetBuffer ( const BYTE * sBuffer, int iLength )
+{
+ CSphTokenizer_UTF8<IS_QUERY>::SetBuffer(sBuffer, iLength);
+ css::Segmenter* seg = GetSegmenter(m_dictpath.cstr());
+ if(seg)
+ seg->setBuffer((u1*)CSphTokenizer_UTF8MMSeg<IS_QUERY>::m_pBuffer, iLength);
+ else
+ sphDie ( " Tokenizer initialization failure. " );
+ m_segoffset = 0;
+ m_segToken = (char*)CSphTokenizer_UTF8MMSeg<IS_QUERY>::m_pCur;
+
+ m_tokenlens.Reset();
+ m_tokenpos = 0;
+ {
+ u2 len = 0, symlen = 0;
+ while(1){
+ len = 0;
+ char* tok = (char*)seg->peekToken(len,symlen);
+ if(!tok || !*tok || !len)
+ break;
+ seg->popToken(len);
+
+ m_tokenlens.Add(len);
+ //printf("%*.*s/p ",symlen,symlen,tok);
+ }
+ }
+}
+
+template < bool IS_QUERY >
+bool CSphTokenizer_UTF8MMSeg<IS_QUERY>::IsSegment(const BYTE * pCur)
+{
+ // this code might have a bug, but it will be removed in the next release...
+ size_t offset = pCur - CSphTokenizer_UTF8<IS_QUERY>::m_pBuffer;
+ //if(offset == 0) return false;
+ //printf("pcur: %s\n", pCur);
+
+ //css::Segmenter* seg = GetSegmenter(m_dictpath.cstr()); //TODO fill blank here
+ {
+ u2 len = 0, symlen = 0;
+ while(m_segoffset < offset) {
+ //tok = (const char*)seg->peekToken(len, symlen);
+ //seg->popToken(len);
+ len = m_tokenlens[m_tokenpos];
+ m_tokenpos ++;
+ m_segoffset += len;
+ //printf("tok: %*.*s, len=%d\t ",len,len,tok, len);
+ if(m_tokenpos >= m_tokenlens.GetLength() || len==0){
+ //break?
+ break;
+ }
+ }
+ /*
+ printf("\n");
+ printf("seg_off %d vs off %d\n", m_segoffset, offset);
+ if(m_segoffset != offset)
+ printf("seg_pcur: %s\n", pCur);
+ */
+ return (m_segoffset == offset);
+ } //end if seg
+ return true;
+}
+
+template < bool IS_QUERY >
+BYTE * CSphTokenizer_UTF8MMSeg<IS_QUERY>::GetToken ()
+{
+ //return CSphTokenizer_UTF8<IS_QUERY>::GetToken();
+ m_iLastTokenLenMMSeg = 0;
+ //BYTE* tok = CSphTokenizer_UTF8::GetToken();
+ while(!IsSegment(CSphTokenizer_UTF8<IS_QUERY>::m_pCur) || m_pAccumSeg == m_sAccumSeg)
+ {
+ BYTE* tok = CSphTokenizer_UTF8<IS_QUERY>::GetToken();
+ //printf("utf8_token: %s \t ", tok);
+ if(!tok){
+ m_iLastTokenLenMMSeg = 0;
+ return NULL;
+ }
+
+ int token_buf_len = strlen((const char*)tok);
+
+ if(m_pAccumSeg == m_sAccumSeg)
+ m_segToken = (char*)CSphTokenizer_UTF8<IS_QUERY>::m_pTokenStart;
+
+ if ( (m_pAccumSeg - m_sAccumSeg)<SPH_MAX_WORD_LEN ) {
+ ::memcpy(m_pAccumSeg, tok, token_buf_len);
+ m_pAccumSeg += token_buf_len;
+ m_iLastTokenLenMMSeg += CSphTokenizer_UTF8<IS_QUERY>::GetLastTokenLen();
+ }
+ }
+ {
+ *m_pAccumSeg = 0;
+ //m_iLastTokenBufferLen = m_pAccumSeg - m_sAccumSeg;
+ m_pAccumSeg = m_sAccumSeg;
+
+ return m_sAccumSeg;
+ }
+}
+
+template < bool IS_QUERY >
+ISphTokenizer * CSphTokenizer_UTF8MMSeg<IS_QUERY>::Clone ( ESphTokenizerClone eMode ) const
+{
+ CSphTokenizerBase * pClone;
+ if ( eMode!=SPH_CLONE_INDEX ) {
+ pClone = (new CSphTokenizer_UTF8MMSeg<true>())->SetDictPath(m_dictpath.cstr());
+ }else{
+ pClone = (new CSphTokenizer_UTF8MMSeg<false>())->SetDictPath(m_dictpath.cstr());
+ }
+ pClone->CloneBase ( this, eMode );
+ return pClone;
+}
+
+template < bool IS_QUERY >
+const BYTE* CSphTokenizer_UTF8MMSeg<IS_QUERY>::GetThesaurus(BYTE * sBuffer, int iLength )
+{
+ css::Segmenter* seg = GetSegmenter(m_dictpath.cstr());
+ if(seg)
+ return (const BYTE*)seg->thesaurus((const char*)sBuffer, iLength);
+ return NULL;
+}
+
+#endif
//////////////////////////////////////////////////////////////////////////
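To see how GetToken() above reassembles segments: the CJK remap makes the base UTF-8 tokenizer emit one codepoint per token, m_tokenlens holds the byte lengths of the mmseg cuts recorded in SetBuffer(), and the accumulator is flushed whenever the read position lands on a segment boundary (IsSegment). A hypothetical walk-through; the actual cut depends on the dictionary:

    buffer     : 中华人民共和国            (21 bytes of UTF-8)
    mmseg cuts : 中华 | 人民 | 共和国      (m_tokenlens = 6, 6, 9)
    GetToken() : "中华", "人民", "共和国"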
@@ -8539,6 +8832,7 @@ void CSphIndex::SetupQueryTokenizer()
// create and setup a master copy of query time tokenizer
// that we can then use to create lightweight clones
SafeDelete ( m_pQueryTokenizer );
+ m_pTokenizer->ReloadSegDictionary();
m_pQueryTokenizer = m_pTokenizer->Clone ( SPH_CLONE_QUERY );
sphSetupQueryTokenizer ( m_pQueryTokenizer, IsStarDict(), m_tSettings.m_bIndexExactWords );
}
@@ -24721,6 +25015,7 @@ void CSphSource::Setup ( const CSphSourceSettings & tSettings )
m_bIndexExactWords = tSettings.m_bIndexExactWords;
m_iOvershortStep = Min ( Max ( tSettings.m_iOvershortStep, 0 ), 1 );
m_iStopwordStep = Min ( Max ( tSettings.m_iStopwordStep, 0 ), 1 );
+ m_bDebugDump = tSettings.m_bDebugDump; //coreseek: assign debug charset setting
m_bIndexSP = tSettings.m_bIndexSP;
m_dPrefixFields = tSettings.m_dPrefixFields;
m_dInfixFields = tSettings.m_dInfixFields;
@@ -25333,9 +25628,28 @@ void CSphSource_Document::BuildRegularHits ( SphDocID_t uDocid, bool bPayload, b
while ( ( m_iMaxHits==0 || m_tHits.m_dData.GetLength()+BUILD_REGULAR_HITS_COUNT<m_iMaxHits )
&& ( sWord = m_pTokenizer->GetToken() )!=NULL )
{
+ //FIXME: dump token to console --coreseek
+ //debug dump
+ if(m_pTokenizer->DumpToken()) {
+ printf("%s_x ", sWord); // make the same as pre-tokenized text.
+ }
+
+ // fix sWord if in pre-tokenized mode.
+ int iBytes = strlen ( (const char*)sWord );
+ bool bAdvancePos = true;
+ if(m_pTokenizer->IsPreTokenized()) {
+ // m_tState.m_iHitPos should not be 0; this also guards against input that passes a token without the _x suffix at the very beginning.
+ if(sWord[iBytes-1] != 'x' && m_tState.m_iHitPos)
+ bAdvancePos = false; // not a position-advancing token.
+ sWord[iBytes-2] = '\0'; // change token_x -> token\0x
+ iBytes -= 2; // decrease length
+ }
+
+
int iLastBlendedStart = TrackBlendedStart ( m_pTokenizer, iBlendedHitsStart, m_tHits.Length() );
- if ( !bPayload )
+
+ if ( !bPayload && bAdvancePos)
{
HITMAN::AddPos ( &m_tState.m_iHitPos, m_tState.m_iBuildLastStep + m_pTokenizer->GetOvershortCount()*m_iOvershortStep );
if ( m_pTokenizer->GetBoundary() )
@@ -25347,7 +25661,7 @@ void CSphSource_Document::BuildRegularHits ( SphDocID_t uDocid, bool bPayload, b
if ( bGlobalPartialMatch )
{
- int iBytes = strlen ( (const char*)sWord );
+ //int iBytes = strlen ( (const char*)sWord );
memcpy ( sBuf + 1, sWord, iBytes );
sBuf[0] = MAGIC_WORD_HEAD;
sBuf[iBytes+1] = '\0';
@@ -25357,7 +25671,7 @@ void CSphSource_Document::BuildRegularHits ( SphDocID_t uDocid, bool bPayload, b
ESphTokenMorph eMorph = m_pTokenizer->GetTokenMorph();
if ( m_bIndexExactWords && eMorph!=SPH_TOKEN_MORPH_GUESS )
{
- int iBytes = strlen ( (const char*)sWord );
+ //int iBytes = strlen ( (const char*)sWord );
memcpy ( sBuf + 1, sWord, iBytes );
sBuf[0] = MAGIC_WORD_HEAD_NONSTEMMED;
sBuf[iBytes+1] = '\0';
@@ -25395,6 +25709,27 @@ void CSphSource_Document::BuildRegularHits ( SphDocID_t uDocid, bool bPayload, b
m_tHits.AddHit ( uDocid, m_pDict->GetWordIDNonStemmed ( sBuf ), m_tState.m_iHitPos );
} else
m_tState.m_iBuildLastStep = m_iStopwordStep;
+#if USE_MMSEG
+ // works only when mmseg is on.
+ // GetThesaurus applies to zh_cn only
+ {
+ int iBytes = strlen ( (const char*)sWord );
+ const BYTE* tbuf_ptr = m_pTokenizer->GetThesaurus(sWord, iBytes);
+ if(tbuf_ptr) {
+ while(*tbuf_ptr) {
+ size_t len = strlen((const char*)tbuf_ptr);
+ SphWordID_t iWord = m_pDict->GetWordID ( tbuf_ptr ,len , true);
+ if ( iWord ) {
+ m_tHits.AddHit ( uDocid, iWord, m_tState.m_iHitPos );
+ // mmseg: do not increment the step, since we are still within one hit.
+ //m_tState.m_iBuildLastStep = m_pTokenizer->TokenIsBlended() ? 0 : 1;
+ }
+ tbuf_ptr += len + 1; //move to the next entry
+ }
+ }
+ //end if buf
+ }//end GetThesaurus
+#endif
}
m_tState.m_bProcessingHits = ( sWord!=NULL );
diff --git a/src/sphinx.h b/src/sphinx.h
index 8d033f6..a22ed81 100644
--- a/src/sphinx.h
+++ b/src/sphinx.h
@@ -28,6 +28,7 @@
#define USE_RE2 0 /// whether to compile RE2 support
#define USE_RLP 0 /// whether to compile RLP support
#define USE_WINDOWS 1 /// whether to compile for Windows
+ #define USE_MMSEG 1 /// whether to compile mmseg support
#define USE_SYSLOG 0 /// whether to use syslog for logging
#define HAVE_STRNLEN 1
@@ -212,7 +213,7 @@ inline const DWORD * STATIC2DOCINFO ( const DWORD * pAttrs ) { return STATIC2DOC
#define SPHINX_VERSION_NUMBERS "2.3.2"
#define SPHINX_VERSION SPHINX_VERSION_NUMBERS SPHINX_BITS_TAG SPHINX_TAG " (" SPH_GIT_COMMIT_ID ")"
-#define SPHINX_BANNER "Sphinx " SPHINX_VERSION "\nCopyright (c) 2001-2016, Andrew Aksyonoff\nCopyright (c) 2008-2016, Sphinx Technologies Inc (http://sphinxsearch.com)\n\n"
+#define SPHINX_BANNER_ORIG "Sphinx " SPHINX_VERSION "\nCopyright (c) 2001-2016, Andrew Aksyonoff\nCopyright (c) 2008-2016, Sphinx Technologies Inc (http://sphinxsearch.com)\n\n"
#define SPHINX_SEARCHD_PROTO 1
#define SPHINX_CLIENT_VERSION 1
@@ -220,6 +221,10 @@ inline const DWORD * STATIC2DOCINFO ( const DWORD * pAttrs ) { return STATIC2DOC
#define SPH_MAX_FILENAME_LEN 512
#define SPH_MAX_FIELDS 256
+#define CORESEEK_BANNER "Coreseek FullText Search 5.1 \nCopyright (c) 2008-2015, Beijing Choice Software Technologies Inc (http://www.coreseek.com)\n\n"
+#define SPHINX_BANNER2 "" CORESEEK_BANNER "" SPHINX_BANNER_ORIG
+#define SPHINX_BANNER SPHINX_BANNER2
+
/////////////////////////////////////////////////////////////////////////////
extern int64_t g_iIndexerCurrentDocID;
@@ -497,7 +502,10 @@ struct CSphTokenizerSettings
CSphString m_sBlendChars;
CSphString m_sBlendMode;
CSphString m_sIndexingPlugin; ///< this tokenizer wants an external plugin to process its raw output
-
+ int m_iDebug; ///< whether the tokenizer is in debug mode.
+#if USE_MMSEG
+ CSphString m_sDictPath; ///< coreseek: where to find the segmenter's dictionary.
+#endif
CSphTokenizerSettings ();
};
@@ -597,11 +605,16 @@ public:
/// get synonym file info
virtual const CSphSavedFile & GetSynFileInfo () const { return m_tSynFileInfo; }
+ /// whether to dump the tokenizer's output for debugging --coreseek mmseg
+ virtual int DumpToken () { return m_tSettings.m_iDebug; }
public:
/// pass next buffer
virtual void SetBuffer ( const BYTE * sBuffer, int iLength ) = 0;
+ /// whether the current buffer is pre-tokenized --coreseek
+ virtual bool IsPreTokenized() { return false; }
+
/// set current index schema (only intended for the token filter plugins)
virtual bool SetFilterSchema ( const CSphSchema &, CSphString & ) { return true; }
@@ -679,6 +692,10 @@ public:
/// set new buffer ptr (must be within current bounds)
virtual void SetBufferPtr ( const char * sNewPtr ) = 0;
+#if USE_MMSEG
+ virtual const BYTE* GetThesaurus(BYTE * , int ) { return NULL; }
+ virtual void ReloadSegDictionary() { return; } // reload mmseg's dictionary.
+#endif
/// get settings hash
virtual uint64_t GetSettingsFNV () const;
@@ -701,6 +718,9 @@ protected:
CSphLowercaser m_tLC; ///< my lowercaser
int m_iLastTokenLen; ///< last token length, in codepoints
bool m_bTokenBoundary; ///< last token boundary flag (true after boundary codepoint followed by separator)
+#if USE_MMSEG
+ int m_iLastTokenBufferLen; ///< last token buffer length, in bytes -- coreseek; used by the mmseg patch.
+#endif
bool m_bBoundary; ///< boundary flag (true immediately after boundary codepoint)
int m_iBoundaryOffset; ///< boundary character offset (in bytes)
bool m_bWasSpecial; ///< special token flag
@@ -1820,6 +1840,7 @@ struct CSphSourceSettings
int m_iStopwordStep; ///< position step on stopword token (default is 1)
bool m_bIndexSP; ///< whether to index sentence and paragraph delimiters
bool m_bIndexFieldLens; ///< whether to index field lengths
+ int m_bDebugDump; ///< mmseg charset debug output feature
CSphVector<CSphString> m_dPrefixFields; ///< list of prefix fields
CSphVector<CSphString> m_dInfixFields; ///< list of infix fields
diff --git a/src/sphinxutils.cpp b/src/sphinxutils.cpp
index 7d975c6..7528970 100644
--- a/src/sphinxutils.cpp
+++ b/src/sphinxutils.cpp
@@ -529,6 +529,8 @@ static KeyDesc_t g_dKeysIndex[] =
{ "min_word_len", 0, NULL },
{ "charset_type", KEY_REMOVED, NULL },
{ "charset_table", 0, NULL },
+ { "charset_dictpath", 0, NULL }, //coreseek: mmseg's dictionary path
+ { "charset_debug", 0, NULL }, //coreseek: debug output tokens
{ "ignore_chars", 0, NULL },
{ "min_prefix_len", 0, NULL },
{ "min_infix_len", 0, NULL },
@@ -1267,7 +1269,10 @@ void sphConfTokenizer ( const CSphConfigSection & hIndex, CSphTokenizerSettings
{
tSettings.m_iNgramLen = Max ( hIndex.GetInt ( "ngram_len" ), 0 );
- if ( hIndex ( "ngram_chars" ) )
+ if(hIndex("charset_debug"))
+ tSettings.m_iDebug = hIndex["charset_debug"].intval();
+
+ if ( hIndex ( "ngram_chars" ) )
{
if ( tSettings.m_iNgramLen )
tSettings.m_iType = TOKENIZER_NGRAM;
@@ -1275,6 +1280,15 @@ void sphConfTokenizer ( const CSphConfigSection & hIndex, CSphTokenizerSettings
sphWarning ( "ngram_chars specified, but ngram_len=0; IGNORED" );
}
+#if USE_MMSEG
+ //XXX: FIXME: sphinx changed the tokenizer creation process
+ if (hIndex("charset_dictpath") && CSphString(hIndex.GetStr("charset_type")) =="zh_cn.utf-8" )
+ {
+ tSettings.m_sDictPath = hIndex.GetStr("charset_dictpath");
+ tSettings.m_iType = TOKENIZER_ZHCN_UTF8;
+ }
+#endif
+
tSettings.m_sCaseFolding = hIndex.GetStr ( "charset_table" );
tSettings.m_iMinWordLen = Max ( hIndex.GetInt ( "min_word_len", 1 ), 1 );
tSettings.m_sNgramChars = hIndex.GetStr ( "ngram_chars" );
@@ -1408,6 +1422,7 @@ bool sphConfIndex ( const CSphConfigSection & hIndex, CSphIndexSettings & tSetti
tSettings.m_iEmbeddedLimit = hIndex.GetSize ( "embedded_limit", 16384 );
tSettings.m_bIndexFieldLens = hIndex.GetInt ( "index_field_lengths" )!=0;
tSettings.m_sIndexTokenFilter = hIndex.GetStr ( "index_token_filter" );
+ tSettings.m_bDebugDump = hIndex.GetInt ( "charset_debug" )!=0;
// prefix/infix fields
CSphString sFields;
@@ -1715,12 +1730,12 @@ const char * sphLoadConfig ( const char * sOptConfig, bool bQuiet, CSphConfigPar
while ( !sOptConfig )
{
#ifdef SYSCONFDIR
- sOptConfig = SYSCONFDIR "/sphinx.conf";
+ sOptConfig = SYSCONFDIR "/csft.conf";
if ( sphIsReadable ( sOptConfig ) )
break;
#endif
- sOptConfig = "./sphinx.conf";
+ sOptConfig = "./csft.conf";
if ( sphIsReadable ( sOptConfig ) )
break;
@@ -1731,9 +1746,9 @@ const char * sphLoadConfig ( const char * sOptConfig, bool bQuiet, CSphConfigPar
if ( !sOptConfig )
sphDie ( "no readable config file (looked in "
#ifdef SYSCONFDIR
- SYSCONFDIR "/sphinx.conf, "
+ SYSCONFDIR "/csft.conf, "
#endif
- "./sphinx.conf)" );
+ "./csft.conf)" );
if ( !bQuiet )
fprintf ( stdout, "using config file '%s'...\n", sOptConfig );
diff --git a/src/sphinxutils.h b/src/sphinxutils.h
index 776386c..1221a82 100644
--- a/src/sphinxutils.h
+++ b/src/sphinxutils.h
@@ -147,6 +147,7 @@ enum
// where was TOKENIZER_SBCS=1 once
TOKENIZER_UTF8 = 2,
TOKENIZER_NGRAM = 3
+ , TOKENIZER_ZHCN_UTF8 = 4
};
/// load config file