@fffonion
Last active September 22, 2017 03:11
Sphinx 2.2 and 2.3 with the mmseg (coreseek) patch: https://yooooo.us/2017/sphinx-with-mmseg-based-on-coreseek
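
This gist carries the same patch twice: the first diff targets Sphinx 2.2.11, the second Sphinx 2.3.1. In both cases it adds libmmseg-based Chinese word segmentation in the coreseek style: an AC_CHECK_MMSEG autoconf macro with a --with-mmseg configure switch, a new TOKENIZER_ZHCN_UTF8 tokenizer type backed by CSphTokenizer_UTF8MMSeg, recognition of pre-tokenized input buffers, thesaurus (synonym) expansion while building hits, the charset_dictpath / charset_debug index options, and csft.conf replacing sphinx.conf as the default config file name.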
diff --git a/.gitignore b/.gitignore
index f5be264..518fcc3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -64,3 +64,10 @@
/test/ql/data/*.lock
/test/ql/*.class
/test/ql/*.exe
+
+# for qt-creator
+/*.user
+
+# for patch
+*.rej
+*.orig
diff --git a/acinclude.m4 b/acinclude.m4
index e09697e..3ae78b0 100644
--- a/acinclude.m4
+++ b/acinclude.m4
@@ -297,6 +297,95 @@ ERROR: cannot find PostgreSQL libraries. If you want to compile with PosgregSQL
fi
])
+dnl ---------------------------------------------------------------------------
+dnl Macro: AC_CHECK_MMSEG
+dnl ---------------------------------------------------------------------------
+
+AC_DEFUN([AC_CHECK_MMSEG],[
+
+if test [ -z "$MMSEG_CFLAGS" ]
+then
+ for CANDIDATE in "$user_mmseg_includes" "/usr/local/include/mmseg" "/usr/include/mmseg"
+ do
+ if test [ -n "$CANDIDATE" -a -r "$CANDIDATE/Segmenter.h" ]
+ then
+ MMSEG_CFLAGS="-I$CANDIDATE"
+ break
+ fi
+ done
+fi
+
+# explicit overrides will be applied later
+if test [ -z "$MMSEG_LIBS" ]
+then
+ for CANDIDATE in "$user_mmseg_libs" "/usr/lib64" \
+ "/usr/local/lib" "/usr/local/mmseg/lib" \
+ "/usr/local/lib/mmseg" "/usr/lib" \
+ "/opt/mmseg/lib"
+ do
+ if test [ -n "$CANDIDATE" -a -d "$CANDIDATE" ]
+ then
+ MMSEG_LIBS="-L$CANDIDATE -lmmseg"
+ break
+ fi
+ done
+fi
+
+# apply explicit include path overrides
+AC_ARG_WITH([mmseg-includes],
+ AC_HELP_STRING([--with-mmseg-includes], [path to libmmseg header files]),
+ [ac_cv_mmseg_includes=$withval])
+if test [ -n "$ac_cv_mmseg_includes" ]
+then
+ MMSEG_CFLAGS="-I$ac_cv_mmseg_includes"
+fi
+
+
+# apply explicit lib path overrides
+AC_ARG_WITH([mmseg-libs],
+ AC_HELP_STRING([--with-mmseg-libs], [path to libmmseg libraries]),
+ [ac_cv_mmseg_libs=$withval])
+if test [ -n "$ac_cv_mmseg_libs" ]
+then
+ # Trim trailing '.libs' if the user passed it in the --with-mmseg-libs option
+ ac_cv_mmseg_libs=`echo ${ac_cv_mmseg_libs} | sed -e 's/.libs$//' \
+ -e 's+.libs/$++'`
+ MMSEG_LIBS="-L$ac_cv_mmseg_libs -lmmseg"
+fi
+
+# now that we did all we could, perform final checks
+AC_MSG_CHECKING([libmmseg include files])
+if test [ -z "$MMSEG_CFLAGS" ]
+then
+ AC_MSG_ERROR([missing include files.
+
+******************************************************************************
+ERROR: cannot find libmmseg include files.
+
+To disable libmmseg support, use --without-mmseg option.
+******************************************************************************
+])
+else
+ AC_MSG_RESULT([$MMSEG_CFLAGS])
+fi
+
+AC_MSG_CHECKING([libmmseg libraries])
+if test [ -z "$MMSEG_LIBS" ]
+then
+ AC_MSG_ERROR([missing libraries.
+
+******************************************************************************
+ERROR: cannot find libmmseg libraries.
+
+To disable libmmseg support, use --without-mmseg option.
+******************************************************************************
+])
+else
+ AC_MSG_RESULT([$MMSEG_LIBS])
+fi
+
+])
+
dnl ---------------------------------------------------------------------------
dnl Macro: AC_CHECK_LIBSTEMMER
dnl Check the libstemmer first in custom include path in --with-libstemmer=*
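
The AC_CHECK_MMSEG macro above mirrors the existing MySQL/PostgreSQL detection: it probes a few conventional locations for Segmenter.h and libmmseg, then applies explicit overrides, e.g. ./configure --with-mmseg-includes=/usr/local/include/mmseg --with-mmseg-libs=/usr/local/lib. Passing --without-mmseg skips the checks entirely.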
diff --git a/configure.ac b/configure.ac
index 643f5ca..e9a961b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -66,6 +66,7 @@ fi
AC_PROG_CC
AC_PROG_CXX
+AM_PROG_AR
AC_PROG_RANLIB
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
@@ -426,6 +427,24 @@ else
fi
AM_CONDITIONAL(USE_PGSQL, test x$ac_cv_use_pgsql != xno -o x$ac_cv_use_satic_pgsql != xno )
+dnl ---
+# check if we should compile with libmmseg (mmseg Chinese segmenter) support
+AC_ARG_WITH([mmseg],
+ AC_HELP_STRING([--with-mmseg], [compile with libmmseg, a mmseg Chinese Segmenter support (default is enabled)]),
+ [ac_cv_use_mmseg=$withval], [ac_cv_use_mmseg=yes]
+)
+AC_MSG_CHECKING([whether to compile with libmmseg support])
+if test x$ac_cv_use_mmseg != xno; then
+ AC_MSG_RESULT([yes])
+ AC_CHECK_MMSEG([$ac_cv_use_mmseg])
+ AC_DEFINE(USE_MMSEG,1,[Define to 1 if you want to compile with libmmseg support])
+ AC_SUBST([MMSEG_LIBS])
+ AC_SUBST([MMSEG_CFLAGS])
+else
+ AC_MSG_RESULT([no])
+fi
+AM_CONDITIONAL(USE_MMSEG, test x$ac_cv_use_mmseg != xno)
+
# add macports include directory
if (echo $MYSQL_LIBS | grep -q -- -L/opt/local/lib); then
MYSQL_CFLAGS="$MYSQL_CFLAGS -I/opt/local/include"
@@ -480,7 +499,7 @@ AM_CONDITIONAL(USE_INTERNAL_LIBSTEMMER, test x$ac_cv_use_internal_libstemmer !=
dnl ---
# we can now set preprocessor flags for both C and C++ compilers
-CPPFLAGS="$CPPFLAGS $MYSQL_CFLAGS $PGSQL_CFLAGS $LIBSTEMMER_CFLAGS"
+CPPFLAGS="$CPPFLAGS $MYSQL_CFLAGS $PGSQL_CFLAGS $LIBSTEMMER_CFLAGS $MMSEG_CFLAGS"
AC_ARG_WITH([libexpat],
diff --git a/libstemmer_c/Makefile.am b/libstemmer_c/Makefile.am
index a973921..fb93b5f 100644
--- a/libstemmer_c/Makefile.am
+++ b/libstemmer_c/Makefile.am
@@ -1,3 +1,4 @@
+AUTOMAKE_OPTIONS = subdir-objects
if USE_LIBSTEMMER
noinst_LIBRARIES = libstemmer.a
include $(srcdir)/mkinc.mak
diff --git a/src/Makefile.am b/src/Makefile.am
index 048a112..9197000 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -30,5 +30,9 @@ RLP_INC =
endif
AM_CPPFLAGS = $(LIBRE2_CFLAGS) $(RLP_INC) -DSYSCONFDIR="\"$(sysconfdir)\"" -DDATADIR="\"$(localstatedir)/data\""
+if USE_MMSEG
+COMMON_LIBS = libsphinx.a $(LIBSTEMMER_LIBS) $(MYSQL_LIBS) $(PGSQL_LIBS) $(LIBRE2_LIBS) $(RLP_LIBS) $(MMSEG_LIBS)
+else
COMMON_LIBS = libsphinx.a $(LIBSTEMMER_LIBS) $(MYSQL_LIBS) $(PGSQL_LIBS) $(LIBRE2_LIBS) $(RLP_LIBS)
+endif
LDADD = $(COMMON_LIBS)
diff --git a/src/indexer.cpp b/src/indexer.cpp
index 7f294f6..7ba641e 100644
--- a/src/indexer.cpp
+++ b/src/indexer.cpp
@@ -1768,7 +1768,7 @@ int main ( int argc, char ** argv )
"\n"
"Options are:\n"
"--config <file>\t\tread configuration from specified file\n"
- "\t\t\t(default is sphinx.conf)\n"
+ "\t\t\t(default is csft.conf)\n"
"--all\t\t\treindex all configured indexes\n"
"--quiet\t\t\tbe quiet, only print errors\n"
"--verbose\t\tverbose indexing issues report\n"
@@ -1797,8 +1797,8 @@ int main ( int argc, char ** argv )
"--keep-attrs\t\tretain attributes from the old index"
"\n"
"Examples:\n"
- "indexer --quiet myidx1\treindex 'myidx1' defined in 'sphinx.conf'\n"
- "indexer --all\t\treindex all indexes defined in 'sphinx.conf'\n" );
+ "indexer --quiet myidx1\treindex 'myidx1' defined in 'csft.conf'\n"
+ "indexer --all\t\treindex all indexes defined in 'csft.conf'\n" );
}
return 1;
diff --git a/src/searchd.cpp b/src/searchd.cpp
index 85b1cd6..28ef919 100644
--- a/src/searchd.cpp
+++ b/src/searchd.cpp
@@ -22030,7 +22030,7 @@ void ShowHelp ()
"Options are:\n"
"-h, --help\t\tdisplay this help message\n"
"-c, --config <file>\tread configuration from specified file\n"
- "\t\t\t(default is sphinx.conf)\n"
+ "\t\t\t(default is csft.conf)\n"
"--stop\t\t\tsend SIGTERM to currently running searchd\n"
"--stopwait\t\tsend SIGTERM and wait until actual exit\n"
"--status\t\tget ant print status variables\n"
@@ -22067,9 +22067,9 @@ void ShowHelp ()
"--safetrace\t\tonly use system backtrace() call in crash reports\n"
"\n"
"Examples:\n"
- "searchd --config /usr/local/sphinx/etc/sphinx.conf\n"
+ "searchd --config /usr/local/sphinx/etc/csft.conf\n"
#if USE_WINDOWS
- "searchd --install --config c:\\sphinx\\sphinx.conf\n"
+ "searchd --install --config c:\\sphinx\\csft.conf\n"
#endif
);
}
@@ -23338,12 +23338,12 @@ int WINAPI ServiceMain ( int argc, char **argv )
while ( !g_sConfigFile.cstr() )
{
#ifdef SYSCONFDIR
- g_sConfigFile = SYSCONFDIR "/sphinx.conf";
+ g_sConfigFile = SYSCONFDIR "/csft.conf";
if ( sphIsReadable ( g_sConfigFile.cstr () ) )
break;
#endif
- g_sConfigFile = "./sphinx.conf";
+ g_sConfigFile = "./";
if ( sphIsReadable ( g_sConfigFile.cstr () ) )
break;
@@ -23354,9 +23354,9 @@ int WINAPI ServiceMain ( int argc, char **argv )
if ( !g_sConfigFile.cstr () )
sphFatal ( "no readable config file (looked in "
#ifdef SYSCONFDIR
- SYSCONFDIR "/sphinx.conf, "
+ SYSCONFDIR "/csft.conf, "
#endif
- "./sphinx.conf)." );
+ "./csft.conf)." );
sphInfo ( "using config file '%s'...", g_sConfigFile.cstr () );
diff --git a/src/sphinx.cpp b/src/sphinx.cpp
index d6a7b9d..73e92b0 100644
--- a/src/sphinx.cpp
+++ b/src/sphinx.cpp
@@ -143,6 +143,16 @@
#pragma message("Automatically linking with btutils.lib")
#endif
+#if ( USE_WINDOWS && USE_MMSEG )
+ #if _DEBUG
+ #pragma comment(linker, "/defaultlib:libcss_d.lib")
+ #else
+ #pragma comment(linker, "/defaultlib:libcss.lib")
+ #endif
+ #pragma message("Automatically linking with libcss.lib")
+ #pragma warning(disable:4530) // for ugly mmseg
+#endif
+
/////////////////////////////////////////////////////////////////////////////
// logf() is not there sometimes (eg. Solaris 9)
@@ -2550,10 +2560,14 @@ class CSphTokenizer_UTF8 : public CSphTokenizerBase2
public:
CSphTokenizer_UTF8 ();
virtual void SetBuffer ( const BYTE * sBuffer, int iLength );
+ virtual bool IsPreTokenized() { return m_bPreTokenized; }
virtual BYTE * GetToken ();
virtual ISphTokenizer * Clone ( ESphTokenizerClone eMode ) const;
virtual int GetCodepointLength ( int iCode ) const;
virtual int GetMaxCodepointLength () const { return m_tLC.GetMaxCodepointLength(); }
+
+protected:
+ bool m_bPreTokenized;
};
@@ -2574,6 +2588,78 @@ protected:
CSphString m_sNgramCharsStr;
};
+#if USE_MMSEG
+
+#include "SegmenterManager.h"
+#include "Segmenter.h"
+
+typedef CSR_Singleton<css::SegmenterManager> SegmenterManagerSingleInstance;
+
+template < bool IS_QUERY >
+class CSphTokenizer_UTF8MMSeg : public CSphTokenizer_UTF8<IS_QUERY>
+{
+public:
+ CSphTokenizer_UTF8MMSeg ();
+ ~CSphTokenizer_UTF8MMSeg() {
+ if(m_seg){
+ SafeDelete ( m_seg );
+ }
+ }
+
+ virtual void SetBuffer ( const BYTE * sBuffer, int iLength );
+ virtual BYTE * GetToken ();
+ virtual ISphTokenizer * Clone ( ESphTokenizerClone eMode ) const;
+ virtual const BYTE* GetThesaurus(BYTE * sBuffer, int iLength );
+ bool IsSegment(const BYTE * pCur);
+
+ CSphTokenizerBase* SetDictPath(const char* path) { m_dictpath = path; return this; }
+
+ virtual const char * GetBufferPtr () const { return (const char *) CSphTokenizer_UTF8MMSeg<IS_QUERY>::m_pCur; }
+
+ virtual const char * GetTokenStart () const { return m_segToken; }
+
+ virtual int GetLastTokenLen () const { return m_iLastTokenLenMMSeg; }
+
+ virtual void ReloadSegDictionary() {
+ if(m_seg){
+ SafeDelete ( m_seg );
+ }
+ m_seg = NULL;
+
+ if(m_mgr) {
+ SegmenterManagerSingleInstance::Free(); // free preexist instance.
+ m_mgr = NULL;
+ }
+ }
+protected:
+ char* m_segToken;
+ size_t m_segoffset;
+ int m_iLastTokenLenMMSeg;
+ BYTE m_sAccumSeg [ 3*SPH_MAX_WORD_LEN+3 ]; ///< folded token accumulator
+ BYTE * m_pAccumSeg; ///< current accumulator position
+ CSphVector<u2> m_tokenlens;
+ int m_tokenpos;
+protected:
+ // virtual bool IsSegment(const BYTE * pCur);
+ CSphString m_dictpath;
+
+ // mmseg related
+ css::Segmenter* m_seg;
+ css::SegmenterManager* m_mgr;
+ css::Segmenter* GetSegmenter(const char* dict_path){
+ int nRet = 0;
+ if(!m_mgr) {
+ m_mgr = SegmenterManagerSingleInstance::Get();
+ if(dict_path)
+ nRet = m_mgr->init(dict_path);
+ }
+ if(nRet == 0 && !m_seg)
+ m_seg = m_mgr->getSegmenter(false);
+ return m_seg;
+ }
+};
+
+#endif
struct CSphNormalForm
{
@@ -3794,6 +3880,15 @@ ISphTokenizer * sphCreateUTF8NgramTokenizer ()
return new CSphTokenizer_UTF8Ngram<false> ();
}
+#if USE_MMSEG
+ISphTokenizer * sphCreateUTF8ChineseTokenizer ( const char* dict_path )
+{
+ CSphTokenizer_UTF8MMSeg<false>* tokenizer = new CSphTokenizer_UTF8MMSeg<false> ();
+ tokenizer->SetDictPath(dict_path);
+ return tokenizer;
+}
+#endif
+
/////////////////////////////////////////////////////////////////////////////
enum
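
For orientation, a minimal sketch of driving the factory added above (the dictionary path and sample text are illustrative, not part of the patch; assumes sphinx.h, <cstdio> and <cstring> are available and that the factory is declared where callers can see it):

    // Hedged sketch: exercise the mmseg tokenizer through the public ISphTokenizer API.
    ISphTokenizer * pTok = sphCreateUTF8ChineseTokenizer ( "/usr/local/mmseg3/etc" ); // hypothetical dict dir
    const char * sText = "..."; // some UTF-8 Chinese text
    pTok->SetBuffer ( (const BYTE *)sText, (int)strlen ( sText ) );
    for ( BYTE * sTok = pTok->GetToken(); sTok!=NULL; sTok = pTok->GetToken() )
        printf ( "%s\n", sTok ); // one mmseg segment per line
    SafeDelete ( pTok );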
@@ -4379,6 +4474,7 @@ CSphTokenizerSettings::CSphTokenizerSettings ()
: m_iType ( TOKENIZER_UTF8 )
, m_iMinWordLen ( 1 )
, m_iNgramLen ( 0 )
+ , m_iDebug ( 0 )
{
}
@@ -4390,7 +4486,11 @@ bool LoadTokenizerSettings ( CSphReader & tReader, CSphTokenizerSettings & tSett
return true;
tSettings.m_iType = tReader.GetByte ();
- if ( tSettings.m_iType!=TOKENIZER_UTF8 && tSettings.m_iType!=TOKENIZER_NGRAM )
+#if USE_MMSEG
+ if ( tSettings.m_iType!=TOKENIZER_UTF8 && tSettings.m_iType!=TOKENIZER_NGRAM && tSettings.m_iType!=TOKENIZER_ZHCN_UTF8)
+#else
+ if ( tSettings.m_iType!=TOKENIZER_UTF8 && tSettings.m_iType!=TOKENIZER_NGRAM)
+#endif
{
sWarning = "can't load an old index with SBCS tokenizer";
return false;
@@ -4418,7 +4518,11 @@ bool LoadTokenizerSettings ( CSphReader & tReader, CSphTokenizerSettings & tSett
tSettings.m_sIgnoreChars = tReader.GetString ();
tSettings.m_iNgramLen = tReader.GetDword ();
tSettings.m_sNgramChars = tReader.GetString ();
- if ( uVersion>=15 )
+#if USE_MMSEG
+ //mmseg --coreseek: the mmseg option makes coreseek and sphinx index formats incompatible.
+ tSettings.m_sDictPath = tReader.GetString ();
+#endif
+ if ( uVersion>=15 )
tSettings.m_sBlendChars = tReader.GetString ();
if ( uVersion>=24 )
tSettings.m_sBlendMode = tReader.GetString();
@@ -4449,6 +4553,10 @@ void SaveTokenizerSettings ( CSphWriter & tWriter, ISphTokenizer * pTokenizer, i
tWriter.PutString ( tSettings.m_sIgnoreChars.cstr () );
tWriter.PutDword ( tSettings.m_iNgramLen );
tWriter.PutString ( tSettings.m_sNgramChars.cstr () );
+#if USE_MMSEG
+ // if mmseg is turned off, the index format is compatible again.
+ tWriter.PutString ( tSettings.m_sDictPath.cstr () );
+#endif
tWriter.PutString ( tSettings.m_sBlendChars.cstr () );
tWriter.PutString ( tSettings.m_sBlendMode.cstr () );
}
@@ -4723,6 +4831,9 @@ ISphTokenizer * ISphTokenizer::Create ( const CSphTokenizerSettings & tSettings,
{
case TOKENIZER_UTF8: pTokenizer = sphCreateUTF8Tokenizer (); break;
case TOKENIZER_NGRAM: pTokenizer = sphCreateUTF8NgramTokenizer (); break;
+#if USE_MMSEG
+ case TOKENIZER_ZHCN_UTF8: pTokenizer = sphCreateUTF8ChineseTokenizer(tSettings.m_sDictPath.cstr()); break;
+#endif
default:
sError.SetSprintf ( "failed to create tokenizer (unknown charset type '%d')", tSettings.m_iType );
return NULL;
@@ -5968,7 +6079,24 @@ CSphTokenizer_UTF8<IS_QUERY>::CSphTokenizer_UTF8 ()
{
CSphString sTmp;
SetCaseFolding ( SPHINX_DEFAULT_UTF8_TABLE, sTmp );
- m_bHasBlend = false;
+
+ // BEGIN CJK: no case folding is applied here; that should be done in the remote tokenizer.
+ // Here we just make sure CJK characters are retained. --coreseek
+ // 4e00 - 9fff CJK unified ideographs
+ // 3000 - 303f CJK symbols and punctuation
+ // 3040 - 30ff Hiragana/Katakana
+ // ff00 - ffff half/fullwidth forms
+ CSphVector<CSphRemapRange> dRemaps;
+ dRemaps.Add ( CSphRemapRange ( 0x4E00, 0x9FFF, 0x4E00 ) );
+ dRemaps.Add ( CSphRemapRange ( 0xFF10, 0xFFFF, 0xFF10 ) );
+ dRemaps.Add ( CSphRemapRange ( 0x3040, 0x30FF, 0x3040 ) );
+
+ m_tLC.AddRemaps ( dRemaps,
+ FLAG_CODEPOINT_NGRAM ); // !COMMIT support other n-gram lengths than 1
+ // ENDCJK
+ m_bPreTokenized = false; // by default use original route.
+
+ m_bHasBlend = false;
}
@@ -5978,10 +6106,29 @@ void CSphTokenizer_UTF8<IS_QUERY>::SetBuffer ( const BYTE * sBuffer, int iLength
// check that old one is over and that new length is sane
assert ( iLength>=0 );
- // set buffer
+ // set buffer
m_pBuffer = sBuffer;
+ // check whether this is a pre-segmented buffer, marked by the prefix U+FFFA
+ // if so, the buffer starts with 0xFFFA, 0x41, then the content --coreseek
+ m_bPreTokenized = false;
+ if(iLength > 4)
+ {
+ // there is a ' ' (space, 32) as padding. might not true
+ unsigned char mask[] = {32, 239, 191, 186, 65};
+ unsigned char mask_bare[] = {239, 191, 186, 65};
+ if(strncmp( (const char *)mask, (const char *)sBuffer, 5) == 0) {
+ // 0xFFFA is a magic marker; if it heads the buffer, mark the buffer pre-tokenized.
+ m_bPreTokenized = true;
+ m_pBuffer += 5;
+ }else
+ if(strncmp( (const char *)mask_bare, (const char *)sBuffer, 4) == 0) {
+ m_bPreTokenized = true;
+ m_pBuffer += 4;
+ }
+ }
+
m_pBufferMax = sBuffer + iLength;
- m_pCur = sBuffer;
+ m_pCur = m_pBuffer;
m_pTokenStart = m_pTokenEnd = NULL;
m_pBlendStart = m_pBlendEnd = NULL;
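
The hunk above teaches SetBuffer to recognize pre-segmented input: the marker is the UTF-8 encoding of U+FFFA (bytes EF BF BA, i.e. 239 191 186) followed by 0x41 ('A'), optionally preceded by a single space. A hedged sketch of producing such a buffer (token text is invented; pTokenizer is an assumed pointer to any tokenizer derived from CSphTokenizer_UTF8; the trailing "_x" suffix is what BuildRegularHits strips later in this patch):

    // Sketch only; assumes <string>. The string split after "\xBA" keeps the
    // hex escape from swallowing the following 'A'.
    std::string sDoc = "\xEF\xBF\xBA" "A";      // U+FFFA magic + 'A', per mask_bare above
    sDoc += "token1_x token2_x";                // pre-segmented tokens, each tagged _x
    pTokenizer->SetBuffer ( (const BYTE *)sDoc.c_str(), (int)sDoc.size() );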
@@ -5999,7 +6146,7 @@ BYTE * CSphTokenizer_UTF8<IS_QUERY>::GetToken ()
m_bTokenBoundary = false;
m_bWasSynonym = false;
- return m_bHasBlend
+ return m_bHasBlend
? DoGetToken<IS_QUERY,true>()
: DoGetToken<IS_QUERY,false>();
}
@@ -6414,6 +6561,152 @@ BYTE * CSphTokenizer_UTF8Ngram<IS_QUERY>::GetToken ()
assert ( m_iNgramLen==1 );
return CSphTokenizer_UTF8<IS_QUERY>::GetToken ();
}
+//////////////////////////////////////////////////////////////////////////
+#if USE_MMSEG
+//////////////////////////////////////////////////////////////////////////
+template < bool IS_QUERY >
+CSphTokenizer_UTF8MMSeg<IS_QUERY>::CSphTokenizer_UTF8MMSeg ()
+ :CSphTokenizer_UTF8<IS_QUERY>()
+ , m_segoffset(0)
+{
+ //override charmap
+ CSphVector<CSphRemapRange> dRemaps;
+ dRemaps.Add ( CSphRemapRange ( 0x4E00, 0x9FFF, 0x4E00 ) );
+ dRemaps.Add ( CSphRemapRange ( 0xFF10, 0xFFFF, 0xFF10 ) );
+ dRemaps.Add ( CSphRemapRange ( 0x3040, 0x30FF, 0x3040 ) );
+
+ CSphTokenizer_UTF8<IS_QUERY>::m_tLC.AddRemaps ( dRemaps,
+ FLAG_CODEPOINT_NGRAM | FLAG_CODEPOINT_SPECIAL); // !COMMIT support other n-gram lengths than 1
+ m_pAccumSeg = m_sAccumSeg;
+ //m_iLastTokenBufferLen = 0;
+ m_iLastTokenLenMMSeg = 0;
+
+ m_mgr = NULL;
+ m_seg = NULL;
+ m_tokenlens.Reserve(1024*512); // reserve room for 512K token lengths
+}
+
+template < bool IS_QUERY >
+void CSphTokenizer_UTF8MMSeg<IS_QUERY>::SetBuffer ( const BYTE * sBuffer, int iLength )
+{
+ CSphTokenizer_UTF8<IS_QUERY>::SetBuffer(sBuffer, iLength);
+ css::Segmenter* seg = GetSegmenter(m_dictpath.cstr());
+ if(seg)
+ seg->setBuffer((u1*)CSphTokenizer_UTF8MMSeg<IS_QUERY>::m_pBuffer, iLength);
+ else
+ sphDie ( " Tokenizer initialization failure. " );
+ m_segoffset = 0;
+ m_segToken = (char*)CSphTokenizer_UTF8MMSeg<IS_QUERY>::m_pCur;
+
+ m_tokenlens.Reset();
+ m_tokenpos = 0;
+ {
+ u2 len = 0, symlen = 0;
+ while(1){
+ len = 0;
+ char* tok = (char*)seg->peekToken(len,symlen);
+ if(!tok || !*tok || !len)
+ break;
+ seg->popToken(len);
+
+ m_tokenlens.Add(len);
+ //printf("%*.*s/p ",symlen,symlen,tok);
+ }
+ }
+}
+
+template < bool IS_QUERY >
+bool CSphTokenizer_UTF8MMSeg<IS_QUERY>::IsSegment(const BYTE * pCur)
+{
+ // this code might have a bug, but it will be removed in the next release...
+ size_t offset = pCur - CSphTokenizer_UTF8<IS_QUERY>::m_pBuffer;
+ //if(offset == 0) return false;
+ //printf("pcur: %s\n", pCur);
+
+ //css::Segmenter* seg = GetSegmenter(m_dictpath.cstr()); //TODO fill blank here
+ {
+ u2 len = 0, symlen = 0;
+ while(m_segoffset < offset) {
+ //tok = (const char*)seg->peekToken(len, symlen);
+ //seg->popToken(len);
+ len = m_tokenlens[m_tokenpos];
+ m_tokenpos ++;
+ m_segoffset += len;
+ //printf("tok: %*.*s, len=%d\t ",len,len,tok, len);
+ if(m_tokenpos >= m_tokenlens.GetLength() || len==0){
+ //break?
+ break;
+ }
+ }
+ /*
+ printf("\n");
+ printf("seg_off %d vs off %d\n", m_segoffset, offset);
+ if(m_segoffset != offset)
+ printf("seg_pcur: %s\n", pCur);
+ */
+ return (m_segoffset == offset);
+ } //end if seg
+ return true;
+}
+
+template < bool IS_QUERY >
+BYTE * CSphTokenizer_UTF8MMSeg<IS_QUERY>::GetToken ()
+{
+ //return CSphTokenizer_UTF8<IS_QUERY>::GetToken();
+ m_iLastTokenLenMMSeg = 0;
+ //BYTE* tok = CSphTokenizer_UTF8::GetToken();
+ while(!IsSegment(CSphTokenizer_UTF8<IS_QUERY>::m_pCur) || m_pAccumSeg == m_sAccumSeg)
+ {
+ BYTE* tok = CSphTokenizer_UTF8<IS_QUERY>::GetToken();
+ //printf("utf8_token: %s \t ", tok);
+ if(!tok){
+ m_iLastTokenLenMMSeg = 0;
+ return NULL;
+ }
+
+ int token_buf_len = strlen((const char*)tok);
+
+ if(m_pAccumSeg == m_sAccumSeg)
+ m_segToken = (char*)CSphTokenizer_UTF8<IS_QUERY>::m_pTokenStart;
+
+ if ( (m_pAccumSeg - m_sAccumSeg)<SPH_MAX_WORD_LEN ) {
+ ::memcpy(m_pAccumSeg, tok, token_buf_len);
+ m_pAccumSeg += token_buf_len;
+ m_iLastTokenLenMMSeg += CSphTokenizer_UTF8<IS_QUERY>::GetLastTokenLen();
+ }
+ }
+ {
+ *m_pAccumSeg = 0;
+ //m_iLastTokenBufferLen = m_pAccumSeg - m_sAccumSeg;
+ m_pAccumSeg = m_sAccumSeg;
+
+ return m_sAccumSeg;
+ }
+}
+
+template < bool IS_QUERY >
+ISphTokenizer * CSphTokenizer_UTF8MMSeg<IS_QUERY>::Clone ( ESphTokenizerClone eMode ) const
+{
+ CSphTokenizerBase * pClone;
+ if ( eMode!=SPH_CLONE_INDEX ) {
+ pClone = (new CSphTokenizer_UTF8MMSeg<true>())->SetDictPath(m_dictpath.cstr());
+ }else{
+ pClone = (new CSphTokenizer_UTF8MMSeg<false>())->SetDictPath(m_dictpath.cstr());
+ }
+ pClone->CloneBase ( this, eMode );
+ return pClone;
+}
+
+template < bool IS_QUERY >
+const BYTE* CSphTokenizer_UTF8MMSeg<IS_QUERY>::GetThesaurus(BYTE * sBuffer, int iLength )
+{
+ css::Segmenter* seg = GetSegmenter(m_dictpath.cstr());
+ if(seg)
+ return (const BYTE*)seg->thesaurus((const char*)sBuffer, iLength);
+ return NULL;
+}
+
+#endif
//////////////////////////////////////////////////////////////////////////
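
Reading the implementation above as a whole: SetBuffer runs mmseg once over the entire buffer and caches every segment length in m_tokenlens; GetToken then keeps pulling plain UTF-8 tokens from the base class and appending them to the m_sAccumSeg accumulator until IsSegment() reports that the read cursor has reached an mmseg boundary, so a single emitted token may glue several base tokens into one multi-character Chinese word.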
@@ -9683,6 +9976,7 @@ void CSphIndex::SetupQueryTokenizer()
// create and setup a master copy of query time tokenizer
// that we can then use to create lightweight clones
SafeDelete ( m_pQueryTokenizer );
+ m_pTokenizer->ReloadSegDictionary();
m_pQueryTokenizer = m_pTokenizer->Clone ( SPH_CLONE_QUERY );
if ( IsStarDict() )
{
@@ -25994,6 +26288,7 @@ void CSphSource::Setup ( const CSphSourceSettings & tSettings )
m_bIndexExactWords = tSettings.m_bIndexExactWords;
m_iOvershortStep = Min ( Max ( tSettings.m_iOvershortStep, 0 ), 1 );
m_iStopwordStep = Min ( Max ( tSettings.m_iStopwordStep, 0 ), 1 );
+ m_bDebugDump = tSettings.m_bDebugDump; //coreseek: assign debug charset setting
m_bIndexSP = tSettings.m_bIndexSP;
m_dPrefixFields = tSettings.m_dPrefixFields;
m_dInfixFields = tSettings.m_dInfixFields;
@@ -26599,11 +26894,28 @@ void CSphSource_Document::BuildRegularHits ( SphDocID_t uDocid, bool bPayload, b
while ( ( m_iMaxHits==0 || m_tHits.m_dData.GetLength()+BUILD_REGULAR_HITS_COUNT<m_iMaxHits )
&& ( sWord = m_pTokenizer->GetToken() )!=NULL )
{
+ //FIXME: dump token to console --coreseek
+ //debug dump
+ if(m_pTokenizer->DumpToken()) {
+ printf("%s_x ", sWord); // make the same as pre-tokenized text.
+ }
+
+ // fix sWord if in pre-tokenized mode.
+ int iBytes = strlen ( (const char*)sWord );
+ bool bAdvancePos = true;
+ if(m_pTokenizer->IsPreTokenized()) {
+ // m_tState.m_iHitPos should not be 0; guards against a token without the _x suffix appearing at the very beginning.
+ if(sWord[iBytes-1] != 'x' && m_tState.m_iHitPos)
+ bAdvancePos = false; // not an advance token.
+ sWord[iBytes-2] = '\0'; // change token_x -> token\0x
+ iBytes -= 2; // decrease length
+ }
+
m_pDict->SetApplyMorph ( m_pTokenizer->GetMorphFlag() );
int iLastBlendedStart = TrackBlendedStart ( m_pTokenizer, iBlendedHitsStart, m_tHits.Length() );
- if ( !bPayload )
+ if ( !bPayload && bAdvancePos)
{
HITMAN::AddPos ( &m_tState.m_iHitPos, m_tState.m_iBuildLastStep + m_pTokenizer->GetOvershortCount()*m_iOvershortStep );
if ( m_pTokenizer->GetBoundary() )
@@ -26615,7 +26927,7 @@ void CSphSource_Document::BuildRegularHits ( SphDocID_t uDocid, bool bPayload, b
if ( bGlobalPartialMatch )
{
- int iBytes = strlen ( (const char*)sWord );
+ //int iBytes = strlen ( (const char*)sWord );
memcpy ( sBuf + 1, sWord, iBytes );
sBuf[0] = MAGIC_WORD_HEAD;
sBuf[iBytes+1] = '\0';
@@ -26625,7 +26937,7 @@ void CSphSource_Document::BuildRegularHits ( SphDocID_t uDocid, bool bPayload, b
ESphTokenMorph eMorph = m_pTokenizer->GetTokenMorph();
if ( m_bIndexExactWords && eMorph!=SPH_TOKEN_MORPH_GUESS )
{
- int iBytes = strlen ( (const char*)sWord );
+ //int iBytes = strlen ( (const char*)sWord );
memcpy ( sBuf + 1, sWord, iBytes );
sBuf[0] = MAGIC_WORD_HEAD_NONSTEMMED;
sBuf[iBytes+1] = '\0';
@@ -26661,6 +26973,27 @@ void CSphSource_Document::BuildRegularHits ( SphDocID_t uDocid, bool bPayload, b
m_tHits.AddHit ( uDocid, m_pDict->GetWordIDNonStemmed ( sBuf ), m_tState.m_iHitPos );
} else
m_tState.m_iBuildLastStep = m_iStopwordStep;
+#if USE_MMSEG
+ // works only when mmseg is on.
+ // zh_cn only GetThesaurus
+ {
+ int iBytes = strlen ( (const char*)sWord );
+ const BYTE* tbuf_ptr = m_pTokenizer->GetThesaurus(sWord, iBytes);
+ if(tbuf_ptr) {
+ while(*tbuf_ptr) {
+ size_t len = strlen((const char*)tbuf_ptr);
+ SphWordID_t iWord = m_pDict->GetWordID ( tbuf_ptr ,len , true);
+ if ( iWord ) {
+ m_tHits.AddHit ( uDocid, iWord, m_tState.m_iHitPos );
+ // mmseg; do not inc step for we are in 'one' hit.
+ //m_tState.m_iBuildLastStep = m_pTokenizer->TokenIsBlended() ? 0 : 1;
+ }
+ tbuf_ptr += len + 1; //move next
+ }
+ }
+ //end if buf
+ }//end GetThesaurus
+#endif
}
m_tState.m_bProcessingHits = ( sWord!=NULL );
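
The thesaurus block above assumes GetThesaurus() returns a buffer of concatenated NUL-terminated strings ending with an empty string (for example "syn1\0syn2\0\0"); each entry is added as an extra hit at the unchanged m_tState.m_iHitPos, so synonyms match at the same position as the original token.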
diff --git a/src/sphinx.h b/src/sphinx.h
index a8f16ca..07453bc 100644
--- a/src/sphinx.h
+++ b/src/sphinx.h
@@ -28,6 +28,7 @@
#define USE_RE2 0 /// whether to compile RE2 support
#define USE_RLP 0 /// whether to compile RLP support
#define USE_WINDOWS 1 /// whether to compile for Windows
+ #define USE_MMSEG 1 /// enable mmseg
#define USE_SYSLOG 0 /// whether to use syslog for logging
#define HAVE_STRNLEN 1
@@ -208,7 +209,7 @@ inline const DWORD * STATIC2DOCINFO ( const DWORD * pAttrs ) { return STATIC2DOC
#define SPHINX_VERSION_NUMBERS "2.2.11"
#define SPHINX_VERSION SPHINX_VERSION_NUMBERS SPHINX_BITS_TAG SPHINX_TAG " (" SPH_GIT_COMMIT_ID ")"
-#define SPHINX_BANNER "Sphinx " SPHINX_VERSION "\nCopyright (c) 2001-2016, Andrew Aksyonoff\nCopyright (c) 2008-2016, Sphinx Technologies Inc (http://sphinxsearch.com)\n\n"
+#define SPHINX_BANNER_ORIG "Sphinx " SPHINX_VERSION "\nCopyright (c) 2001-2016, Andrew Aksyonoff\nCopyright (c) 2008-2016, Sphinx Technologies Inc (http://sphinxsearch.com)\n\n"
#define SPHINX_SEARCHD_PROTO 1
#define SPHINX_CLIENT_VERSION 1
@@ -216,6 +217,10 @@ inline const DWORD * STATIC2DOCINFO ( const DWORD * pAttrs ) { return STATIC2DOC
#define SPH_MAX_FILENAME_LEN 512
#define SPH_MAX_FIELDS 256
+#define CORESEEK_BANNER "Coreseek FullText Search 5.1 \nCopyright (c) 2008-2015, Beijing Choice Software Technologies Inc (http://www.coreseek.com)\n\n"
+#define SPHINX_BANNER2 "" CORESEEK_BANNER "" SPHINX_BANNER_ORIG
+#define SPHINX_BANNER SPHINX_BANNER2
+
/////////////////////////////////////////////////////////////////////////////
extern int64_t g_iIndexerCurrentDocID;
@@ -499,7 +504,10 @@ struct CSphTokenizerSettings
CSphString m_sBlendChars;
CSphString m_sBlendMode;
CSphString m_sIndexingPlugin; ///< this tokenizer wants an external plugin to process its raw output
-
+ int m_iDebug; ///< is in tokenizer debug mode.
+#if USE_MMSEG
+ CSphString m_sDictPath; ///coreseek: where to find the segmenter's dict.
+#endif
CSphTokenizerSettings ();
};
@@ -610,11 +618,16 @@ public:
/// get synonym file info
virtual const CSphSavedFile & GetSynFileInfo () const { return m_tSynFileInfo; }
+ /// mark as debug tokenizer's output --coreseek -mmseg
+ virtual int DumpToken () { return m_tSettings.m_iDebug; }
public:
/// pass next buffer
virtual void SetBuffer ( const BYTE * sBuffer, int iLength ) = 0;
+ /// is pre-tokenized --coreseek
+ virtual bool IsPreTokenized() { return false; }
+
/// set current index schema (only intended for the token filter plugins)
virtual bool SetFilterSchema ( const CSphSchema &, CSphString & ) { return true; }
@@ -697,6 +710,10 @@ public:
/// set new buffer ptr (must be within current bounds)
virtual void SetBufferPtr ( const char * sNewPtr ) = 0;
+#if USE_MMSEG
+ virtual const BYTE* GetThesaurus(BYTE * , int ) { return NULL; }
+ virtual void ReloadSegDictionary() { return; } // reload mmseg's dictionary.
+#endif
/// get settings hash
virtual uint64_t GetSettingsFNV () const;
@@ -721,6 +738,9 @@ protected:
CSphLowercaser m_tLC; ///< my lowercaser
int m_iLastTokenLen; ///< last token length, in codepoints
bool m_bTokenBoundary; ///< last token boundary flag (true after boundary codepoint followed by separator)
+#if USE_MMSEG
+ int m_iLastTokenBufferLen; ///< the buffer length -- coreseek; use in mmseg patch.
+#endif
bool m_bBoundary; ///< boundary flag (true immediately after boundary codepoint)
int m_iBoundaryOffset; ///< boundary character offset (in bytes)
bool m_bWasSpecial; ///< special token flag
@@ -1826,6 +1846,7 @@ struct CSphSourceSettings
int m_iStopwordStep; ///< position step on stopword token (default is 1)
bool m_bIndexSP; ///< whether to index sentence and paragraph delimiters
bool m_bIndexFieldLens; ///< whether to index field lengths
+ int m_bDebugDump; ///< mmseg charset debug output feature
CSphVector<CSphString> m_dPrefixFields; ///< list of prefix fields
CSphVector<CSphString> m_dInfixFields; ///< list of infix fields
diff --git a/src/sphinxutils.cpp b/src/sphinxutils.cpp
index 13ed63a..05e7b2e 100644
--- a/src/sphinxutils.cpp
+++ b/src/sphinxutils.cpp
@@ -530,6 +530,8 @@ static KeyDesc_t g_dKeysIndex[] =
{ "min_word_len", 0, NULL },
{ "charset_type", KEY_REMOVED, NULL },
{ "charset_table", 0, NULL },
+ { "charset_dictpath", 0, NULL }, //coreseek: mmseg's dictionary path
+ { "charset_debug", 0, NULL }, //coreseek: debug output tokens
{ "ignore_chars", 0, NULL },
{ "min_prefix_len", 0, NULL },
{ "min_infix_len", 0, NULL },
@@ -1256,7 +1258,10 @@ void sphConfTokenizer ( const CSphConfigSection & hIndex, CSphTokenizerSettings
{
tSettings.m_iNgramLen = Max ( hIndex.GetInt ( "ngram_len" ), 0 );
- if ( hIndex ( "ngram_chars" ) )
+ if(hIndex("charset_debug"))
+ tSettings.m_iDebug = hIndex["charset_debug"].intval();
+
+ if ( hIndex ( "ngram_chars" ) )
{
if ( tSettings.m_iNgramLen )
tSettings.m_iType = TOKENIZER_NGRAM;
@@ -1264,6 +1269,15 @@ void sphConfTokenizer ( const CSphConfigSection & hIndex, CSphTokenizerSettings
sphWarning ( "ngram_chars specified, but ngram_len=0; IGNORED" );
}
+#if USE_MMSEG
+ //XXX:fixme : sphinx changes tokenizer create process
+ if (hIndex("charset_dictpath") && CSphString(hIndex.GetStr("charset_type")) =="zh_cn.utf-8" )
+ {
+ tSettings.m_sDictPath = hIndex.GetStr("charset_dictpath");
+ tSettings.m_iType = TOKENIZER_ZHCN_UTF8;
+ }
+#endif
+
tSettings.m_sCaseFolding = hIndex.GetStr ( "charset_table" );
tSettings.m_iMinWordLen = Max ( hIndex.GetInt ( "min_word_len", 1 ), 1 );
tSettings.m_sNgramChars = hIndex.GetStr ( "ngram_chars" );
@@ -1397,6 +1411,7 @@ bool sphConfIndex ( const CSphConfigSection & hIndex, CSphIndexSettings & tSetti
tSettings.m_iEmbeddedLimit = hIndex.GetSize ( "embedded_limit", 16384 );
tSettings.m_bIndexFieldLens = hIndex.GetInt ( "index_field_lengths" )!=0;
tSettings.m_sIndexTokenFilter = hIndex.GetStr ( "index_token_filter" );
+ tSettings.m_bDebugDump = hIndex.GetInt ( "charset_debug" )!=0;
// prefix/infix fields
CSphString sFields;
@@ -1697,12 +1712,12 @@ const char * sphLoadConfig ( const char * sOptConfig, bool bQuiet, CSphConfigPar
while ( !sOptConfig )
{
#ifdef SYSCONFDIR
- sOptConfig = SYSCONFDIR "/sphinx.conf";
+ sOptConfig = SYSCONFDIR "/csft.conf";
if ( sphIsReadable ( sOptConfig ) )
break;
#endif
- sOptConfig = "./sphinx.conf";
+ sOptConfig = "./csft.conf";
if ( sphIsReadable ( sOptConfig ) )
break;
@@ -1713,9 +1728,9 @@ const char * sphLoadConfig ( const char * sOptConfig, bool bQuiet, CSphConfigPar
if ( !sOptConfig )
sphDie ( "no readable config file (looked in "
#ifdef SYSCONFDIR
- SYSCONFDIR "/sphinx.conf, "
+ SYSCONFDIR "/csft.conf, "
#endif
- "./sphinx.conf)" );
+ "./csft.conf)" );
if ( !bQuiet )
fprintf ( stdout, "using config file '%s'...\n", sOptConfig );
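
Putting the new index keys together, a minimal index section for this patch might look like the following (paths are illustrative; charset_type must be exactly "zh_cn.utf-8" for the mmseg tokenizer to be selected, even though g_dKeysIndex still lists the key as removed upstream):

    index cn_example
    {
        source           = cn_source
        path             = /var/data/cn_example
        charset_type     = zh_cn.utf-8            # selects TOKENIZER_ZHCN_UTF8 (this patch)
        charset_dictpath = /usr/local/mmseg3/etc  # directory holding the mmseg dictionary
        charset_debug    = 0                      # 1 = dump tokens to stdout while indexing
    }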
diff --git a/src/sphinxutils.h b/src/sphinxutils.h
index 9542afd..9a8a625 100644
--- a/src/sphinxutils.h
+++ b/src/sphinxutils.h
@@ -147,6 +147,7 @@ enum
// where was TOKENIZER_SBCS=1 once
TOKENIZER_UTF8 = 2,
TOKENIZER_NGRAM = 3
+ , TOKENIZER_ZHCN_UTF8 = 4
};
/// load config file
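
Everything below is the same change set rebased onto Sphinx 2.3.1; only blob hashes, hunk offsets, and surrounding context differ from the 2.2.11 patch above.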
diff --git a/.gitignore b/.gitignore
index 9c2c126..5ff0e50 100644
--- a/.gitignore
+++ b/.gitignore
@@ -63,3 +63,10 @@
/test/ql/data/*.lock
/test/ql/*.class
/test/ql/*.exe
+
+# for qt-creator
+/*.user
+
+# for patch
+*.rej
+*.orig
diff --git a/acinclude.m4 b/acinclude.m4
index e09697e..3ae78b0 100644
--- a/acinclude.m4
+++ b/acinclude.m4
@@ -297,6 +297,95 @@ ERROR: cannot find PostgreSQL libraries. If you want to compile with PosgregSQL
fi
])
+dnl ---------------------------------------------------------------------------
+dnl Macro: AC_CHECK_MMSEG
+dnl ---------------------------------------------------------------------------
+
+AC_DEFUN([AC_CHECK_MMSEG],[
+
+if test [ -z "$MMSEG_CFLAGS" ]
+then
+ for CANDIDATE in "$user_mmseg_includes" "/usr/local/include/mmseg" "/usr/include/mmseg"
+ do
+ if test [ -n "$CANDIDATE" -a -r "$CANDIDATE/Segmenter.h" ]
+ then
+ MMSEG_CFLAGS="-I$CANDIDATE"
+ break
+ fi
+ done
+fi
+
+# explicit overrides will be applied later
+if test [ -z "$MMSEG_LIBS" ]
+then
+ for CANDIDATE in "$user_mmseg_libs" "/usr/lib64" \
+ "/usr/local/lib" "/usr/local/mmseg/lib" \
+ "/usr/local/lib/mmseg" "/usr/lib" \
+ "/opt/mmseg/lib"
+ do
+ if test [ -n "$CANDIDATE" -a -d "$CANDIDATE" ]
+ then
+ MMSEG_LIBS="-L$CANDIDATE -lmmseg"
+ break
+ fi
+ done
+fi
+
+# apply explicit include path overrides
+AC_ARG_WITH([mmseg-includes],
+ AC_HELP_STRING([--with-mmseg-includes], [path to libmmseg header files]),
+ [ac_cv_mmseg_includes=$withval])
+if test [ -n "$ac_cv_mmseg_includes" ]
+then
+ MMSEG_CFLAGS="-I$ac_cv_mmseg_includes"
+fi
+
+
+# apply explicit lib path overrides
+AC_ARG_WITH([mmseg-libs],
+ AC_HELP_STRING([--with-mmseg-libs], [path to libmmseg libraries]),
+ [ac_cv_mmseg_libs=$withval])
+if test [ -n "$ac_cv_mmseg_libs" ]
+then
+ # Trim trailing '.libs' if the user passed it in the --with-mmseg-libs option
+ ac_cv_mmseg_libs=`echo ${ac_cv_mmseg_libs} | sed -e 's/.libs$//' \
+ -e 's+.libs/$++'`
+ MMSEG_LIBS="-L$ac_cv_mmseg_libs -lmmseg"
+fi
+
+# now that we did all we could, perform final checks
+AC_MSG_CHECKING([libmmseg include files])
+if test [ -z "$MMSEG_CFLAGS" ]
+then
+ AC_MSG_ERROR([missing include files.
+
+******************************************************************************
+ERROR: cannot find libmmseg include files.
+
+To disable libmmseg support, use --without-mmseg option.
+******************************************************************************
+])
+else
+ AC_MSG_RESULT([$MMSEG_CFLAGS])
+fi
+
+AC_MSG_CHECKING([libmmseg libraries])
+if test [ -z "$MMSEG_LIBS" ]
+then
+ AC_MSG_ERROR([missing libraries.
+
+******************************************************************************
+ERROR: cannot find libmmseg libraries.
+
+To disable libmmseg support, use --without-mmseg option.
+******************************************************************************
+])
+else
+ AC_MSG_RESULT([$MMSEG_LIBS])
+fi
+
+])
+
dnl ---------------------------------------------------------------------------
dnl Macro: AC_CHECK_LIBSTEMMER
dnl Check the libstemmer first in custom include path in --with-libstemmer=*
diff --git a/configure.ac b/configure.ac
index 96fa3b4..f614a10 100644
--- a/configure.ac
+++ b/configure.ac
@@ -66,6 +66,7 @@ fi
AC_PROG_CC
AC_PROG_CXX
+AM_PROG_AR
AC_PROG_RANLIB
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
@@ -426,6 +427,24 @@ else
fi
AM_CONDITIONAL(USE_PGSQL, test x$ac_cv_use_pgsql != xno -o x$ac_cv_use_satic_pgsql != xno )
+dnl ---
+# check if we should compile with libmmseg (mmseg Chinese segmenter) support
+AC_ARG_WITH([mmseg],
+ AC_HELP_STRING([--with-mmseg], [compile with libmmseg, a mmseg Chinese Segmenter support (default is enabled)]),
+ [ac_cv_use_mmseg=$withval], [ac_cv_use_mmseg=yes]
+)
+AC_MSG_CHECKING([whether to compile with libmmseg support])
+if test x$ac_cv_use_mmseg != xno; then
+ AC_MSG_RESULT([yes])
+ AC_CHECK_MMSEG([$ac_cv_use_mmseg])
+ AC_DEFINE(USE_MMSEG,1,[Define to 1 if you want to compile with libmmseg support])
+ AC_SUBST([MMSEG_LIBS])
+ AC_SUBST([MMSEG_CFLAGS])
+else
+ AC_MSG_RESULT([no])
+fi
+AM_CONDITIONAL(USE_MMSEG, test x$ac_cv_use_mmseg != xno)
+
# add macports include directory
if (echo $MYSQL_LIBS | grep -q -- -L/opt/local/lib); then
MYSQL_CFLAGS="$MYSQL_CFLAGS -I/opt/local/include"
@@ -480,7 +499,7 @@ AM_CONDITIONAL(USE_INTERNAL_LIBSTEMMER, test x$ac_cv_use_internal_libstemmer !=
dnl ---
# we can now set preprocessor flags for both C and C++ compilers
-CPPFLAGS="$CPPFLAGS $MYSQL_CFLAGS $PGSQL_CFLAGS $LIBSTEMMER_CFLAGS"
+CPPFLAGS="$CPPFLAGS $MYSQL_CFLAGS $PGSQL_CFLAGS $LIBSTEMMER_CFLAGS $MMSEG_CFLAGS"
AC_ARG_WITH([libexpat],
diff --git a/libstemmer_c/Makefile.am b/libstemmer_c/Makefile.am
index a973921..fb93b5f 100644
--- a/libstemmer_c/Makefile.am
+++ b/libstemmer_c/Makefile.am
@@ -1,3 +1,4 @@
+AUTOMAKE_OPTIONS = subdir-objects
if USE_LIBSTEMMER
noinst_LIBRARIES = libstemmer.a
include $(srcdir)/mkinc.mak
diff --git a/src/Makefile.am b/src/Makefile.am
index 63b7d8f..3a1ba55 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -30,5 +30,9 @@ RLP_INC =
endif
AM_CPPFLAGS = $(LIBRE2_CFLAGS) $(RLP_INC) -DSYSCONFDIR="\"$(sysconfdir)\"" -DDATADIR="\"$(localstatedir)/data\""
+if USE_MMSEG
+COMMON_LIBS = libsphinx.a $(LIBSTEMMER_LIBS) $(MYSQL_LIBS) $(PGSQL_LIBS) $(LIBRE2_LIBS) $(RLP_LIBS) $(MMSEG_LIBS)
+else
COMMON_LIBS = libsphinx.a $(LIBSTEMMER_LIBS) $(MYSQL_LIBS) $(PGSQL_LIBS) $(LIBRE2_LIBS) $(RLP_LIBS)
+endif
LDADD = $(COMMON_LIBS)
diff --git a/src/indexer.cpp b/src/indexer.cpp
index fa2296d..00a2996 100644
--- a/src/indexer.cpp
+++ b/src/indexer.cpp
@@ -1766,7 +1766,7 @@ int main ( int argc, char ** argv )
"\n"
"Options are:\n"
"--config <file>\t\tread configuration from specified file\n"
- "\t\t\t(default is sphinx.conf)\n"
+ "\t\t\t(default is csft.conf)\n"
"--all\t\t\treindex all configured indexes\n"
"--quiet\t\t\tbe quiet, only print errors\n"
"--verbose\t\tverbose indexing issues report\n"
@@ -1795,8 +1795,8 @@ int main ( int argc, char ** argv )
"--keep-attrs\t\tretain attributes from the old index"
"\n"
"Examples:\n"
- "indexer --quiet myidx1\treindex 'myidx1' defined in 'sphinx.conf'\n"
- "indexer --all\t\treindex all indexes defined in 'sphinx.conf'\n" );
+ "indexer --quiet myidx1\treindex 'myidx1' defined in 'csft.conf'\n"
+ "indexer --all\t\treindex all indexes defined in 'csft.conf'\n" );
}
return 1;
diff --git a/src/searchd.cpp b/src/searchd.cpp
index 43d2ab8..6619e69 100644
--- a/src/searchd.cpp
+++ b/src/searchd.cpp
@@ -20772,7 +20772,7 @@ void ShowHelp ()
"Options are:\n"
"-h, --help\t\tdisplay this help message\n"
"-c, --config <file>\tread configuration from specified file\n"
- "\t\t\t(default is sphinx.conf)\n"
+ "\t\t\t(default is csft.conf)\n"
"--stop\t\t\tsend SIGTERM to currently running searchd\n"
"--stopwait\t\tsend SIGTERM and wait until actual exit\n"
"--status\t\tget ant print status variables\n"
@@ -20809,9 +20809,9 @@ void ShowHelp ()
"--safetrace\t\tonly use system backtrace() call in crash reports\n"
"\n"
"Examples:\n"
- "searchd --config /usr/local/sphinx/etc/sphinx.conf\n"
+ "searchd --config /usr/local/sphinx/etc/csft.conf\n"
#if USE_WINDOWS
- "searchd --install --config c:\\sphinx\\sphinx.conf\n"
+ "searchd --install --config c:\\sphinx\\csft.conf\n"
#endif
);
}
@@ -23833,12 +23833,12 @@ int WINAPI ServiceMain ( int argc, char **argv )
while ( !g_sConfigFile.cstr() )
{
#ifdef SYSCONFDIR
- g_sConfigFile = SYSCONFDIR "/sphinx.conf";
+ g_sConfigFile = SYSCONFDIR "/csft.conf";
if ( sphIsReadable ( g_sConfigFile.cstr () ) )
break;
#endif
- g_sConfigFile = "./sphinx.conf";
+ g_sConfigFile = "./";
if ( sphIsReadable ( g_sConfigFile.cstr () ) )
break;
@@ -23849,9 +23849,9 @@ int WINAPI ServiceMain ( int argc, char **argv )
if ( !g_sConfigFile.cstr () )
sphFatal ( "no readable config file (looked in "
#ifdef SYSCONFDIR
- SYSCONFDIR "/sphinx.conf, "
+ SYSCONFDIR "/csft.conf, "
#endif
- "./sphinx.conf)." );
+ "./csft.conf)." );
sphInfo ( "using config file '%s'...", g_sConfigFile.cstr () );
diff --git a/src/sphinx.cpp b/src/sphinx.cpp
index c63293f..3df0d2f 100644
--- a/src/sphinx.cpp
+++ b/src/sphinx.cpp
@@ -142,6 +142,16 @@
#pragma message("Automatically linking with btutils.lib")
#endif
+#if ( USE_WINDOWS && USE_MMSEG )
+ #if _DEBUG
+ #pragma comment(linker, "/defaultlib:libcss_d.lib")
+ #else
+ #pragma comment(linker, "/defaultlib:libcss.lib")
+ #endif
+ #pragma message("Automatically linking with libcss.lib")
+ #pragma warning(disable:4530) // for ugly mmseg
+#endif
+
/////////////////////////////////////////////////////////////////////////////
// logf() is not there sometimes (eg. Solaris 9)
@@ -2556,10 +2566,14 @@ class CSphTokenizer_UTF8 : public CSphTokenizerBase2
public:
CSphTokenizer_UTF8 ();
virtual void SetBuffer ( const BYTE * sBuffer, int iLength );
+ virtual bool IsPreTokenized() { return m_bPreTokenized; }
virtual BYTE * GetToken ();
virtual ISphTokenizer * Clone ( ESphTokenizerClone eMode ) const;
virtual int GetCodepointLength ( int iCode ) const;
virtual int GetMaxCodepointLength () const { return m_tLC.GetMaxCodepointLength(); }
+
+protected:
+ bool m_bPreTokenized;
};
@@ -2580,6 +2594,78 @@ protected:
CSphString m_sNgramCharsStr;
};
+#if USE_MMSEG
+
+#include "SegmenterManager.h"
+#include "Segmenter.h"
+
+typedef CSR_Singleton<css::SegmenterManager> SegmenterManagerSingleInstance;
+
+template < bool IS_QUERY >
+class CSphTokenizer_UTF8MMSeg : public CSphTokenizer_UTF8<IS_QUERY>
+{
+public:
+ CSphTokenizer_UTF8MMSeg ();
+ ~CSphTokenizer_UTF8MMSeg() {
+ if(m_seg){
+ SafeDelete ( m_seg );
+ }
+ }
+
+ virtual void SetBuffer ( const BYTE * sBuffer, int iLength );
+ virtual BYTE * GetToken ();
+ virtual ISphTokenizer * Clone ( ESphTokenizerClone eMode ) const;
+ virtual const BYTE* GetThesaurus(BYTE * sBuffer, int iLength );
+ bool IsSegment(const BYTE * pCur);
+
+ CSphTokenizerBase* SetDictPath(const char* path) { m_dictpath = path; return this; }
+
+ virtual const char * GetBufferPtr () const { return (const char *) CSphTokenizer_UTF8MMSeg<IS_QUERY>::m_pCur; }
+
+ virtual const char * GetTokenStart () const { return m_segToken; }
+
+ virtual int GetLastTokenLen () const { return m_iLastTokenLenMMSeg; }
+
+ virtual void ReloadSegDictionary() {
+ if(m_seg){
+ SafeDelete ( m_seg );
+ }
+ m_seg = NULL;
+
+ if(m_mgr) {
+ SegmenterManagerSingleInstance::Free(); // free preexist instance.
+ m_mgr = NULL;
+ }
+ }
+protected:
+ char* m_segToken;
+ size_t m_segoffset;
+ int m_iLastTokenLenMMSeg;
+ BYTE m_sAccumSeg [ 3*SPH_MAX_WORD_LEN+3 ]; ///< folded token accumulator
+ BYTE * m_pAccumSeg; ///< current accumulator position
+ CSphVector<u2> m_tokenlens;
+ int m_tokenpos;
+protected:
+ // virtual bool IsSegment(const BYTE * pCur);
+ CSphString m_dictpath;
+
+ // mmseg related
+ css::Segmenter* m_seg;
+ css::SegmenterManager* m_mgr;
+ css::Segmenter* GetSegmenter(const char* dict_path){
+ int nRet = 0;
+ if(!m_mgr) {
+ m_mgr = SegmenterManagerSingleInstance::Get();
+ if(dict_path)
+ nRet = m_mgr->init(dict_path);
+ }
+ if(nRet == 0 && !m_seg)
+ m_seg = m_mgr->getSegmenter(false);
+ return m_seg;
+ }
+};
+
+#endif
struct CSphNormalForm
{
@@ -3798,6 +3884,15 @@ ISphTokenizer * sphCreateUTF8NgramTokenizer ()
return new CSphTokenizer_UTF8Ngram<false> ();
}
+#if USE_MMSEG
+ISphTokenizer * sphCreateUTF8ChineseTokenizer ( const char* dict_path )
+{
+ CSphTokenizer_UTF8MMSeg<false>* tokenizer = new CSphTokenizer_UTF8MMSeg<false> ();
+ tokenizer->SetDictPath(dict_path);
+ return tokenizer;
+}
+#endif
+
/////////////////////////////////////////////////////////////////////////////
enum
@@ -4383,6 +4478,7 @@ CSphTokenizerSettings::CSphTokenizerSettings ()
: m_iType ( TOKENIZER_UTF8 )
, m_iMinWordLen ( 1 )
, m_iNgramLen ( 0 )
+ , m_iDebug ( 0 )
{
}
@@ -4394,7 +4490,11 @@ bool LoadTokenizerSettings ( CSphReader & tReader, CSphTokenizerSettings & tSett
return true;
tSettings.m_iType = tReader.GetByte ();
- if ( tSettings.m_iType!=TOKENIZER_UTF8 && tSettings.m_iType!=TOKENIZER_NGRAM )
+#if USE_MMSEG
+ if ( tSettings.m_iType!=TOKENIZER_UTF8 && tSettings.m_iType!=TOKENIZER_NGRAM && tSettings.m_iType!=TOKENIZER_ZHCN_UTF8)
+#else
+ if ( tSettings.m_iType!=TOKENIZER_UTF8 && tSettings.m_iType!=TOKENIZER_NGRAM)
+#endif
{
sWarning = "can't load an old index with SBCS tokenizer";
return false;
@@ -4422,7 +4522,11 @@ bool LoadTokenizerSettings ( CSphReader & tReader, CSphTokenizerSettings & tSett
tSettings.m_sIgnoreChars = tReader.GetString ();
tSettings.m_iNgramLen = tReader.GetDword ();
tSettings.m_sNgramChars = tReader.GetString ();
- if ( uVersion>=15 )
+#if USE_MMSEG
+ //mmseg --coreseek: the mmseg option makes coreseek and sphinx index formats incompatible.
+ tSettings.m_sDictPath = tReader.GetString ();
+#endif
+ if ( uVersion>=15 )
tSettings.m_sBlendChars = tReader.GetString ();
if ( uVersion>=24 )
tSettings.m_sBlendMode = tReader.GetString();
@@ -4453,6 +4557,10 @@ void SaveTokenizerSettings ( CSphWriter & tWriter, ISphTokenizer * pTokenizer, i
tWriter.PutString ( tSettings.m_sIgnoreChars.cstr () );
tWriter.PutDword ( tSettings.m_iNgramLen );
tWriter.PutString ( tSettings.m_sNgramChars.cstr () );
+#if USE_MMSEG
+ // if mmseg is turned off, the index format is compatible again.
+ tWriter.PutString ( tSettings.m_sDictPath.cstr () );
+#endif
tWriter.PutString ( tSettings.m_sBlendChars.cstr () );
tWriter.PutString ( tSettings.m_sBlendMode.cstr () );
}
@@ -4727,6 +4835,9 @@ ISphTokenizer * ISphTokenizer::Create ( const CSphTokenizerSettings & tSettings,
{
case TOKENIZER_UTF8: pTokenizer = sphCreateUTF8Tokenizer (); break;
case TOKENIZER_NGRAM: pTokenizer = sphCreateUTF8NgramTokenizer (); break;
+#if USE_MMSEG
+ case TOKENIZER_ZHCN_UTF8: pTokenizer = sphCreateUTF8ChineseTokenizer(tSettings.m_sDictPath.cstr()); break;
+#endif
default:
sError.SetSprintf ( "failed to create tokenizer (unknown charset type '%d')", tSettings.m_iType );
return NULL;
@@ -5966,7 +6077,24 @@ CSphTokenizer_UTF8<IS_QUERY>::CSphTokenizer_UTF8 ()
{
CSphString sTmp;
SetCaseFolding ( SPHINX_DEFAULT_UTF8_TABLE, sTmp );
- m_bHasBlend = false;
+
+ // BEGIN CJK: no case folding is applied here; that should be done in the remote tokenizer.
+ // Here we just make sure CJK characters are retained. --coreseek
+ // 4e00 - 9fff CJK unified ideographs
+ // 3000 - 303f CJK symbols and punctuation
+ // 3040 - 30ff Hiragana/Katakana
+ // ff00 - ffff half/fullwidth forms
+ CSphVector<CSphRemapRange> dRemaps;
+ dRemaps.Add ( CSphRemapRange ( 0x4E00, 0x9FFF, 0x4E00 ) );
+ dRemaps.Add ( CSphRemapRange ( 0xFF10, 0xFFFF, 0xFF10 ) );
+ dRemaps.Add ( CSphRemapRange ( 0x3040, 0x30FF, 0x3040 ) );
+
+ m_tLC.AddRemaps ( dRemaps,
+ FLAG_CODEPOINT_NGRAM ); // !COMMIT support other n-gram lengths than 1
+ // ENDCJK
+ m_bPreTokenized = false; // by default use original route.
+
+ m_bHasBlend = false;
}
@@ -5976,10 +6104,29 @@ void CSphTokenizer_UTF8<IS_QUERY>::SetBuffer ( const BYTE * sBuffer, int iLength
// check that old one is over and that new length is sane
assert ( iLength>=0 );
- // set buffer
+ // set buffer
m_pBuffer = sBuffer;
+ // check whether this is a pre-segmented buffer, marked by the prefix U+FFFA
+ // if so, the buffer starts with 0xFFFA, 0x41, then the content --coreseek
+ m_bPreTokenized = false;
+ if(iLength > 4)
+ {
+ // there may be a ' ' (space, 32) as padding; this is not guaranteed
+ unsigned char mask[] = {32, 239, 191, 186, 65};
+ unsigned char mask_bare[] = {239, 191, 186, 65};
+ if(strncmp( (const char *)mask, (const char *)sBuffer, 5) == 0) {
+ // 0xFFFA is a magic marker; if it heads the buffer, mark the buffer pre-tokenized.
+ m_bPreTokenized = true;
+ m_pBuffer += 5;
+ }else
+ if(strncmp( (const char *)mask_bare, (const char *)sBuffer, 4) == 0) {
+ m_bPreTokenized = true;
+ m_pBuffer += 4;
+ }
+ }
+
m_pBufferMax = sBuffer + iLength;
- m_pCur = sBuffer;
+ m_pCur = m_pBuffer;
m_pTokenStart = m_pTokenEnd = NULL;
m_pBlendStart = m_pBlendEnd = NULL;
@@ -5997,7 +6144,7 @@ BYTE * CSphTokenizer_UTF8<IS_QUERY>::GetToken ()
m_bTokenBoundary = false;
m_bWasSynonym = false;
- return m_bHasBlend
+ return m_bHasBlend
? DoGetToken<IS_QUERY,true>()
: DoGetToken<IS_QUERY,false>();
}
@@ -6412,6 +6559,152 @@ BYTE * CSphTokenizer_UTF8Ngram<IS_QUERY>::GetToken ()
assert ( m_iNgramLen==1 );
return CSphTokenizer_UTF8<IS_QUERY>::GetToken ();
}
+//////////////////////////////////////////////////////////////////////////
+#if USE_MMSEG
+//////////////////////////////////////////////////////////////////////////
+template < bool IS_QUERY >
+CSphTokenizer_UTF8MMSeg<IS_QUERY>::CSphTokenizer_UTF8MMSeg ()
+ :CSphTokenizer_UTF8<IS_QUERY>()
+ , m_segoffset(0)
+{
+ //override charmap
+ CSphVector<CSphRemapRange> dRemaps;
+ dRemaps.Add ( CSphRemapRange ( 0x4E00, 0x9FFF, 0x4E00 ) );
+ dRemaps.Add ( CSphRemapRange ( 0xFF10, 0xFFFF, 0xFF10 ) );
+ dRemaps.Add ( CSphRemapRange ( 0x3040, 0x30FF, 0x3040 ) );
+
+ CSphTokenizer_UTF8<IS_QUERY>::m_tLC.AddRemaps ( dRemaps,
+ FLAG_CODEPOINT_NGRAM | FLAG_CODEPOINT_SPECIAL); // !COMMIT support other n-gram lengths than 1
+ m_pAccumSeg = m_sAccumSeg;
+ //m_iLastTokenBufferLen = 0;
+ m_iLastTokenLenMMSeg = 0;
+
+ m_mgr = NULL;
+ m_seg = NULL;
+ m_tokenlens.Reserve(1024*512); // reserve room for 512K token lengths
+}
+
+template < bool IS_QUERY >
+void CSphTokenizer_UTF8MMSeg<IS_QUERY>::SetBuffer ( const BYTE * sBuffer, int iLength )
+{
+ CSphTokenizer_UTF8<IS_QUERY>::SetBuffer(sBuffer, iLength);
+ css::Segmenter* seg = GetSegmenter(m_dictpath.cstr());
+ if(seg)
+ seg->setBuffer((u1*)CSphTokenizer_UTF8MMSeg<IS_QUERY>::m_pBuffer, iLength);
+ else
+ sphDie ( " Tokenizer initialization failure. " );
+ m_segoffset = 0;
+ m_segToken = (char*)CSphTokenizer_UTF8MMSeg<IS_QUERY>::m_pCur;
+
+ m_tokenlens.Reset();
+ m_tokenpos = 0;
+ {
+ u2 len = 0, symlen = 0;
+ while(1){
+ len = 0;
+ char* tok = (char*)seg->peekToken(len,symlen);
+ if(!tok || !*tok || !len)
+ break;
+ seg->popToken(len);
+
+ m_tokenlens.Add(len);
+ //printf("%*.*s/p ",symlen,symlen,tok);
+ }
+ }
+}
+
+template < bool IS_QUERY >
+bool CSphTokenizer_UTF8MMSeg<IS_QUERY>::IsSegment(const BYTE * pCur)
+{
+ // this code might have a bug, but it will be removed in the next release...
+ size_t offset = pCur - CSphTokenizer_UTF8<IS_QUERY>::m_pBuffer;
+ //if(offset == 0) return false;
+ //printf("pcur: %s\n", pCur);
+
+ //css::Segmenter* seg = GetSegmenter(m_dictpath.cstr()); //TODO fill blank here
+ {
+ u2 len = 0, symlen = 0;
+ while(m_segoffset < offset) {
+ //tok = (const char*)seg->peekToken(len, symlen);
+ //seg->popToken(len);
+ len = m_tokenlens[m_tokenpos];
+ m_tokenpos ++;
+ m_segoffset += len;
+ //printf("tok: %*.*s, len=%d\t ",len,len,tok, len);
+ if(m_tokenpos >= m_tokenlens.GetLength() || len==0){
+ //break?
+ break;
+ }
+ }
+ /*
+ printf("\n");
+ printf("seg_off %d vs off %d\n", m_segoffset, offset);
+ if(m_segoffset != offset)
+ printf("seg_pcur: %s\n", pCur);
+ */
+ return (m_segoffset == offset);
+ } //end if seg
+ return true;
+}
+
+template < bool IS_QUERY >
+BYTE * CSphTokenizer_UTF8MMSeg<IS_QUERY>::GetToken ()
+{
+ //return CSphTokenizer_UTF8<IS_QUERY>::GetToken();
+ m_iLastTokenLenMMSeg = 0;
+ //BYTE* tok = CSphTokenizer_UTF8::GetToken();
+ while(!IsSegment(CSphTokenizer_UTF8<IS_QUERY>::m_pCur) || m_pAccumSeg == m_sAccumSeg)
+ {
+ BYTE* tok = CSphTokenizer_UTF8<IS_QUERY>::GetToken();
+ //printf("utf8_token: %s \t ", tok);
+ if(!tok){
+ m_iLastTokenLenMMSeg = 0;
+ return NULL;
+ }
+
+ int token_buf_len = strlen((const char*)tok);
+
+ if(m_pAccumSeg == m_sAccumSeg)
+ m_segToken = (char*)CSphTokenizer_UTF8<IS_QUERY>::m_pTokenStart;
+
+ if ( (m_pAccumSeg - m_sAccumSeg)<SPH_MAX_WORD_LEN ) {
+ ::memcpy(m_pAccumSeg, tok, token_buf_len);
+ m_pAccumSeg += token_buf_len;
+ m_iLastTokenLenMMSeg += CSphTokenizer_UTF8<IS_QUERY>::GetLastTokenLen();
+ }
+ }
+ {
+ *m_pAccumSeg = 0;
+ //m_iLastTokenBufferLen = m_pAccumSeg - m_sAccumSeg;
+ m_pAccumSeg = m_sAccumSeg;
+
+ return m_sAccumSeg;
+ }
+}
+
+template < bool IS_QUERY >
+ISphTokenizer * CSphTokenizer_UTF8MMSeg<IS_QUERY>::Clone ( ESphTokenizerClone eMode ) const
+{
+ CSphTokenizerBase * pClone;
+ if ( eMode!=SPH_CLONE_INDEX ) {
+ pClone = (new CSphTokenizer_UTF8MMSeg<true>())->SetDictPath(m_dictpath.cstr());
+ }else{
+ pClone = (new CSphTokenizer_UTF8MMSeg<false>())->SetDictPath(m_dictpath.cstr());
+ }
+ pClone->CloneBase ( this, eMode );
+ return pClone;
+}
+
+template < bool IS_QUERY >
+const BYTE* CSphTokenizer_UTF8MMSeg<IS_QUERY>::GetThesaurus(BYTE * sBuffer, int iLength )
+{
+ css::Segmenter* seg = GetSegmenter(m_dictpath.cstr());
+ if(seg)
+ return (const BYTE*)seg->thesaurus((const char*)sBuffer, iLength);
+ return NULL;
+}
+
+#endif
//////////////////////////////////////////////////////////////////////////
@@ -9678,6 +9971,7 @@ void CSphIndex::SetupQueryTokenizer()
// create and setup a master copy of query time tokenizer
// that we can then use to create lightweight clones
SafeDelete ( m_pQueryTokenizer );
+ m_pTokenizer->ReloadSegDictionary();
m_pQueryTokenizer = m_pTokenizer->Clone ( SPH_CLONE_QUERY );
if ( IsStarDict() )
{
@@ -25691,6 +25985,7 @@ void CSphSource::Setup ( const CSphSourceSettings & tSettings )
m_bIndexExactWords = tSettings.m_bIndexExactWords;
m_iOvershortStep = Min ( Max ( tSettings.m_iOvershortStep, 0 ), 1 );
m_iStopwordStep = Min ( Max ( tSettings.m_iStopwordStep, 0 ), 1 );
+ m_bDebugDump = tSettings.m_bDebugDump; //coreseek: assign debug charset setting
m_bIndexSP = tSettings.m_bIndexSP;
m_dPrefixFields = tSettings.m_dPrefixFields;
m_dInfixFields = tSettings.m_dInfixFields;
@@ -26295,11 +26590,28 @@ void CSphSource_Document::BuildRegularHits ( SphDocID_t uDocid, bool bPayload, b
while ( ( m_iMaxHits==0 || m_tHits.m_dData.GetLength()+BUILD_REGULAR_HITS_COUNT<m_iMaxHits )
&& ( sWord = m_pTokenizer->GetToken() )!=NULL )
{
+ //FIXME: dump token to console --coreseek
+ //debug dump
+ if(m_pTokenizer->DumpToken()) {
+ printf("%s_x ", sWord); // make the same as pre-tokenized text.
+ }
+
+ // fix sWord if in pre-tokenized mode.
+ int iBytes = strlen ( (const char*)sWord );
+ bool bAdvancePos = true;
+ if(m_pTokenizer->IsPreTokenized()) {
+ // m_tState.m_iHitPos should not be 0; guards against a token without the _x suffix appearing at the very beginning.
+ if(sWord[iBytes-1] != 'x' && m_tState.m_iHitPos)
+ bAdvancePos = false; // not an advance token.
+ sWord[iBytes-2] = '\0'; // change token_x -> token\0x
+ iBytes -= 2; // decrease length
+ }
+
m_pDict->SetApplyMorph ( m_pTokenizer->GetMorphFlag() );
int iLastBlendedStart = TrackBlendedStart ( m_pTokenizer, iBlendedHitsStart, m_tHits.Length() );
- if ( !bPayload )
+ if ( !bPayload && bAdvancePos)
{
HITMAN::AddPos ( &m_tState.m_iHitPos, m_tState.m_iBuildLastStep + m_pTokenizer->GetOvershortCount()*m_iOvershortStep );
if ( m_pTokenizer->GetBoundary() )
@@ -26311,7 +26623,7 @@ void CSphSource_Document::BuildRegularHits ( SphDocID_t uDocid, bool bPayload, b
if ( bGlobalPartialMatch )
{
- int iBytes = strlen ( (const char*)sWord );
+ //int iBytes = strlen ( (const char*)sWord );
memcpy ( sBuf + 1, sWord, iBytes );
sBuf[0] = MAGIC_WORD_HEAD;
sBuf[iBytes+1] = '\0';
@@ -26321,7 +26633,7 @@ void CSphSource_Document::BuildRegularHits ( SphDocID_t uDocid, bool bPayload, b
ESphTokenMorph eMorph = m_pTokenizer->GetTokenMorph();
if ( m_bIndexExactWords && eMorph!=SPH_TOKEN_MORPH_GUESS )
{
- int iBytes = strlen ( (const char*)sWord );
+ //int iBytes = strlen ( (const char*)sWord );
memcpy ( sBuf + 1, sWord, iBytes );
sBuf[0] = MAGIC_WORD_HEAD_NONSTEMMED;
sBuf[iBytes+1] = '\0';
@@ -26357,6 +26669,27 @@ void CSphSource_Document::BuildRegularHits ( SphDocID_t uDocid, bool bPayload, b
m_tHits.AddHit ( uDocid, m_pDict->GetWordIDNonStemmed ( sBuf ), m_tState.m_iHitPos );
} else
m_tState.m_iBuildLastStep = m_iStopwordStep;
+#if USE_MMSEG
+ // works only when mmseg is on.
+ // zh_cn only GetThesaurus
+ {
+ int iBytes = strlen ( (const char*)sWord );
+ const BYTE* tbuf_ptr = m_pTokenizer->GetThesaurus(sWord, iBytes);
+ if(tbuf_ptr) {
+ while(*tbuf_ptr) {
+ size_t len = strlen((const char*)tbuf_ptr);
+ SphWordID_t iWord = m_pDict->GetWordID ( tbuf_ptr ,len , true);
+ if ( iWord ) {
+ m_tHits.AddHit ( uDocid, iWord, m_tState.m_iHitPos );
+ // mmseg; do not inc step for we are in 'one' hit.
+ //m_tState.m_iBuildLastStep = m_pTokenizer->TokenIsBlended() ? 0 : 1;
+ }
+ tbuf_ptr += len + 1; //move next
+ }
+ }
+ //end if buf
+ }//end GetThesaurus
+#endif
}
m_tState.m_bProcessingHits = ( sWord!=NULL );
diff --git a/src/sphinx.h b/src/sphinx.h
index 7550ed1..a1579dd 100644
--- a/src/sphinx.h
+++ b/src/sphinx.h
@@ -28,6 +28,7 @@
#define USE_RE2 0 /// whether to compile RE2 support
#define USE_RLP 0 /// whether to compile RLP support
#define USE_WINDOWS 1 /// whether to compile for Windows
+ #define USE_MMSEG 1 /// enable mmseg
#define USE_SYSLOG 0 /// whether to use syslog for logging
#define UNALIGNED_RAM_ACCESS 1
@@ -200,7 +201,7 @@ inline const DWORD * STATIC2DOCINFO ( const DWORD * pAttrs ) { return STATIC2DOC
#endif
#define SPHINX_VERSION "2.3.1" SPHINX_BITS_TAG SPHINX_TAG " (" SPH_SVN_TAGREV ")"
-#define SPHINX_BANNER "Sphinx " SPHINX_VERSION "\nCopyright (c) 2001-2015, Andrew Aksyonoff\nCopyright (c) 2008-2015, Sphinx Technologies Inc (http://sphinxsearch.com)\n\n"
+#define SPHINX_BANNER_ORIG "Sphinx " SPHINX_VERSION "\nCopyright (c) 2001-2015, Andrew Aksyonoff\nCopyright (c) 2008-2015, Sphinx Technologies Inc (http://sphinxsearch.com)\n\n"
#define SPHINX_SEARCHD_PROTO 1
#define SPHINX_CLIENT_VERSION 1
@@ -208,6 +209,10 @@ inline const DWORD * STATIC2DOCINFO ( const DWORD * pAttrs ) { return STATIC2DOC
#define SPH_MAX_FILENAME_LEN 512
#define SPH_MAX_FIELDS 256
+#define CORESEEK_BANNER "Coreseek FullText Search 5.1 \nCopyright (c) 2008-2015, Beijing Choice Software Technologies Inc (http://www.coreseek.com)\n\n"
+#define SPHINX_BANNER2 "" CORESEEK_BANNER "" SPHINX_BANNER_ORIG
+#define SPHINX_BANNER SPHINX_BANNER2
+
/////////////////////////////////////////////////////////////////////////////
extern int64_t g_iIndexerCurrentDocID;
@@ -491,7 +496,10 @@ struct CSphTokenizerSettings
CSphString m_sBlendChars;
CSphString m_sBlendMode;
CSphString m_sIndexingPlugin; ///< this tokenizer wants an external plugin to process its raw output
-
+ int m_iDebug; ///< whether the tokenizer is in debug mode.
+#if USE_MMSEG
+ CSphString m_sDictPath; ///< coreseek: where to find the segmenter's dictionary.
+#endif
CSphTokenizerSettings ();
};
@@ -602,11 +610,16 @@ public:
/// get synonym file info
virtual const CSphSavedFile & GetSynFileInfo () const { return m_tSynFileInfo; }
+ /// whether to dump the tokenizer's output for debugging --coreseek mmseg
+ virtual int DumpToken () { return m_tSettings.m_iDebug; }
public:
/// pass next buffer
virtual void SetBuffer ( const BYTE * sBuffer, int iLength ) = 0;
+ /// whether the current buffer is pre-tokenized --coreseek
+ virtual bool IsPreTokenized() { return false; }
+
/// set current index schema (only intended for the token filter plugins)
virtual bool SetFilterSchema ( const CSphSchema &, CSphString & ) { return true; }
@@ -685,6 +698,10 @@ public:
/// set new buffer ptr (must be within current bounds)
virtual void SetBufferPtr ( const char * sNewPtr ) = 0;
+#if USE_MMSEG
+ virtual const BYTE* GetThesaurus(BYTE * , int ) { return NULL; }
+ virtual void ReloadSegDictionary() { return; } // reload mmseg's dictionary.
+#endif
/// get settings hash
virtual uint64_t GetSettingsFNV () const;
@@ -709,6 +726,9 @@ protected:
CSphLowercaser m_tLC; ///< my lowercaser
int m_iLastTokenLen; ///< last token length, in codepoints
bool m_bTokenBoundary; ///< last token boundary flag (true after boundary codepoint followed by separator)
+#if USE_MMSEG
+ int m_iLastTokenBufferLen; ///< last token buffer length, in bytes -- coreseek; used by the mmseg patch.
+#endif
bool m_bBoundary; ///< boundary flag (true immediately after boundary codepoint)
int m_iBoundaryOffset; ///< boundary character offset (in bytes)
bool m_bWasSpecial; ///< special token flag
@@ -1814,6 +1834,7 @@ struct CSphSourceSettings
int m_iStopwordStep; ///< position step on stopword token (default is 1)
bool m_bIndexSP; ///< whether to index sentence and paragraph delimiters
bool m_bIndexFieldLens; ///< whether to index field lengths
+ int m_bDebugDump; ///< mmseg charset debug output feature
CSphVector<CSphString> m_dPrefixFields; ///< list of prefix fields
CSphVector<CSphString> m_dInfixFields; ///< list of infix fields
diff --git a/src/sphinxutils.cpp b/src/sphinxutils.cpp
index a9e5287..644a43b 100644
--- a/src/sphinxutils.cpp
+++ b/src/sphinxutils.cpp
@@ -407,6 +407,8 @@ static KeyDesc_t g_dKeysIndex[] =
{ "min_word_len", 0, NULL },
{ "charset_type", KEY_REMOVED, NULL },
{ "charset_table", 0, NULL },
+ { "charset_dictpath", 0, NULL }, //coreseek: mmseg's dictionary path
+ { "charset_debug", 0, NULL }, //coreseek: debug output tokens
{ "ignore_chars", 0, NULL },
{ "min_prefix_len", 0, NULL },
{ "min_infix_len", 0, NULL },
@@ -1142,7 +1144,10 @@ void sphConfTokenizer ( const CSphConfigSection & hIndex, CSphTokenizerSettings
{
tSettings.m_iNgramLen = Max ( hIndex.GetInt ( "ngram_len" ), 0 );
- if ( hIndex ( "ngram_chars" ) )
+ if(hIndex("charset_debug"))
+ tSettings.m_iDebug = hIndex["charset_debug"].intval();
+
+ if ( hIndex ( "ngram_chars" ) )
{
if ( tSettings.m_iNgramLen )
tSettings.m_iType = TOKENIZER_NGRAM;
@@ -1150,6 +1155,15 @@ void sphConfTokenizer ( const CSphConfigSection & hIndex, CSphTokenizerSettings
sphWarning ( "ngram_chars specified, but ngram_len=0; IGNORED" );
}
+#if USE_MMSEG
+ //XXX: FIXME: sphinx changed the tokenizer creation process
+ if (hIndex("charset_dictpath") && CSphString(hIndex.GetStr("charset_type")) =="zh_cn.utf-8" )
+ {
+ tSettings.m_sDictPath = hIndex.GetStr("charset_dictpath");
+ tSettings.m_iType = TOKENIZER_ZHCN_UTF8;
+ }
+#endif
+
tSettings.m_sCaseFolding = hIndex.GetStr ( "charset_table" );
tSettings.m_iMinWordLen = Max ( hIndex.GetInt ( "min_word_len", 1 ), 1 );
tSettings.m_sNgramChars = hIndex.GetStr ( "ngram_chars" );
@@ -1283,6 +1297,7 @@ bool sphConfIndex ( const CSphConfigSection & hIndex, CSphIndexSettings & tSetti
tSettings.m_iEmbeddedLimit = hIndex.GetSize ( "embedded_limit", 16384 );
tSettings.m_bIndexFieldLens = hIndex.GetInt ( "index_field_lengths" )!=0;
tSettings.m_sIndexTokenFilter = hIndex.GetStr ( "index_token_filter" );
+ tSettings.m_bDebugDump = hIndex.GetInt ( "charset_debug" )!=0;
// prefix/infix fields
CSphString sFields;
@@ -1583,12 +1598,12 @@ const char * sphLoadConfig ( const char * sOptConfig, bool bQuiet, CSphConfigPar
while ( !sOptConfig )
{
#ifdef SYSCONFDIR
- sOptConfig = SYSCONFDIR "/sphinx.conf";
+ sOptConfig = SYSCONFDIR "/csft.conf";
if ( sphIsReadable ( sOptConfig ) )
break;
#endif
- sOptConfig = "./sphinx.conf";
+ sOptConfig = "./csft.conf";
if ( sphIsReadable ( sOptConfig ) )
break;
@@ -1599,9 +1614,9 @@ const char * sphLoadConfig ( const char * sOptConfig, bool bQuiet, CSphConfigPar
if ( !sOptConfig )
sphDie ( "no readable config file (looked in "
#ifdef SYSCONFDIR
- SYSCONFDIR "/sphinx.conf, "
+ SYSCONFDIR "/csft.conf, "
#endif
- "./sphinx.conf)" );
+ "./csft.conf)" );
if ( !bQuiet )
fprintf ( stdout, "using config file '%s'...\n", sOptConfig );
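For reference, a minimal index section exercising the two keys registered above; paths are placeholders, and charset_dictpath conventionally points at the directory holding mmseg's uni.lib dictionary. Note that stock 2.3 marks charset_type as KEY_REMOVED, so that key may draw a deprecation warning even though this patch still reads it:

    index cn_example
    {
        source           = src_cn
        path             = /var/data/cn_example
        charset_type     = zh_cn.utf-8
        charset_dictpath = /usr/local/mmseg3/etc
        charset_debug    = 1
    }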
diff --git a/src/sphinxutils.h b/src/sphinxutils.h
index 9374980..a350b42 100644
--- a/src/sphinxutils.h
+++ b/src/sphinxutils.h
@@ -146,6 +146,7 @@ enum
// where was TOKENIZER_SBCS=1 once
TOKENIZER_UTF8 = 2,
TOKENIZER_NGRAM = 3
+ , TOKENIZER_ZHCN_UTF8 = 4
};
/// load config file
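(What follows is a second copy of the patch, rebased onto a newer Sphinx tree: SPHINX_VERSION is "2.3.1" in the src/sphinx.h hunk above, while the copy below targets SPHINX_VERSION_NUMBERS "2.3.2". The hunks largely mirror the ones above.)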
diff --git a/.gitignore b/.gitignore
index 6701f58..43d9f68 100644
--- a/.gitignore
+++ b/.gitignore
@@ -61,3 +61,10 @@
/test/ql/data/*.lock
/test/ql/*.class
/test/ql/*.exe
+
+# for qt-creator
+/*.user
+
+# for patch
+*.rej
+*.orig
diff --git a/acinclude.m4 b/acinclude.m4
index e09697e..3ae78b0 100644
--- a/acinclude.m4
+++ b/acinclude.m4
@@ -297,6 +297,95 @@ ERROR: cannot find PostgreSQL libraries. If you want to compile with PosgregSQL
fi
])
+dnl ---------------------------------------------------------------------------
+dnl Macro: AC_CHECK_MMSEG
+dnl ---------------------------------------------------------------------------
+
+AC_DEFUN([AC_CHECK_MMSEG],[
+
+if test [ -z "$MMSEG_CFLAGS" ]
+then
+ for CANDIDATE in "$user_mmseg_includes" "/usr/local/include/mmseg" "/usr/include/mmseg"
+ do
+ if test [ -n "$CANDIDATE" -a -r "$CANDIDATE/Segmenter.h" ]
+ then
+ MMSEG_CFLAGS="-I$CANDIDATE"
+ break
+ fi
+ done
+fi
+
+# explicit overrides will be applied later
+if test [ -z "$MMSEG_LIBS" ]
+then
+ for CANDIDATE in "$user_mmseg_libs" "/usr/lib64" \
+ "/usr/local/lib" "/usr/local/mmseg/lib" \
+ "/usr/local/lib/mmseg" "/usr/lib" \
+ "/opt/mmseg/lib"
+ do
+ if test [ -n "$CANDIDATE" -a -d "$CANDIDATE" ]
+ then
+ MMSEG_LIBS="-L$CANDIDATE -lmmseg"
+ break
+ fi
+ done
+fi
+
+# apply explicit include path overrides
+AC_ARG_WITH([mmseg-includes],
+ AC_HELP_STRING([--with-mmseg-includes], [path to libmmseg header files]),
+ [ac_cv_mmseg_includes=$withval])
+if test [ -n "$ac_cv_mmseg_includes" ]
+then
+ MMSEG_CFLAGS="-I$ac_cv_mmseg_includes"
+fi
+
+
+# apply explicit lib path overrides
+AC_ARG_WITH([mmseg-libs],
+ AC_HELP_STRING([--with-mmseg-libs], [path to libmmseg libraries]),
+ [ac_cv_mmseg_libs=$withval])
+if test [ -n "$ac_cv_mmseg_libs" ]
+then
+ # Trim trailing '.libs' if user passed it in --with-mmseg-libs option
+ ac_cv_mmseg_libs=`echo ${ac_cv_mmseg_libs} | sed -e 's/.libs$//' \
+ -e 's+.libs/$++'`
+ MMSEG_LIBS="-L$ac_cv_mmseg_libs -lmmseg"
+fi
+
+# now that we did all we could, perform final checks
+AC_MSG_CHECKING([libmmseg include files])
+if test [ -z "$MMSEG_CFLAGS" ]
+then
+ AC_MSG_ERROR([missing include files.
+
+******************************************************************************
+ERROR: cannot find libmmseg include files.
+
+To disable libmmseg support, use --without-mmseg option.
+******************************************************************************
+])
+else
+ AC_MSG_RESULT([$MMSEG_CFLAGS])
+fi
+
+AC_MSG_CHECKING([libmmseg libraries])
+if test [ -z "$MMSEG_LIBS" ]
+then
+ AC_MSG_ERROR([missing libraries.
+
+******************************************************************************
+ERROR: cannot find libmmseg libraries.
+
+To disable libmmseg support, use --without-mmseg option.
+******************************************************************************
+])
+else
+ AC_MSG_RESULT([$MMSEG_LIBS])
+fi
+
+])
+
dnl ---------------------------------------------------------------------------
dnl Macro: AC_CHECK_LIBSTEMMER
dnl Check the libstemmer first in custom include path in --with-libstemmer=*
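Note that the macro above only walks its hard-coded candidate paths while MMSEG_CFLAGS and MMSEG_LIBS are empty, so both can be pre-seeded from the environment instead of patched in; an example with placeholder paths:

    MMSEG_CFLAGS="-I/opt/mmseg/include" \
    MMSEG_LIBS="-L/opt/mmseg/lib -lmmseg" \
    ./configure --with-mmseg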
diff --git a/configure.ac b/configure.ac
index 3962440..97d6914 100644
--- a/configure.ac
+++ b/configure.ac
@@ -69,6 +69,7 @@ fi
AC_PROG_CC
AC_PROG_CXX
+AM_PROG_AR
AC_PROG_RANLIB
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
@@ -439,6 +440,24 @@ else
fi
AM_CONDITIONAL(USE_PGSQL, test x$ac_cv_use_pgsql != xno -o x$ac_cv_use_satic_pgsql != xno )
+dnl ---
+# check if we should compile with libmmseg (mmseg Chinese segmenter) support
+AC_ARG_WITH([mmseg],
+ AC_HELP_STRING([--with-mmseg], [compile with libmmseg Chinese segmenter support (default is enabled)]),
+ [ac_cv_use_mmseg=$withval], [ac_cv_use_mmseg=yes]
+)
+AC_MSG_CHECKING([whether to compile with libmmseg support])
+if test x$ac_cv_use_mmseg != xno; then
+ AC_MSG_RESULT([yes])
+ AC_CHECK_MMSEG([$ac_cv_use_mmseg])
+ AC_DEFINE(USE_MMSEG,1,[Define to 1 if you want to compile with libmmseg support])
+ AC_SUBST([MMSEG_LIBS])
+ AC_SUBST([MMSEG_CFLAGS])
+else
+ AC_MSG_RESULT([no])
+fi
+AM_CONDITIONAL(USE_MMSEG, test x$ac_cv_use_mmseg != xno)
+
# add macports include directory
if (echo $MYSQL_LIBS | grep -q -- -L/opt/local/lib); then
MYSQL_CFLAGS="$MYSQL_CFLAGS -I/opt/local/include"
@@ -493,7 +512,7 @@ AM_CONDITIONAL(USE_INTERNAL_LIBSTEMMER, test x$ac_cv_use_internal_libstemmer !=
dnl ---
# we can now set preprocessor flags for both C and C++ compilers
-CPPFLAGS="$CPPFLAGS $MYSQL_CFLAGS $PGSQL_CFLAGS $LIBSTEMMER_CFLAGS"
+CPPFLAGS="$CPPFLAGS $MYSQL_CFLAGS $PGSQL_CFLAGS $LIBSTEMMER_CFLAGS $MMSEG_CFLAGS"
AC_ARG_WITH([libexpat],
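With the block above in place, mmseg support is enabled by default; a typical build against a libmmseg installed under /usr/local (paths are assumptions) might look like:

    ./configure --with-mmseg \
        --with-mmseg-includes=/usr/local/include/mmseg \
        --with-mmseg-libs=/usr/local/lib
    make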
diff --git a/libstemmer_c/Makefile.am b/libstemmer_c/Makefile.am
index a973921..fb93b5f 100644
--- a/libstemmer_c/Makefile.am
+++ b/libstemmer_c/Makefile.am
@@ -1,3 +1,4 @@
+AUTOMAKE_OPTIONS = subdir-objects
if USE_LIBSTEMMER
noinst_LIBRARIES = libstemmer.a
include $(srcdir)/mkinc.mak
diff --git a/src/Makefile.am b/src/Makefile.am
index d5214c6..be187dd 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -1,3 +1,4 @@
+AUTOMAKE_OPTIONS = subdir-objects
SRC_SPHINX = sphinx.cpp sphinxexcerpt.cpp sphinxquery.cpp \
sphinxsoundex.cpp sphinxmetaphone.cpp sphinxstemen.cpp sphinxstemru.cpp sphinxstemcz.cpp sphinxstemar.cpp \
sphinxutils.cpp sphinxstd.cpp sphinxsort.cpp sphinxexpr.cpp sphinxfilter.cpp \
@@ -31,5 +32,9 @@ RLP_INC =
endif
AM_CPPFLAGS = $(LIBRE2_CFLAGS) $(RLP_INC) -DSYSCONFDIR="\"$(sysconfdir)\"" -DDATADIR="\"$(localstatedir)/data\""
+if USE_MMSEG
+COMMON_LIBS = libsphinx.a $(LIBSTEMMER_LIBS) $(MYSQL_LIBS) $(PGSQL_LIBS) $(LIBRE2_LIBS) $(RLP_LIBS) $(MMSEG_LIBS)
+else
COMMON_LIBS = libsphinx.a $(LIBSTEMMER_LIBS) $(MYSQL_LIBS) $(PGSQL_LIBS) $(LIBRE2_LIBS) $(RLP_LIBS)
+endif
LDADD = $(COMMON_LIBS)
diff --git a/src/indexer.cpp b/src/indexer.cpp
index 3d136ad..07adc4b 100644
--- a/src/indexer.cpp
+++ b/src/indexer.cpp
@@ -1722,7 +1722,7 @@ int main ( int argc, char ** argv )
"\n"
"Options are:\n"
"--config <file>\t\tread configuration from specified file\n"
- "\t\t\t(default is sphinx.conf)\n"
+ "\t\t\t(default is csft.conf)\n"
"--all\t\t\treindex all configured indexes\n"
"--quiet\t\t\tbe quiet, only print errors\n"
"--verbose\t\tverbose indexing issues report\n"
@@ -1751,8 +1751,8 @@ int main ( int argc, char ** argv )
"--keep-attrs\t\tretain attributes from the old index"
"\n"
"Examples:\n"
- "indexer --quiet myidx1\treindex 'myidx1' defined in 'sphinx.conf'\n"
- "indexer --all\t\treindex all indexes defined in 'sphinx.conf'\n" );
+ "indexer --quiet myidx1\treindex 'myidx1' defined in 'csft.conf'\n"
+ "indexer --all\t\treindex all indexes defined in 'csft.conf'\n" );
}
return 1;
diff --git a/src/searchd.cpp b/src/searchd.cpp
index a845add..6ec9d11 100644
--- a/src/searchd.cpp
+++ b/src/searchd.cpp
@@ -18917,7 +18917,7 @@ void ShowHelp ()
"Options are:\n"
"-h, --help\t\tdisplay this help message\n"
"-c, --config <file>\tread configuration from specified file\n"
- "\t\t\t(default is sphinx.conf)\n"
+ "\t\t\t(default is csft.conf)\n"
"--stop\t\t\tsend SIGTERM to currently running searchd\n"
"--stopwait\t\tsend SIGTERM and wait until actual exit\n"
"--status\t\tget ant print status variables\n"
@@ -18954,9 +18954,9 @@ void ShowHelp ()
"--safetrace\t\tonly use system backtrace() call in crash reports\n"
"\n"
"Examples:\n"
- "searchd --config /usr/local/sphinx/etc/sphinx.conf\n"
+ "searchd --config /usr/local/sphinx/etc/csft.conf\n"
#if USE_WINDOWS
- "searchd --install --config c:\\sphinx\\sphinx.conf\n"
+ "searchd --install --config c:\\sphinx\\csft.conf\n"
#endif
);
}
@@ -22508,12 +22508,12 @@ int WINAPI ServiceMain ( int argc, char **argv )
while ( !g_sConfigFile.cstr() )
{
#ifdef SYSCONFDIR
- g_sConfigFile = SYSCONFDIR "/sphinx.conf";
+ g_sConfigFile = SYSCONFDIR "/csft.conf";
if ( sphIsReadable ( g_sConfigFile.cstr () ) )
break;
#endif
- g_sConfigFile = "./sphinx.conf";
+ g_sConfigFile = "./";
if ( sphIsReadable ( g_sConfigFile.cstr () ) )
break;
@@ -22524,9 +22524,9 @@ int WINAPI ServiceMain ( int argc, char **argv )
if ( !g_sConfigFile.cstr () )
sphFatal ( "no readable config file (looked in "
#ifdef SYSCONFDIR
- SYSCONFDIR "/sphinx.conf, "
+ SYSCONFDIR "/csft.conf, "
#endif
- "./sphinx.conf)." );
+ "./csft.conf)." );
sphInfo ( "using config file '%s'...", g_sConfigFile.cstr () );
diff --git a/src/sphinx.cpp b/src/sphinx.cpp
index 97d72c3..7a666da 100644
--- a/src/sphinx.cpp
+++ b/src/sphinx.cpp
@@ -133,6 +133,16 @@
#endif
#endif
+#if ( USE_WINDOWS && USE_MMSEG )
+ #if _DEBUG
+ #pragma comment(linker, "/defaultlib:libcss_d.lib")
+ #else
+ #pragma comment(linker, "/defaultlib:libcss.lib")
+ #endif
+ #pragma message("Automatically linking with libcss.lib")
+ #pragma warning(disable:4530) // for ugly mmseg
+#endif
+
/////////////////////////////////////////////////////////////////////////////
// logf() is not there sometimes (eg. Solaris 9)
@@ -2417,10 +2427,14 @@ class CSphTokenizer_UTF8 : public CSphTokenizerBase2
public:
CSphTokenizer_UTF8 ();
virtual void SetBuffer ( const BYTE * sBuffer, int iLength );
+ virtual bool IsPreTokenized() { return m_bPreTokenized; }
virtual BYTE * GetToken ();
virtual ISphTokenizer * Clone ( ESphTokenizerClone eMode ) const;
virtual int GetCodepointLength ( int iCode ) const;
virtual int GetMaxCodepointLength () const { return m_tLC.GetMaxCodepointLength(); }
+
+protected:
+ bool m_bPreTokenized;
};
@@ -2441,6 +2455,78 @@ protected:
CSphString m_sNgramCharsStr;
};
+#if USE_MMSEG
+
+#include "SegmenterManager.h"
+#include "Segmenter.h"
+
+typedef CSR_Singleton<css::SegmenterManager> SegmenterManagerSingleInstance;
+
+template < bool IS_QUERY >
+class CSphTokenizer_UTF8MMSeg : public CSphTokenizer_UTF8<IS_QUERY>
+{
+public:
+ CSphTokenizer_UTF8MMSeg ();
+ ~CSphTokenizer_UTF8MMSeg() {
+ if(m_seg){
+ SafeDelete ( m_seg );
+ }
+ }
+
+ virtual void SetBuffer ( const BYTE * sBuffer, int iLength );
+ virtual BYTE * GetToken ();
+ virtual ISphTokenizer * Clone ( ESphTokenizerClone eMode ) const;
+ virtual const BYTE* GetThesaurus(BYTE * sBuffer, int iLength );
+ bool IsSegment(const BYTE * pCur);
+
+ CSphTokenizerBase* SetDictPath(const char* path) { m_dictpath = path; return this; }
+
+ virtual const char * GetBufferPtr () const { return (const char *) CSphTokenizer_UTF8MMSeg<IS_QUERY>::m_pCur; }
+
+ virtual const char * GetTokenStart () const { return m_segToken; }
+
+ virtual int GetLastTokenLen () const { return m_iLastTokenLenMMSeg; }
+
+ virtual void ReloadSegDictionary() {
+ if(m_seg){
+ SafeDelete ( m_seg );
+ }
+ m_seg = NULL;
+
+ if(m_mgr) {
+ SegmenterManagerSingleInstance::Free(); // free the pre-existing instance.
+ m_mgr = NULL;
+ }
+ }
+protected:
+ char* m_segToken;
+ size_t m_segoffset;
+ int m_iLastTokenLenMMSeg;
+ BYTE m_sAccumSeg [ 3*SPH_MAX_WORD_LEN+3 ]; ///< folded token accumulator
+ BYTE * m_pAccumSeg; ///< current accumulator position
+ CSphVector<u2> m_tokenlens;
+ int m_tokenpos;
+protected:
+ // virtual bool IsSegment(const BYTE * pCur);
+ CSphString m_dictpath;
+
+ // mmseg related
+ css::Segmenter* m_seg;
+ css::SegmenterManager* m_mgr;
+ css::Segmenter* GetSegmenter(const char* dict_path){
+ int nRet = 0;
+ if(!m_mgr) {
+ m_mgr = SegmenterManagerSingleInstance::Get();
+ if(dict_path)
+ nRet = m_mgr->init(dict_path);
+ }
+ if(nRet == 0 && !m_seg)
+ m_seg = m_mgr->getSegmenter(false);
+ return m_seg;
+ }
+};
+
+#endif
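Together with the sphCreateUTF8ChineseTokenizer factory added in the next hunk, the intended call sequence is roughly the following sketch (dictionary path is a placeholder, error handling omitted, usual Sphinx headers assumed):

    // sketch: push a UTF-8 buffer through the mmseg-backed tokenizer
    ISphTokenizer * pTok = sphCreateUTF8ChineseTokenizer ( "/usr/local/mmseg3/etc" );
    const char sText[] = "中文分词测试";
    pTok->SetBuffer ( (const BYTE *)sText, sizeof(sText)-1 );
    while ( BYTE * sWord = pTok->GetToken() )
        printf ( "%s\n", (const char *)sWord );   // one mmseg segment per call
    SafeDelete ( pTok );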
struct CSphNormalForm
{
@@ -2782,6 +2868,15 @@ ISphTokenizer * sphCreateUTF8NgramTokenizer ()
return new CSphTokenizer_UTF8Ngram<false> ();
}
+#if USE_MMSEG
+ISphTokenizer * sphCreateUTF8ChineseTokenizer ( const char* dict_path )
+{
+ CSphTokenizer_UTF8MMSeg<false>* tokenizer = new CSphTokenizer_UTF8MMSeg<false> ();
+ tokenizer->SetDictPath(dict_path);
+ return tokenizer;
+}
+#endif
+
/////////////////////////////////////////////////////////////////////////////
enum
@@ -3344,6 +3439,7 @@ CSphTokenizerSettings::CSphTokenizerSettings ()
: m_iType ( TOKENIZER_UTF8 )
, m_iMinWordLen ( 1 )
, m_iNgramLen ( 0 )
+ , m_iDebug ( 0 )
{
}
@@ -3355,7 +3451,11 @@ bool LoadTokenizerSettings ( CSphReader & tReader, CSphTokenizerSettings & tSett
return true;
tSettings.m_iType = tReader.GetByte ();
- if ( tSettings.m_iType!=TOKENIZER_UTF8 && tSettings.m_iType!=TOKENIZER_NGRAM )
+#if USE_MMSEG
+ if ( tSettings.m_iType!=TOKENIZER_UTF8 && tSettings.m_iType!=TOKENIZER_NGRAM && tSettings.m_iType!=TOKENIZER_ZHCN_UTF8)
+#else
+ if ( tSettings.m_iType!=TOKENIZER_UTF8 && tSettings.m_iType!=TOKENIZER_NGRAM)
+#endif
{
sWarning = "can't load an old index with SBCS tokenizer";
return false;
@@ -3383,7 +3483,11 @@ bool LoadTokenizerSettings ( CSphReader & tReader, CSphTokenizerSettings & tSett
tSettings.m_sIgnoreChars = tReader.GetString ();
tSettings.m_iNgramLen = tReader.GetDword ();
tSettings.m_sNgramChars = tReader.GetString ();
- if ( uVersion>=15 )
+#if USE_MMSEG
+ //mmseg --coreseek: the mmseg option makes coreseek and sphinx index formats incompatible.
+ tSettings.m_sDictPath = tReader.GetString ();
+#endif
+ if ( uVersion>=15 )
tSettings.m_sBlendChars = tReader.GetString ();
if ( uVersion>=24 )
tSettings.m_sBlendMode = tReader.GetString();
@@ -3414,6 +3518,10 @@ void SaveTokenizerSettings ( CSphWriter & tWriter, ISphTokenizer * pTokenizer, i
tWriter.PutString ( tSettings.m_sIgnoreChars.cstr () );
tWriter.PutDword ( tSettings.m_iNgramLen );
tWriter.PutString ( tSettings.m_sNgramChars.cstr () );
+#if USE_MMSEG
+ // if mmseg is turned off, the index format is compatible again.
+ tWriter.PutString ( tSettings.m_sDictPath.cstr () );
+#endif
tWriter.PutString ( tSettings.m_sBlendChars.cstr () );
tWriter.PutString ( tSettings.m_sBlendMode.cstr () );
}
@@ -3688,6 +3796,9 @@ ISphTokenizer * ISphTokenizer::Create ( const CSphTokenizerSettings & tSettings,
{
case TOKENIZER_UTF8: pTokenizer = sphCreateUTF8Tokenizer (); break;
case TOKENIZER_NGRAM: pTokenizer = sphCreateUTF8NgramTokenizer (); break;
+#if USE_MMSEG
+ case TOKENIZER_ZHCN_UTF8: pTokenizer = sphCreateUTF8ChineseTokenizer(tSettings.m_sDictPath.cstr()); break;
+#endif
default:
sError.SetSprintf ( "failed to create tokenizer (unknown charset type '%d')", tSettings.m_iType );
return NULL;
@@ -4760,7 +4871,24 @@ CSphTokenizer_UTF8<IS_QUERY>::CSphTokenizer_UTF8 ()
{
CSphString sTmp;
SetCaseFolding ( SPHINX_DEFAULT_UTF8_TABLE, sTmp );
- m_bHasBlend = false;
+
+ // BEGIN CJK: there is no case folding here; that should be done in the remote tokenizer.
+ // Here we only make sure CJK characters are kept. --coreseek
+ // 4e00 - 9fff CJK unified ideographs
+ // 3000 - 303f CJK symbols and punctuation
+ // 3040 - 30ff Hiragana/Katagana
+ // ff00 - ffff half/fullwidth forms
+ CSphVector<CSphRemapRange> dRemaps;
+ dRemaps.Add ( CSphRemapRange ( 0x4E00, 0x9FFF, 0x4E00 ) );
+ dRemaps.Add ( CSphRemapRange ( 0xFF10, 0xFFFF, 0xFF10 ) );
+ dRemaps.Add ( CSphRemapRange ( 0x3040, 0x30FF, 0x3040 ) );
+
+ m_tLC.AddRemaps ( dRemaps,
+ FLAG_CODEPOINT_NGRAM ); // !COMMIT support other n-gram lengths than 1
+ // ENDCJK
+ m_bPreTokenized = false; // by default use the original code path.
+
+ m_bHasBlend = false;
}
@@ -4770,10 +4898,29 @@ void CSphTokenizer_UTF8<IS_QUERY>::SetBuffer ( const BYTE * sBuffer, int iLength
// check that old one is over and that new length is sane
assert ( iLength>=0 );
- // set buffer
+ // set buffer
m_pBuffer = sBuffer;
+ // check whether this is a pre-segmented buffer, marked by the prefix 0xFFFA
+ // if true, the bytes should be 0xFFFA, 0x41, [ctx] --coreseek
+ m_bPreTokenized = false;
+ if(iLength > 4)
+ {
+ // there is a ' ' (space, 32) as padding. might not true
+ unsigned char mask[] = {32, 239, 191, 186, 65};
+ unsigned char mask_bare[] = {239, 191, 186, 65};
+ if(strncmp( (const char *)mask, (const char *)sBuffer, 5) == 0) {
+ // 0xFFFA is a magic number; if it appears at the head, mark this buffer as pre-tokenized.
+ m_bPreTokenized = true;
+ m_pBuffer += 5;
+ }else
+ if(strncmp( (const char *)mask_bare, (const char *)sBuffer, 4) == 0) {
+ m_bPreTokenized = true;
+ m_pBuffer += 4;
+ }
+ }
+
m_pBufferMax = sBuffer + iLength;
- m_pCur = sBuffer;
+ m_pCur = m_pBuffer;
m_pTokenStart = m_pTokenEnd = NULL;
m_pBlendStart = m_pBlendEnd = NULL;
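In other words, a data source can mark a field as pre-tokenized by prefixing it with U+FFFA (EF BF BA in UTF-8) followed by 'A' (0x41), optionally preceded by a single space. A hypothetical field built that way, carrying the per-token _x suffix that BuildRegularHits strips later:

    // hypothetical pre-tokenized field value: marker bytes + "_x"-suffixed tokens
    const char sField[] = "\xEF\xBF\xBA" "A" "\xE4\xB8\xAD\xE6\x96\x87_x \xE5\x88\x86\xE8\xAF\x8D_x";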
@@ -4791,7 +4938,7 @@ BYTE * CSphTokenizer_UTF8<IS_QUERY>::GetToken ()
m_bTokenBoundary = false;
m_bWasSynonym = false;
- return m_bHasBlend
+ return m_bHasBlend
? DoGetToken<IS_QUERY,true>()
: DoGetToken<IS_QUERY,false>();
}
@@ -5209,6 +5356,152 @@ BYTE * CSphTokenizer_UTF8Ngram<IS_QUERY>::GetToken ()
assert ( m_iNgramLen==1 );
return CSphTokenizer_UTF8<IS_QUERY>::GetToken ();
}
+//////////////////////////////////////////////////////////////////////////
+#if USE_MMSEG
+//////////////////////////////////////////////////////////////////////////
+template < bool IS_QUERY >
+CSphTokenizer_UTF8MMSeg<IS_QUERY>::CSphTokenizer_UTF8MMSeg ()
+ :CSphTokenizer_UTF8<IS_QUERY>()
+ , m_segoffset(0)
+{
+ // override the charmap
+ CSphVector<CSphRemapRange> dRemaps;
+ dRemaps.Add ( CSphRemapRange ( 0x4E00, 0x9FFF, 0x4E00 ) );
+ dRemaps.Add ( CSphRemapRange ( 0xFF10, 0xFFFF, 0xFF10 ) );
+ dRemaps.Add ( CSphRemapRange ( 0x3040, 0x30FF, 0x3040 ) );
+
+ CSphTokenizer_UTF8<IS_QUERY>::m_tLC.AddRemaps ( dRemaps,
+ FLAG_CODEPOINT_NGRAM | FLAG_CODEPOINT_SPECIAL); // !COMMIT support other n-gram lengths than 1
+ m_pAccumSeg = m_sAccumSeg;
+ //m_iLastTokenBufferLen = 0;
+ m_iLastTokenLenMMSeg = 0;
+
+ m_mgr = NULL;
+ m_seg = NULL;
+ m_tokenlens.Reserve(1024*512); // reserve room for 512K token lengths
+}
+
+template < bool IS_QUERY >
+void CSphTokenizer_UTF8MMSeg<IS_QUERY>::SetBuffer ( const BYTE * sBuffer, int iLength )
+{
+ CSphTokenizer_UTF8<IS_QUERY>::SetBuffer(sBuffer, iLength);
+ css::Segmenter* seg = GetSegmenter(m_dictpath.cstr());
+ if(seg)
+ seg->setBuffer((u1*)CSphTokenizer_UTF8MMSeg<IS_QUERY>::m_pBuffer, iLength);
+ else
+ sphDie ( " Tokenizer initialization failure. " );
+ m_segoffset = 0;
+ m_segToken = (char*)CSphTokenizer_UTF8MMSeg<IS_QUERY>::m_pCur;
+
+ m_tokenlens.Reset();
+ m_tokenpos = 0;
+ {
+ u2 len = 0, symlen = 0;
+ while(1){
+ len = 0;
+ char* tok = (char*)seg->peekToken(len,symlen);
+ if(!tok || !*tok || !len)
+ break;
+ seg->popToken(len);
+
+ m_tokenlens.Add(len);
+ //printf("%*.*s/p ",symlen,symlen,tok);
+ }
+ }
+}
+
+template < bool IS_QUERY >
+bool CSphTokenizer_UTF8MMSeg<IS_QUERY>::IsSegment(const BYTE * pCur)
+{
+ // this code might have a bug, but it will be removed in the next release...
+ size_t offset = pCur - CSphTokenizer_UTF8<IS_QUERY>::m_pBuffer;
+ //if(offset == 0) return false;
+ //printf("pcur: %s\n", pCur);
+
+ //css::Segmenter* seg = GetSegmenter(m_dictpath.cstr()); //TODO fill blank here
+ {
+ u2 len = 0, symlen = 0;
+ while(m_segoffset < offset) {
+ //tok = (const char*)seg->peekToken(len, symlen);
+ //seg->popToken(len);
+ len = m_tokenlens[m_tokenpos];
+ m_tokenpos ++;
+ m_segoffset += len;
+ //printf("tok: %*.*s, len=%d\t ",len,len,tok, len);
+ if(m_tokenpos >= m_tokenlens.GetLength() || len==0){
+ //break?
+ break;
+ }
+ }
+ /*
+ printf("\n");
+ printf("seg_off %d vs off %d\n", m_segoffset, offset);
+ if(m_segoffset != offset)
+ printf("seg_pcur: %s\n", pCur);
+ */
+ return (m_segoffset == offset);
+ } //end if seg
+ return true;
+}
+
+template < bool IS_QUERY >
+BYTE * CSphTokenizer_UTF8MMSeg<IS_QUERY>::GetToken ()
+{
+ //return CSphTokenizer_UTF8<IS_QUERY>::GetToken();
+ m_iLastTokenLenMMSeg = 0;
+ //BYTE* tok = CSphTokenizer_UTF8::GetToken();
+ while(!IsSegment(CSphTokenizer_UTF8<IS_QUERY>::m_pCur) || m_pAccumSeg == m_sAccumSeg)
+ {
+ BYTE* tok = CSphTokenizer_UTF8<IS_QUERY>::GetToken();
+ //printf("utf8_token: %s \t ", tok);
+ if(!tok){
+ m_iLastTokenLenMMSeg = 0;
+ return NULL;
+ }
+
+ int token_buf_len = strlen((const char*)tok);
+
+ if(m_pAccumSeg == m_sAccumSeg)
+ m_segToken = (char*)CSphTokenizer_UTF8<IS_QUERY>::m_pTokenStart;
+
+ if ( (m_pAccumSeg - m_sAccumSeg)<SPH_MAX_WORD_LEN ) {
+ ::memcpy(m_pAccumSeg, tok, token_buf_len);
+ m_pAccumSeg += token_buf_len;
+ m_iLastTokenLenMMSeg += CSphTokenizer_UTF8<IS_QUERY>::GetLastTokenLen();
+ }
+ }
+ {
+ *m_pAccumSeg = 0;
+ //m_iLastTokenBufferLen = m_pAccumSeg - m_sAccumSeg;
+ m_pAccumSeg = m_sAccumSeg;
+
+ return m_sAccumSeg;
+ }
+}
+
+template < bool IS_QUERY >
+ISphTokenizer * CSphTokenizer_UTF8MMSeg<IS_QUERY>::Clone ( ESphTokenizerClone eMode ) const
+{
+ CSphTokenizerBase * pClone;
+ if ( eMode!=SPH_CLONE_INDEX ) {
+ pClone = (new CSphTokenizer_UTF8MMSeg<true>())->SetDictPath(m_dictpath.cstr());
+ }else{
+ pClone = (new CSphTokenizer_UTF8MMSeg<false>())->SetDictPath(m_dictpath.cstr());
+ }
+ pClone->CloneBase ( this, eMode );
+ return pClone;
+}
+
+template < bool IS_QUERY >
+const BYTE* CSphTokenizer_UTF8MMSeg<IS_QUERY>::GetThesaurus(BYTE * sBuffer, int iLength )
+{
+ css::Segmenter* seg = GetSegmenter(m_dictpath.cstr());
+ if(seg)
+ return (const BYTE*)seg->thesaurus((const char*)sBuffer, iLength);
+ return NULL;
+}
+
+#endif
//////////////////////////////////////////////////////////////////////////
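To see how GetToken() above reassembles segments: the CJK remap makes the base UTF-8 tokenizer emit one codepoint per token, m_tokenlens holds the byte lengths of the mmseg cuts recorded in SetBuffer(), and the accumulator is flushed whenever the read position lands on a segment boundary (IsSegment). A hypothetical walk-through; the actual cut depends on the dictionary:

    buffer     : 中华人民共和国            (21 bytes of UTF-8)
    mmseg cuts : 中华 | 人民 | 共和国      (m_tokenlens = 6, 6, 9)
    GetToken() : "中华", "人民", "共和国"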
@@ -8539,6 +8832,7 @@ void CSphIndex::SetupQueryTokenizer()
// create and setup a master copy of query time tokenizer
// that we can then use to create lightweight clones
SafeDelete ( m_pQueryTokenizer );
+ m_pTokenizer->ReloadSegDictionary();
m_pQueryTokenizer = m_pTokenizer->Clone ( SPH_CLONE_QUERY );
sphSetupQueryTokenizer ( m_pQueryTokenizer, IsStarDict(), m_tSettings.m_bIndexExactWords );
}
@@ -24721,6 +25015,7 @@ void CSphSource::Setup ( const CSphSourceSettings & tSettings )
m_bIndexExactWords = tSettings.m_bIndexExactWords;
m_iOvershortStep = Min ( Max ( tSettings.m_iOvershortStep, 0 ), 1 );
m_iStopwordStep = Min ( Max ( tSettings.m_iStopwordStep, 0 ), 1 );
+ m_bDebugDump = tSettings.m_bDebugDump; //coreseek: assign debug charset setting
m_bIndexSP = tSettings.m_bIndexSP;
m_dPrefixFields = tSettings.m_dPrefixFields;
m_dInfixFields = tSettings.m_dInfixFields;
@@ -25333,9 +25628,28 @@ void CSphSource_Document::BuildRegularHits ( SphDocID_t uDocid, bool bPayload, b
while ( ( m_iMaxHits==0 || m_tHits.m_dData.GetLength()+BUILD_REGULAR_HITS_COUNT<m_iMaxHits )
&& ( sWord = m_pTokenizer->GetToken() )!=NULL )
{
+ //FIXME: dump token to console --coreseek
+ //debug dump
+ if(m_pTokenizer->DumpToken()) {
+ printf("%s_x ", sWord); // make the same as pre-tokenized text.
+ }
+
+ // fix sWord if in pre-tokenized mode.
+ int iBytes = strlen ( (const char*)sWord );
+ bool bAdvancePos = true;
+ if(m_pTokenizer->IsPreTokenized()) {
+ // m_tState.m_iHitPos should not be 0; this also guards against input that passes a token without the _x suffix at the very beginning.
+ if(sWord[iBytes-1] != 'x' && m_tState.m_iHitPos)
+ bAdvancePos = false; // not a position-advancing token.
+ sWord[iBytes-2] = '\0'; // change token_x -> token\0x
+ iBytes -= 2; // decrease length
+ }
+
+
int iLastBlendedStart = TrackBlendedStart ( m_pTokenizer, iBlendedHitsStart, m_tHits.Length() );
- if ( !bPayload )
+
+ if ( !bPayload && bAdvancePos)
{
HITMAN::AddPos ( &m_tState.m_iHitPos, m_tState.m_iBuildLastStep + m_pTokenizer->GetOvershortCount()*m_iOvershortStep );
if ( m_pTokenizer->GetBoundary() )
@@ -25347,7 +25661,7 @@ void CSphSource_Document::BuildRegularHits ( SphDocID_t uDocid, bool bPayload, b
if ( bGlobalPartialMatch )
{
- int iBytes = strlen ( (const char*)sWord );
+ //int iBytes = strlen ( (const char*)sWord );
memcpy ( sBuf + 1, sWord, iBytes );
sBuf[0] = MAGIC_WORD_HEAD;
sBuf[iBytes+1] = '\0';
@@ -25357,7 +25671,7 @@ void CSphSource_Document::BuildRegularHits ( SphDocID_t uDocid, bool bPayload, b
ESphTokenMorph eMorph = m_pTokenizer->GetTokenMorph();
if ( m_bIndexExactWords && eMorph!=SPH_TOKEN_MORPH_GUESS )
{
- int iBytes = strlen ( (const char*)sWord );
+ //int iBytes = strlen ( (const char*)sWord );
memcpy ( sBuf + 1, sWord, iBytes );
sBuf[0] = MAGIC_WORD_HEAD_NONSTEMMED;
sBuf[iBytes+1] = '\0';
@@ -25395,6 +25709,27 @@ void CSphSource_Document::BuildRegularHits ( SphDocID_t uDocid, bool bPayload, b
m_tHits.AddHit ( uDocid, m_pDict->GetWordIDNonStemmed ( sBuf ), m_tState.m_iHitPos );
} else
m_tState.m_iBuildLastStep = m_iStopwordStep;
+#if USE_MMSEG
+ // works only when mmseg is on.
+ // GetThesaurus applies to zh_cn only
+ {
+ int iBytes = strlen ( (const char*)sWord );
+ const BYTE* tbuf_ptr = m_pTokenizer->GetThesaurus(sWord, iBytes);
+ if(tbuf_ptr) {
+ while(*tbuf_ptr) {
+ size_t len = strlen((const char*)tbuf_ptr);
+ SphWordID_t iWord = m_pDict->GetWordID ( tbuf_ptr ,len , true);
+ if ( iWord ) {
+ m_tHits.AddHit ( uDocid, iWord, m_tState.m_iHitPos );
+ // mmseg: do not increment the step, since we are still within one hit.
+ //m_tState.m_iBuildLastStep = m_pTokenizer->TokenIsBlended() ? 0 : 1;
+ }
+ tbuf_ptr += len + 1; //move to the next entry
+ }
+ }
+ //end if buf
+ }//end GetThesaurus
+#endif
}
m_tState.m_bProcessingHits = ( sWord!=NULL );
diff --git a/src/sphinx.h b/src/sphinx.h
index 8d033f6..a22ed81 100644
--- a/src/sphinx.h
+++ b/src/sphinx.h
@@ -28,6 +28,7 @@
#define USE_RE2 0 /// whether to compile RE2 support
#define USE_RLP 0 /// whether to compile RLP support
#define USE_WINDOWS 1 /// whether to compile for Windows
+ #define USE_MMSEG 1 /// whether to compile mmseg support
#define USE_SYSLOG 0 /// whether to use syslog for logging
#define HAVE_STRNLEN 1
@@ -212,7 +213,7 @@ inline const DWORD * STATIC2DOCINFO ( const DWORD * pAttrs ) { return STATIC2DOC
#define SPHINX_VERSION_NUMBERS "2.3.2"
#define SPHINX_VERSION SPHINX_VERSION_NUMBERS SPHINX_BITS_TAG SPHINX_TAG " (" SPH_GIT_COMMIT_ID ")"
-#define SPHINX_BANNER "Sphinx " SPHINX_VERSION "\nCopyright (c) 2001-2016, Andrew Aksyonoff\nCopyright (c) 2008-2016, Sphinx Technologies Inc (http://sphinxsearch.com)\n\n"
+#define SPHINX_BANNER_ORIG "Sphinx " SPHINX_VERSION "\nCopyright (c) 2001-2016, Andrew Aksyonoff\nCopyright (c) 2008-2016, Sphinx Technologies Inc (http://sphinxsearch.com)\n\n"
#define SPHINX_SEARCHD_PROTO 1
#define SPHINX_CLIENT_VERSION 1
@@ -220,6 +221,10 @@ inline const DWORD * STATIC2DOCINFO ( const DWORD * pAttrs ) { return STATIC2DOC
#define SPH_MAX_FILENAME_LEN 512
#define SPH_MAX_FIELDS 256
+#define CORESEEK_BANNER "Coreseek FullText Search 5.1 \nCopyright (c) 2008-2015, Beijing Choice Software Technologies Inc (http://www.coreseek.com)\n\n"
+#define SPHINX_BANNER2 "" CORESEEK_BANNER "" SPHINX_BANNER_ORIG
+#define SPHINX_BANNER SPHINX_BANNER2
+
/////////////////////////////////////////////////////////////////////////////
extern int64_t g_iIndexerCurrentDocID;
@@ -497,7 +502,10 @@ struct CSphTokenizerSettings
CSphString m_sBlendChars;
CSphString m_sBlendMode;
CSphString m_sIndexingPlugin; ///< this tokenizer wants an external plugin to process its raw output
-
+ int m_iDebug; ///< whether the tokenizer is in debug mode.
+#if USE_MMSEG
+ CSphString m_sDictPath; ///< coreseek: where to find the segmenter's dictionary.
+#endif
CSphTokenizerSettings ();
};
@@ -597,11 +605,16 @@ public:
/// get synonym file info
virtual const CSphSavedFile & GetSynFileInfo () const { return m_tSynFileInfo; }
+ /// whether to dump the tokenizer's output for debugging --coreseek mmseg
+ virtual int DumpToken () { return m_tSettings.m_iDebug; }
public:
/// pass next buffer
virtual void SetBuffer ( const BYTE * sBuffer, int iLength ) = 0;
+ /// whether the current buffer is pre-tokenized --coreseek
+ virtual bool IsPreTokenized() { return false; }
+
/// set current index schema (only intended for the token filter plugins)
virtual bool SetFilterSchema ( const CSphSchema &, CSphString & ) { return true; }
@@ -679,6 +692,10 @@ public:
/// set new buffer ptr (must be within current bounds)
virtual void SetBufferPtr ( const char * sNewPtr ) = 0;
+#if USE_MMSEG
+ virtual const BYTE* GetThesaurus(BYTE * , int ) { return NULL; }
+ virtual void ReloadSegDictionary() { return; } // reload mmseg's dictionary.
+#endif
/// get settings hash
virtual uint64_t GetSettingsFNV () const;
@@ -701,6 +718,9 @@ protected:
CSphLowercaser m_tLC; ///< my lowercaser
int m_iLastTokenLen; ///< last token length, in codepoints
bool m_bTokenBoundary; ///< last token boundary flag (true after boundary codepoint followed by separator)
+#if USE_MMSEG
+ int m_iLastTokenBufferLen; ///< last token buffer length, in bytes -- coreseek; used by the mmseg patch.
+#endif
bool m_bBoundary; ///< boundary flag (true immediately after boundary codepoint)
int m_iBoundaryOffset; ///< boundary character offset (in bytes)
bool m_bWasSpecial; ///< special token flag
@@ -1820,6 +1840,7 @@ struct CSphSourceSettings
int m_iStopwordStep; ///< position step on stopword token (default is 1)
bool m_bIndexSP; ///< whether to index sentence and paragraph delimiters
bool m_bIndexFieldLens; ///< whether to index field lengths
+ int m_bDebugDump; ///< mmseg charset debug output feature
CSphVector<CSphString> m_dPrefixFields; ///< list of prefix fields
CSphVector<CSphString> m_dInfixFields; ///< list of infix fields
diff --git a/src/sphinxutils.cpp b/src/sphinxutils.cpp
index 7d975c6..7528970 100644
--- a/src/sphinxutils.cpp
+++ b/src/sphinxutils.cpp
@@ -529,6 +529,8 @@ static KeyDesc_t g_dKeysIndex[] =
{ "min_word_len", 0, NULL },
{ "charset_type", KEY_REMOVED, NULL },
{ "charset_table", 0, NULL },
+ { "charset_dictpath", 0, NULL }, //coreseek: mmseg's dictionary path
+ { "charset_debug", 0, NULL }, //coreseek: debug output tokens
{ "ignore_chars", 0, NULL },
{ "min_prefix_len", 0, NULL },
{ "min_infix_len", 0, NULL },
@@ -1267,7 +1269,10 @@ void sphConfTokenizer ( const CSphConfigSection & hIndex, CSphTokenizerSettings
{
tSettings.m_iNgramLen = Max ( hIndex.GetInt ( "ngram_len" ), 0 );
- if ( hIndex ( "ngram_chars" ) )
+ if(hIndex("charset_debug"))
+ tSettings.m_iDebug = hIndex["charset_debug"].intval();
+
+ if ( hIndex ( "ngram_chars" ) )
{
if ( tSettings.m_iNgramLen )
tSettings.m_iType = TOKENIZER_NGRAM;
@@ -1275,6 +1280,15 @@ void sphConfTokenizer ( const CSphConfigSection & hIndex, CSphTokenizerSettings
sphWarning ( "ngram_chars specified, but ngram_len=0; IGNORED" );
}
+#if USE_MMSEG
+ //XXX: FIXME: sphinx changed the tokenizer creation process
+ if (hIndex("charset_dictpath") && CSphString(hIndex.GetStr("charset_type")) =="zh_cn.utf-8" )
+ {
+ tSettings.m_sDictPath = hIndex.GetStr("charset_dictpath");
+ tSettings.m_iType = TOKENIZER_ZHCN_UTF8;
+ }
+#endif
+
tSettings.m_sCaseFolding = hIndex.GetStr ( "charset_table" );
tSettings.m_iMinWordLen = Max ( hIndex.GetInt ( "min_word_len", 1 ), 1 );
tSettings.m_sNgramChars = hIndex.GetStr ( "ngram_chars" );
@@ -1408,6 +1422,7 @@ bool sphConfIndex ( const CSphConfigSection & hIndex, CSphIndexSettings & tSetti
tSettings.m_iEmbeddedLimit = hIndex.GetSize ( "embedded_limit", 16384 );
tSettings.m_bIndexFieldLens = hIndex.GetInt ( "index_field_lengths" )!=0;
tSettings.m_sIndexTokenFilter = hIndex.GetStr ( "index_token_filter" );
+ tSettings.m_bDebugDump = hIndex.GetInt ( "charset_debug" )!=0;
// prefix/infix fields
CSphString sFields;
@@ -1715,12 +1730,12 @@ const char * sphLoadConfig ( const char * sOptConfig, bool bQuiet, CSphConfigPar
while ( !sOptConfig )
{
#ifdef SYSCONFDIR
- sOptConfig = SYSCONFDIR "/sphinx.conf";
+ sOptConfig = SYSCONFDIR "/csft.conf";
if ( sphIsReadable ( sOptConfig ) )
break;
#endif
- sOptConfig = "./sphinx.conf";
+ sOptConfig = "./csft.conf";
if ( sphIsReadable ( sOptConfig ) )
break;
@@ -1731,9 +1746,9 @@ const char * sphLoadConfig ( const char * sOptConfig, bool bQuiet, CSphConfigPar
if ( !sOptConfig )
sphDie ( "no readable config file (looked in "
#ifdef SYSCONFDIR
- SYSCONFDIR "/sphinx.conf, "
+ SYSCONFDIR "/csft.conf, "
#endif
- "./sphinx.conf)" );
+ "./csft.conf)" );
if ( !bQuiet )
fprintf ( stdout, "using config file '%s'...\n", sOptConfig );
diff --git a/src/sphinxutils.h b/src/sphinxutils.h
index 776386c..1221a82 100644
--- a/src/sphinxutils.h
+++ b/src/sphinxutils.h
@@ -147,6 +147,7 @@ enum
// where was TOKENIZER_SBCS=1 once
TOKENIZER_UTF8 = 2,
TOKENIZER_NGRAM = 3
+ , TOKENIZER_ZHCN_UTF8 = 4
};
/// load config file