Created
September 21, 2017 21:44
-
-
Save fffonion/5b2b561efdde56abcfa61102f67385bb to your computer and use it in GitHub Desktop.
Coreseek 5.1 patch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
From c7db4c68750b17532e78da313c5b5d25a1d00a3e Mon Sep 17 00:00:00 2001 | |
From: nzinfo <limn@coreseek.com> | |
Date: Thu, 13 Aug 2015 23:16:20 +0800 | |
Subject: [PATCH] add mmseg support, begin fix pre-token | |
--- | |
acinclude.m4 | 89 ++++++++++++++ | |
configure.ac | 21 +++- | |
src/Makefile.am | 4 + | |
src/sphinx.cpp | 347 ++++++++++++++++++++++++++++++++++++++++++++++++++-- | |
src/sphinx.h | 19 ++- | |
src/sphinxutils.cpp | 17 ++- | |
src/sphinxutils.h | 1 + | |
7 files changed, 486 insertions(+), 12 deletions(-) | |
diff --git a/acinclude.m4 b/acinclude.m4 | |
index e09697ea..3ae78b01 100644 | |
--- a/acinclude.m4 | |
+++ b/acinclude.m4 | |
@@ -298,6 +298,95 @@ fi | |
]) | |
dnl --------------------------------------------------------------------------- | |
+dnl Macro: AC_CHECK_MMSEG | |
+dnl --------------------------------------------------------------------------- | |
+ | |
+AC_DEFUN([AC_CHECK_MMSEG],[ | |
+ | |
+if test [ -z "$MMSEG_CFLAGS" ] | |
+then | |
+ for CANDIDATE in "$user_mmseg_includes" "/usr/local/include/mmseg" "/usr/include/mmseg" | |
+ do | |
+ if test [ -n "$CANDIDATE" -a -r "$CANDIDATE/Segmenter.h" ] | |
+ then | |
+ MMSEG_CFLAGS="-I$CANDIDATE" | |
+ break | |
+ fi | |
+ done | |
+fi | |
+ | |
+# explicit overrides will be applied later | |
+if test [ -z "$MMSEG_LIBS" ] | |
+then | |
+ for CANDIDATE in "$user_mmseg_libs" "/usr/lib64" \ | |
+ "/usr/local/lib" "/usr/local/mmseg/lib" \ | |
+ "/usr/local/lib/mmseg" "/usr/lib" \ | |
+ "/opt/mmseg/lib" | |
+ do | |
+ if test [ -n "$CANDIDATE" -a -d "$CANDIDATE" ] | |
+ then | |
+ MMSEG_LIBS="-L$CANDIDATE -lmmseg" | |
+ break | |
+ fi | |
+ done | |
+fi | |
+ | |
+# apply explicit include path overrides | |
+AC_ARG_WITH([mmseg-includes], | |
+ AC_HELP_STRING([--with-mmseg-includes], [path to libmmseg header files]), | |
+ [ac_cv_mmseg_includes=$withval]) | |
+if test [ -n "$ac_cv_mmseg_includes" ] | |
+then | |
+ MMSEG_CFLAGS="-I$ac_cv_mmseg_includes" | |
+fi | |
+ | |
+ | |
+# apply explicit lib path overrides | |
+AC_ARG_WITH([mmseg-libs], | |
+ AC_HELP_STRING([--with-mmseg-libs], [path to libmmseg libraries]), | |
+ [ac_cv_mmseg_libs=$withval]) | |
+if test [ -n "$ac_cv_mmseg_libs" ] | |
+then | |
+ # Trim trailing '.libs' if user passed it in --with-mmseg-libs option | |
+ ac_cv_mmseg_libs=`echo ${ac_cv_mmseg_libs} | sed -e 's/.libs$//' \ | |
+ -e 's+.libs/$++'` | |
+ MMSEG_LIBS="-L$ac_cv_mmseg_libs -lmmseg" | |
+fi | |
+ | |
+# now that we did all we could, perform final checks | |
+AC_MSG_CHECKING([libmmseg include files]) | |
+if test [ -z "$MMSEG_CFLAGS" ] | |
+then | |
+ AC_MSG_ERROR([missing include files. | |
+ | |
+****************************************************************************** | |
+ERROR: cannot find libmmseg include files. | |
+ | |
+To disable libmmseg support, use --without-mmseg option. | |
+****************************************************************************** | |
+]) | |
+else | |
+ AC_MSG_RESULT([$MMSEG_CFLAGS]) | |
+fi | |
+ | |
+AC_MSG_CHECKING([libmmseg libraries]) | |
+if test [ -z "$MMSEG_LIBS" ] | |
+then | |
+ AC_MSG_ERROR([missing libraries. | |
+ | |
+****************************************************************************** | |
+ERROR: cannot find libmmseg libraries. | |
+ | |
+To disable libmmseg support, use --without-mmseg option. | |
+****************************************************************************** | |
+]) | |
+else | |
+ AC_MSG_RESULT([$MMSEG_LIBS]) | |
+fi | |
+ | |
+]) | |
+ | |
+dnl --------------------------------------------------------------------------- | |
dnl Macro: AC_CHECK_LIBSTEMMER | |
dnl Check the libstemmer first in custom include path in --with-libstemmer=* | |
dnl If not given, try to guess common shared libs, and finally fall back into | |
diff --git a/configure.ac b/configure.ac | |
index d56fbd95..e08dc886 100644 | |
--- a/configure.ac | |
+++ b/configure.ac | |
@@ -69,6 +69,7 @@ fi | |
AC_PROG_CC | |
AC_PROG_CXX | |
+AM_PROG_AR | |
AC_PROG_RANLIB | |
AC_COMPILE_IFELSE([AC_LANG_SOURCE([ | |
@@ -429,6 +430,24 @@ else | |
fi | |
AM_CONDITIONAL(USE_PGSQL, test x$ac_cv_use_pgsql != xno -o x$ac_cv_use_satic_pgsql != xno ) | |
+dnl --- | |
+# check if we should compile with libmmseg (a mmseg Chinese Segmenter) support | |
+AC_ARG_WITH([mmseg], | |
+ AC_HELP_STRING([--with-mmseg], [compile with libmmseg, a mmseg Chinese Segmenter support (default is enabled)]), | |
+ [ac_cv_use_mmseg=$withval], [ac_cv_use_mmseg=yes] | |
+) | |
+AC_MSG_CHECKING([whether to compile with libmmseg support]) | |
+if test x$ac_cv_use_mmseg != xno; then | |
+ AC_MSG_RESULT([yes]) | |
+ AC_CHECK_MMSEG([$ac_cv_use_mmseg]) | |
+ AC_DEFINE(USE_MMSEG,1,[Define to 1 if you want to compile with libmmseg support]) | |
+ AC_SUBST([MMSEG_LIBS]) | |
+ AC_SUBST([MMSEG_CFLAGS]) | |
+else | |
+ AC_MSG_RESULT([no]) | |
+fi | |
+AM_CONDITIONAL(USE_MMSEG, test x$ac_cv_use_mmseg != xno) | |
+ | |
# add macports include directory | |
if (echo $MYSQL_LIBS | grep -q -- -L/opt/local/lib); then | |
MYSQL_CFLAGS="$MYSQL_CFLAGS -I/opt/local/include" | |
@@ -483,7 +502,7 @@ AM_CONDITIONAL(USE_INTERNAL_LIBSTEMMER, test x$ac_cv_use_internal_libstemmer != | |
dnl --- | |
# we can now set preprocessor flags for both C and C++ compilers | |
-CPPFLAGS="$CPPFLAGS $MYSQL_CFLAGS $PGSQL_CFLAGS $LIBSTEMMER_CFLAGS" | |
+CPPFLAGS="$CPPFLAGS $MYSQL_CFLAGS $PGSQL_CFLAGS $LIBSTEMMER_CFLAGS $MMSEG_CFLAGS" | |
AC_ARG_WITH([libexpat], | |
diff --git a/src/Makefile.am b/src/Makefile.am | |
index 3129f594..8e696075 100644 | |
--- a/src/Makefile.am | |
+++ b/src/Makefile.am | |
@@ -30,5 +30,9 @@ RLP_INC = | |
endif | |
AM_CPPFLAGS = $(LIBRE2_CFLAGS) $(RLP_INC) -DSYSCONFDIR="\"$(sysconfdir)\"" -DDATADIR="\"$(localstatedir)/data\"" | |
+if USE_MMSEG | |
+COMMON_LIBS = libsphinx.a $(LIBSTEMMER_LIBS) $(MYSQL_LIBS) $(PGSQL_LIBS) $(LIBRE2_LIBS) $(RLP_LIBS) $(MMSEG_LIBS) | |
+else | |
COMMON_LIBS = libsphinx.a $(LIBSTEMMER_LIBS) $(MYSQL_LIBS) $(PGSQL_LIBS) $(LIBRE2_LIBS) $(RLP_LIBS) | |
+endif | |
LDADD = $(COMMON_LIBS) | |
diff --git a/src/sphinx.cpp b/src/sphinx.cpp | |
index 6c4a4097..70dceaf6 100644 | |
--- a/src/sphinx.cpp | |
+++ b/src/sphinx.cpp | |
@@ -141,6 +141,16 @@ | |
#pragma message("Automatically linking with btutils.lib") | |
#endif | |
+#if ( USE_WINDOWS && USE_MMSEG ) | |
+ #if _DEBUG | |
+ #pragma comment(linker, "/defaultlib:libcss_d.lib") | |
+ #else | |
+ #pragma comment(linker, "/defaultlib:libcss.lib") | |
+ #endif | |
+ #pragma message("Automatically linking with libcss.lib") | |
+ #pragma warning(disable:4530) // for ugly mmseg | |
+#endif | |
+ | |
///////////////////////////////////////////////////////////////////////////// | |
// logf() is not there sometimes (eg. Solaris 9) | |
@@ -2552,10 +2562,14 @@ class CSphTokenizer_UTF8 : public CSphTokenizerBase2 | |
public: | |
CSphTokenizer_UTF8 (); | |
virtual void SetBuffer ( const BYTE * sBuffer, int iLength ); | |
+ virtual bool IsPreTokenized() { return m_bPreTokenized; } | |
virtual BYTE * GetToken (); | |
virtual ISphTokenizer * Clone ( ESphTokenizerClone eMode ) const; | |
virtual int GetCodepointLength ( int iCode ) const; | |
virtual int GetMaxCodepointLength () const { return m_tLC.GetMaxCodepointLength(); } | |
+ | |
+protected: | |
+ bool m_bPreTokenized; | |
}; | |
@@ -2576,6 +2590,78 @@ class CSphTokenizer_UTF8Ngram : public CSphTokenizer_UTF8<IS_QUERY> | |
CSphString m_sNgramCharsStr; | |
}; | |
+#if USE_MMSEG | |
+ | |
+#include "SegmenterManager.h" | |
+#include "Segmenter.h" | |
+ | |
+typedef CSR_Singleton<css::SegmenterManager> SegmenterManagerSingleInstance; | |
+ | |
+template < bool IS_QUERY > | |
+class CSphTokenizer_UTF8MMSeg : public CSphTokenizer_UTF8<IS_QUERY> | |
+{ | |
+public: | |
+ CSphTokenizer_UTF8MMSeg (); | |
+ ~CSphTokenizer_UTF8MMSeg() { | |
+ if(m_seg){ | |
+ SafeDelete ( m_seg ); | |
+ } | |
+ } | |
+ | |
+ virtual void SetBuffer ( const BYTE * sBuffer, int iLength ); | |
+ virtual BYTE * GetToken (); | |
+ virtual ISphTokenizer * Clone ( ESphTokenizerClone eMode ) const; | |
+ virtual const BYTE* GetThesaurus(BYTE * sBuffer, int iLength ); | |
+ bool IsSegment(const BYTE * pCur); | |
+ | |
+ CSphTokenizerBase* SetDictPath(const char* path) { m_dictpath = path; return this; } | |
+ | |
+ virtual const char * GetBufferPtr () const { return (const char *) CSphTokenizer_UTF8MMSeg<IS_QUERY>::m_pCur; } | |
+ | |
+ virtual const char * GetTokenStart () const { return m_segToken; } | |
+ | |
+ virtual int GetLastTokenLen () const { return m_iLastTokenLenMMSeg; } | |
+ | |
+ virtual void ReloadSegDictionary() { | |
+ if(m_seg){ | |
+ SafeDelete ( m_seg ); | |
+ } | |
+ m_seg = NULL; | |
+ | |
+ if(m_mgr) { | |
+ SegmenterManagerSingleInstance::Free(); // free preexist instance. | |
+ m_mgr = NULL; | |
+ } | |
+ } | |
+protected: | |
+ char* m_segToken; | |
+ size_t m_segoffset; | |
+ int m_iLastTokenLenMMSeg; | |
+ BYTE m_sAccumSeg [ 3*SPH_MAX_WORD_LEN+3 ]; ///< folded token accumulator | |
+ BYTE * m_pAccumSeg; ///< current accumulator position | |
+ CSphVector<u2> m_tokenlens; | |
+ int m_tokenpos; | |
+protected: | |
+ // virtual bool IsSegment(const BYTE * pCur); | |
+ CSphString m_dictpath; | |
+ | |
+ // mmseg related | |
+ css::Segmenter* m_seg; | |
+ css::SegmenterManager* m_mgr; | |
+ css::Segmenter* GetSegmenter(const char* dict_path){ | |
+ int nRet = 0; | |
+ if(!m_mgr) { | |
+ m_mgr = SegmenterManagerSingleInstance::Get(); | |
+ if(dict_path) | |
+ nRet = m_mgr->init(dict_path); | |
+ } | |
+ if(nRet == 0 && !m_seg) | |
+ m_seg = m_mgr->getSegmenter(false); | |
+ return m_seg; | |
+ } | |
+}; | |
+ | |
+#endif | |
struct CSphNormalForm | |
{ | |
@@ -3795,6 +3881,15 @@ ISphTokenizer * sphCreateUTF8NgramTokenizer () | |
return new CSphTokenizer_UTF8Ngram<false> (); | |
} | |
+#if USE_MMSEG | |
+ISphTokenizer * sphCreateUTF8ChineseTokenizer ( const char* dict_path ) | |
+{ | |
+ CSphTokenizer_UTF8MMSeg<false>* tokenizer = new CSphTokenizer_UTF8MMSeg<false> (); | |
+ tokenizer->SetDictPath(dict_path); | |
+ return tokenizer; | |
+} | |
+#endif | |
+ | |
///////////////////////////////////////////////////////////////////////////// | |
enum | |
@@ -4380,6 +4475,7 @@ CSphTokenizerSettings::CSphTokenizerSettings () | |
: m_iType ( TOKENIZER_UTF8 ) | |
, m_iMinWordLen ( 1 ) | |
, m_iNgramLen ( 0 ) | |
+ , m_iDebug ( 0 ) | |
{ | |
} | |
@@ -4391,7 +4487,11 @@ bool LoadTokenizerSettings ( CSphReader & tReader, CSphTokenizerSettings & tSett | |
return true; | |
tSettings.m_iType = tReader.GetByte (); | |
- if ( tSettings.m_iType!=TOKENIZER_UTF8 && tSettings.m_iType!=TOKENIZER_NGRAM ) | |
+#if USE_MMSEG | |
+ if ( tSettings.m_iType!=TOKENIZER_UTF8 && tSettings.m_iType!=TOKENIZER_NGRAM && tSettings.m_iType!=TOKENIZER_ZHCN_UTF8) | |
+#else | |
+ if ( tSettings.m_iType!=TOKENIZER_UTF8 && tSettings.m_iType!=TOKENIZER_NGRAM) | |
+#endif | |
{ | |
sWarning = "can't load an old index with SBCS tokenizer"; | |
return false; | |
@@ -4419,7 +4519,11 @@ bool LoadTokenizerSettings ( CSphReader & tReader, CSphTokenizerSettings & tSett | |
tSettings.m_sIgnoreChars = tReader.GetString (); | |
tSettings.m_iNgramLen = tReader.GetDword (); | |
tSettings.m_sNgramChars = tReader.GetString (); | |
- if ( uVersion>=15 ) | |
+#if USE_MMSEG | |
+ //mmseg --coreseek: the mmseg option makes coreseek's and sphinx's index formats incompatible. | |
+ tSettings.m_sDictPath = tReader.GetString (); | |
+#endif | |
+ if ( uVersion>=15 ) | |
tSettings.m_sBlendChars = tReader.GetString (); | |
if ( uVersion>=24 ) | |
tSettings.m_sBlendMode = tReader.GetString(); | |
@@ -4450,6 +4554,10 @@ void SaveTokenizerSettings ( CSphWriter & tWriter, ISphTokenizer * pTokenizer, i | |
tWriter.PutString ( tSettings.m_sIgnoreChars.cstr () ); | |
tWriter.PutDword ( tSettings.m_iNgramLen ); | |
tWriter.PutString ( tSettings.m_sNgramChars.cstr () ); | |
+#if USE_MMSEG | |
+ // if turn mmseg off, the index(s) are compat again. | |
+ tWriter.PutString ( tSettings.m_sDictPath.cstr () ); | |
+#endif | |
tWriter.PutString ( tSettings.m_sBlendChars.cstr () ); | |
tWriter.PutString ( tSettings.m_sBlendMode.cstr () ); | |
} | |
@@ -4724,6 +4832,9 @@ ISphTokenizer * ISphTokenizer::Create ( const CSphTokenizerSettings & tSettings, | |
{ | |
case TOKENIZER_UTF8: pTokenizer = sphCreateUTF8Tokenizer (); break; | |
case TOKENIZER_NGRAM: pTokenizer = sphCreateUTF8NgramTokenizer (); break; | |
+#if USE_MMSEG | |
+ case TOKENIZER_ZHCN_UTF8: pTokenizer = sphCreateUTF8ChineseTokenizer(tSettings.m_sDictPath.cstr()); break; | |
+#endif | |
default: | |
sError.SetSprintf ( "failed to create tokenizer (unknown charset type '%d')", tSettings.m_iType ); | |
return NULL; | |
@@ -5963,7 +6074,20 @@ CSphTokenizer_UTF8<IS_QUERY>::CSphTokenizer_UTF8 () | |
{ | |
CSphString sTmp; | |
SetCaseFolding ( SPHINX_DEFAULT_UTF8_TABLE, sTmp ); | |
- m_bHasBlend = false; | |
+ | |
+ // BEGIN CJK There is no case folding, should do this in remote tokenizer. | |
+ // Here just make CJK Charactor will remain. --coreseek | |
+ CSphVector<CSphRemapRange> dRemaps; | |
+ dRemaps.Add ( CSphRemapRange ( 0x4e00, 0x9fff, 0x4e00 ) ); | |
+ dRemaps.Add ( CSphRemapRange ( 0xFF00, 0xFFFF, 0xFF00 ) ); | |
+ dRemaps.Add ( CSphRemapRange ( 0x3000, 0x303F, 0x3000 ) ); | |
+ | |
+ m_tLC.AddRemaps ( dRemaps, | |
+ FLAG_CODEPOINT_NGRAM ); // !COMMIT support other n-gram lengths than 1 | |
+ // ENDCJK | |
+ m_bPreTokenized = false; // by default use original route. | |
+ | |
+ m_bHasBlend = false; | |
} | |
@@ -5973,10 +6097,29 @@ void CSphTokenizer_UTF8<IS_QUERY>::SetBuffer ( const BYTE * sBuffer, int iLength | |
// check that old one is over and that new length is sane | |
assert ( iLength>=0 ); | |
- // set buffer | |
+ // set buffer | |
m_pBuffer = sBuffer; | |
+ // check is pre-segment buffer, with prefix 0xFFFA | |
+ // if True, the following should be 0xFFA, 0x41, [ctx] --coreseek | |
+ m_bPreTokenized = false; | |
+ if(iLength > 4) | |
+ { | |
+ // there is a ' ' (space, 32) as padding. might not true | |
+ unsigned char mask[] = {32, 239, 191, 186, 65}; | |
+ unsigned char mask_bare[] = {239, 191, 186, 65}; | |
+ if(strncmp( (const char *)mask, (const char *)sBuffer, 5) == 0) { | |
+ // 0xFFFA is a magic number , if it's in head, mark this buffer pre-tokenized. | |
+ m_bPreTokenized = true; | |
+ m_pBuffer += 5; | |
+ }else | |
+ if(strncmp( (const char *)mask_bare, (const char *)sBuffer, 4) == 0) { | |
+ m_bPreTokenized = true; | |
+ m_pBuffer += 4; | |
+ } | |
+ } | |
+ | |
m_pBufferMax = sBuffer + iLength; | |
- m_pCur = sBuffer; | |
+ m_pCur = m_pBuffer; | |
m_pTokenStart = m_pTokenEnd = NULL; | |
m_pBlendStart = m_pBlendEnd = NULL; | |
@@ -5994,7 +6137,7 @@ BYTE * CSphTokenizer_UTF8<IS_QUERY>::GetToken () | |
m_bTokenBoundary = false; | |
m_bWasSynonym = false; | |
- return m_bHasBlend | |
+ return m_bHasBlend | |
? DoGetToken<IS_QUERY,true>() | |
: DoGetToken<IS_QUERY,false>(); | |
} | |
@@ -6409,6 +6552,152 @@ BYTE * CSphTokenizer_UTF8Ngram<IS_QUERY>::GetToken () | |
assert ( m_iNgramLen==1 ); | |
return CSphTokenizer_UTF8<IS_QUERY>::GetToken (); | |
} | |
+////////////////////////////////////////////////////////////////////////// | |
+#if USE_MMSEG | |
+////////////////////////////////////////////////////////////////////////// | |
+template < bool IS_QUERY > | |
+CSphTokenizer_UTF8MMSeg<IS_QUERY>::CSphTokenizer_UTF8MMSeg () | |
+ :CSphTokenizer_UTF8<IS_QUERY>() | |
+ , m_segoffset(0) | |
+{ | |
+ //over ride charmap | |
+ CSphVector<CSphRemapRange> dRemaps; | |
+ dRemaps.Add ( CSphRemapRange ( 0x4e00, 0x9fff, 0x4e00 ) ); | |
+ dRemaps.Add ( CSphRemapRange ( 0xFF00, 0xFFFF, 0xFF00 ) ); | |
+ dRemaps.Add ( CSphRemapRange ( 0x3000, 0x303F, 0x3000 ) ); | |
+ | |
+ CSphTokenizer_UTF8<IS_QUERY>::m_tLC.AddRemaps ( dRemaps, | |
+ FLAG_CODEPOINT_NGRAM | FLAG_CODEPOINT_SPECIAL); // !COMMIT support other n-gram lengths than 1 | |
+ m_pAccumSeg = m_sAccumSeg; | |
+ //m_iLastTokenBufferLen = 0; | |
+ m_iLastTokenLenMMSeg = 0; | |
+ | |
+ m_mgr = NULL; | |
+ m_seg = NULL; | |
+ m_tokenlens.Reserve(1024*512); // resize to 512K | |
+} | |
+ | |
+template < bool IS_QUERY > | |
+void CSphTokenizer_UTF8MMSeg<IS_QUERY>::SetBuffer ( const BYTE * sBuffer, int iLength ) | |
+{ | |
+ CSphTokenizer_UTF8<IS_QUERY>::SetBuffer(sBuffer, iLength); | |
+ css::Segmenter* seg = GetSegmenter(m_dictpath.cstr()); | |
+ if(seg) | |
+ seg->setBuffer((u1*)CSphTokenizer_UTF8MMSeg<IS_QUERY>::m_pBuffer, iLength); | |
+ else | |
+ sphDie ( " Tokenizer initialization failure. " ); | |
+ m_segoffset = 0; | |
+ m_segToken = (char*)CSphTokenizer_UTF8MMSeg<IS_QUERY>::m_pCur; | |
+ | |
+ m_tokenlens.Reset(); | |
+ m_tokenpos = 0; | |
+ { | |
+ u2 len = 0, symlen = 0; | |
+ while(1){ | |
+ len = 0; | |
+ char* tok = (char*)seg->peekToken(len,symlen); | |
+ if(!tok || !*tok || !len) | |
+ break; | |
+ seg->popToken(len); | |
+ | |
+ m_tokenlens.Add(len); | |
+ //printf("%*.*s/p ",symlen,symlen,tok); | |
+ } | |
+ } | |
+} | |
+ | |
+template < bool IS_QUERY > | |
+bool CSphTokenizer_UTF8MMSeg<IS_QUERY>::IsSegment(const BYTE * pCur) | |
+{ | |
+ // this code might have bug, but as it will removed in next release... | |
+ size_t offset = pCur - CSphTokenizer_UTF8<IS_QUERY>::m_pBuffer; | |
+ //if(offset == 0) return false; | |
+ //printf("pcur: %s\n", pCur); | |
+ | |
+ //css::Segmenter* seg = GetSegmenter(m_dictpath.cstr()); //TODO fill blank here | |
+ { | |
+ u2 len = 0, symlen = 0; | |
+ while(m_segoffset < offset) { | |
+ //tok = (const char*)seg->peekToken(len, symlen); | |
+ //seg->popToken(len); | |
+ len = m_tokenlens[m_tokenpos]; | |
+ m_tokenpos ++; | |
+ m_segoffset += len; | |
+ //printf("tok: %*.*s, len=%d\t ",len,len,tok, len); | |
+ if(m_tokenpos >= m_tokenlens.GetLength() || len==0){ | |
+ //break? | |
+ break; | |
+ } | |
+ } | |
+ /* | |
+ printf("\n"); | |
+ printf("seg_off %d vs off %d\n", m_segoffset, offset); | |
+ if(m_segoffset != offset) | |
+ printf("seg_pcur: %s\n", pCur); | |
+ */ | |
+ return (m_segoffset == offset); | |
+ } //end if seg | |
+ return true; | |
+} | |
+ | |
+template < bool IS_QUERY > | |
+BYTE * CSphTokenizer_UTF8MMSeg<IS_QUERY>::GetToken () | |
+{ | |
+ //return CSphTokenizer_UTF8<IS_QUERY>::GetToken(); | |
+ m_iLastTokenLenMMSeg = 0; | |
+ //BYTE* tok = CSphTokenizer_UTF8::GetToken(); | |
+ while(!IsSegment(CSphTokenizer_UTF8<IS_QUERY>::m_pCur) || m_pAccumSeg == m_sAccumSeg) | |
+ { | |
+ BYTE* tok = CSphTokenizer_UTF8<IS_QUERY>::GetToken(); | |
+ //printf("utf8_token: %s \t ", tok); | |
+ if(!tok){ | |
+ m_iLastTokenLenMMSeg = 0; | |
+ return NULL; | |
+ } | |
+ | |
+ int token_buf_len = strlen((const char*)tok); | |
+ | |
+ if(m_pAccumSeg == m_sAccumSeg) | |
+ m_segToken = (char*)CSphTokenizer_UTF8<IS_QUERY>::m_pTokenStart; | |
+ | |
+ if ( (m_pAccumSeg - m_sAccumSeg)<SPH_MAX_WORD_LEN ) { | |
+ ::memcpy(m_pAccumSeg, tok, token_buf_len); | |
+ m_pAccumSeg += token_buf_len; | |
+ m_iLastTokenLenMMSeg += CSphTokenizer_UTF8<IS_QUERY>::GetLastTokenLen(); | |
+ } | |
+ } | |
+ { | |
+ *m_pAccumSeg = 0; | |
+ //m_iLastTokenBufferLen = m_pAccumSeg - m_sAccumSeg; | |
+ m_pAccumSeg = m_sAccumSeg; | |
+ | |
+ return m_sAccumSeg; | |
+ } | |
+} | |
+ | |
+template < bool IS_QUERY > | |
+ISphTokenizer * CSphTokenizer_UTF8MMSeg<IS_QUERY>::Clone ( ESphTokenizerClone eMode ) const | |
+{ | |
+ CSphTokenizerBase * pClone; | |
+ if ( eMode!=SPH_CLONE_INDEX ) { | |
+ pClone = (new CSphTokenizer_UTF8MMSeg<true>())->SetDictPath(m_dictpath.cstr()); | |
+ }else{ | |
+ pClone = (new CSphTokenizer_UTF8MMSeg<false>())->SetDictPath(m_dictpath.cstr()); | |
+ } | |
+ pClone->CloneBase ( this, eMode ); | |
+ return pClone; | |
+} | |
+ | |
+template < bool IS_QUERY > | |
+const BYTE* CSphTokenizer_UTF8MMSeg<IS_QUERY>::GetThesaurus(BYTE * sBuffer, int iLength ) | |
+{ | |
+ css::Segmenter* seg = GetSegmenter(m_dictpath.cstr()); | |
+ if(seg) | |
+ return (const BYTE*)seg->thesaurus((const char*)sBuffer, iLength); | |
+ return NULL; | |
+} | |
+ | |
+#endif | |
////////////////////////////////////////////////////////////////////////// | |
@@ -9658,6 +9947,7 @@ void CSphIndex::SetupQueryTokenizer() | |
// create and setup a master copy of query time tokenizer | |
// that we can then use to create lightweight clones | |
SafeDelete ( m_pQueryTokenizer ); | |
+ m_pTokenizer->ReloadSegDictionary(); | |
m_pQueryTokenizer = m_pTokenizer->Clone ( SPH_CLONE_QUERY ); | |
if ( IsStarDict() ) | |
{ | |
@@ -25810,6 +26100,7 @@ void CSphSource::Setup ( const CSphSourceSettings & tSettings ) | |
m_bIndexExactWords = tSettings.m_bIndexExactWords; | |
m_iOvershortStep = Min ( Max ( tSettings.m_iOvershortStep, 0 ), 1 ); | |
m_iStopwordStep = Min ( Max ( tSettings.m_iStopwordStep, 0 ), 1 ); | |
+ m_bDebugDump = tSettings.m_bDebugDump; //coreseek: assign debug charset setting | |
m_bIndexSP = tSettings.m_bIndexSP; | |
m_dPrefixFields = tSettings.m_dPrefixFields; | |
m_dInfixFields = tSettings.m_dInfixFields; | |
@@ -26414,11 +26705,28 @@ void CSphSource_Document::BuildRegularHits ( SphDocID_t uDocid, bool bPayload, b | |
while ( ( m_iMaxHits==0 || m_tHits.m_dData.GetLength()+BUILD_REGULAR_HITS_COUNT<m_iMaxHits ) | |
&& ( sWord = m_pTokenizer->GetToken() )!=NULL ) | |
{ | |
+ //FIXME: dump token to console --coreseek | |
+ //debug dump | |
+ if(m_pTokenizer->DumpToken()) { | |
+ printf("%s_x ", sWord); // make the same as pre-tokenized text. | |
+ } | |
+ | |
+ // fix sWord if in pre-tokenized mode. | |
+ int iBytes = strlen ( (const char*)sWord ); | |
+ bool bAdvancePos = true; | |
+ if(m_pTokenizer->IsPreTokenized()) { | |
+ // m_tState.m_iHitPos should not be 0; guards against a token lacking the '_x' suffix at the very beginning. | |
+ if(sWord[iBytes-1] != 'x' && m_tState.m_iHitPos) | |
+ bAdvancePos = false; // not an advance token. | |
+ sWord[iBytes-2] = '\0'; // change token_x -> token\0x | |
+ iBytes -= 2; // decrease length | |
+ } | |
+ | |
m_pDict->SetApplyMorph ( m_pTokenizer->GetMorphFlag() ); | |
int iLastBlendedStart = TrackBlendedStart ( m_pTokenizer, iBlendedHitsStart, m_tHits.Length() ); | |
- if ( !bPayload ) | |
+ if ( !bPayload && bAdvancePos) | |
{ | |
HITMAN::AddPos ( &m_tState.m_iHitPos, m_tState.m_iBuildLastStep + m_pTokenizer->GetOvershortCount()*m_iOvershortStep ); | |
if ( m_pTokenizer->GetBoundary() ) | |
@@ -26430,7 +26738,7 @@ void CSphSource_Document::BuildRegularHits ( SphDocID_t uDocid, bool bPayload, b | |
if ( bGlobalPartialMatch ) | |
{ | |
- int iBytes = strlen ( (const char*)sWord ); | |
+ //int iBytes = strlen ( (const char*)sWord ); | |
memcpy ( sBuf + 1, sWord, iBytes ); | |
sBuf[0] = MAGIC_WORD_HEAD; | |
sBuf[iBytes+1] = '\0'; | |
@@ -26440,7 +26748,7 @@ void CSphSource_Document::BuildRegularHits ( SphDocID_t uDocid, bool bPayload, b | |
ESphTokenMorph eMorph = m_pTokenizer->GetTokenMorph(); | |
if ( m_bIndexExactWords && eMorph!=SPH_TOKEN_MORPH_GUESS ) | |
{ | |
- int iBytes = strlen ( (const char*)sWord ); | |
+ //int iBytes = strlen ( (const char*)sWord ); | |
memcpy ( sBuf + 1, sWord, iBytes ); | |
sBuf[0] = MAGIC_WORD_HEAD_NONSTEMMED; | |
sBuf[iBytes+1] = '\0'; | |
@@ -26476,6 +26784,27 @@ void CSphSource_Document::BuildRegularHits ( SphDocID_t uDocid, bool bPayload, b | |
m_tHits.AddHit ( uDocid, m_pDict->GetWordIDNonStemmed ( sBuf ), m_tState.m_iHitPos ); | |
} else | |
m_tState.m_iBuildLastStep = m_iStopwordStep; | |
+#if USE_MMSEG | |
+ // works only when mmseg is on. | |
+ // zh_cn only GetThesaurus | |
+ { | |
+ int iBytes = strlen ( (const char*)sWord ); | |
+ const BYTE* tbuf_ptr = m_pTokenizer->GetThesaurus(sWord, iBytes); | |
+ if(tbuf_ptr) { | |
+ while(*tbuf_ptr) { | |
+ size_t len = strlen((const char*)tbuf_ptr); | |
+ SphWordID_t iWord = m_pDict->GetWordID ( tbuf_ptr ,len , true); | |
+ if ( iWord ) { | |
+ m_tHits.AddHit ( uDocid, iWord, m_tState.m_iHitPos ); | |
+ // mmseg; do not inc step for we are in 'one' hit. | |
+ //m_tState.m_iBuildLastStep = m_pTokenizer->TokenIsBlended() ? 0 : 1; | |
+ } | |
+ tbuf_ptr += len + 1; //move next | |
+ } | |
+ } | |
+ //end if buf | |
+ }//end GetThesaurus | |
+#endif | |
} | |
m_tState.m_bProcessingHits = ( sWord!=NULL ); | |
diff --git a/src/sphinx.h b/src/sphinx.h | |
index 0e10bae0..160c1677 100644 | |
--- a/src/sphinx.h | |
+++ b/src/sphinx.h | |
@@ -28,6 +28,7 @@ | |
#define USE_RE2 0 /// whether to compile RE2 support | |
#define USE_RLP 0 /// whether to compile RLP support | |
#define USE_WINDOWS 1 /// whether to compile for Windows | |
+ #define USE_MMSEG 1 /// enable mmseg | |
#define USE_SYSLOG 0 /// whether to use syslog for logging | |
#define UNALIGNED_RAM_ACCESS 1 | |
@@ -495,7 +496,10 @@ struct CSphTokenizerSettings | |
CSphString m_sBlendChars; | |
CSphString m_sBlendMode; | |
CSphString m_sIndexingPlugin; ///< this tokenizer wants an external plugin to process its raw output | |
- | |
+ int m_iDebug; ///< is in tokenizer debug mode. | |
+#if USE_MMSEG | |
+ CSphString m_sDictPath; ///coreseek: where to find segmentor's dict. | |
+#endif | |
CSphTokenizerSettings (); | |
}; | |
@@ -606,11 +610,16 @@ class ISphTokenizer | |
/// get synonym file info | |
virtual const CSphSavedFile & GetSynFileInfo () const { return m_tSynFileInfo; } | |
+ /// mark as debug tokenizer's output --coreseek -mmseg | |
+ virtual int DumpToken () { return m_tSettings.m_iDebug; } | |
public: | |
/// pass next buffer | |
virtual void SetBuffer ( const BYTE * sBuffer, int iLength ) = 0; | |
+ /// is pre-tokenized --coreseek | |
+ virtual bool IsPreTokenized() { return false; } | |
+ | |
/// set current index schema (only intended for the token filter plugins) | |
virtual bool SetFilterSchema ( const CSphSchema &, CSphString & ) { return true; } | |
@@ -693,6 +702,10 @@ class ISphTokenizer | |
/// set new buffer ptr (must be within current bounds) | |
virtual void SetBufferPtr ( const char * sNewPtr ) = 0; | |
+#if USE_MMSEG | |
+ virtual const BYTE* GetThesaurus(BYTE * , int ) { return NULL; } | |
+ virtual void ReloadSegDictionary() { return; } // reload mmseg's dictionary. | |
+#endif | |
/// get settings hash | |
virtual uint64_t GetSettingsFNV () const; | |
@@ -717,6 +730,9 @@ class ISphTokenizer | |
CSphLowercaser m_tLC; ///< my lowercaser | |
int m_iLastTokenLen; ///< last token length, in codepoints | |
bool m_bTokenBoundary; ///< last token boundary flag (true after boundary codepoint followed by separator) | |
+#if USE_MMSEG | |
+ int m_iLastTokenBufferLen; ///< the buffer length -- coreseek; use in mmseg patch. | |
+#endif | |
bool m_bBoundary; ///< boundary flag (true immediately after boundary codepoint) | |
int m_iBoundaryOffset; ///< boundary character offset (in bytes) | |
bool m_bWasSpecial; ///< special token flag | |
@@ -1822,6 +1838,7 @@ struct CSphSourceSettings | |
int m_iStopwordStep; ///< position step on stopword token (default is 1) | |
bool m_bIndexSP; ///< whether to index sentence and paragraph delimiters | |
bool m_bIndexFieldLens; ///< whether to index field lengths | |
+ int m_bDebugDump; ///< mmseg charset debug output feature | |
CSphVector<CSphString> m_dPrefixFields; ///< list of prefix fields | |
CSphVector<CSphString> m_dInfixFields; ///< list of infix fields | |
diff --git a/src/sphinxutils.cpp b/src/sphinxutils.cpp | |
index f02d2bbf..a8ddbc03 100644 | |
--- a/src/sphinxutils.cpp | |
+++ b/src/sphinxutils.cpp | |
@@ -407,6 +407,8 @@ static KeyDesc_t g_dKeysIndex[] = | |
{ "min_word_len", 0, NULL }, | |
{ "charset_type", KEY_REMOVED, NULL }, | |
{ "charset_table", 0, NULL }, | |
+ { "charset_dictpath", 0, NULL }, //coreseek: mmseg's dictionary path | |
+ { "charset_debug", 0, NULL }, //coreseek: debug output tokens | |
{ "ignore_chars", 0, NULL }, | |
{ "min_prefix_len", 0, NULL }, | |
{ "min_infix_len", 0, NULL }, | |
@@ -1133,7 +1135,10 @@ void sphConfTokenizer ( const CSphConfigSection & hIndex, CSphTokenizerSettings | |
{ | |
tSettings.m_iNgramLen = Max ( hIndex.GetInt ( "ngram_len" ), 0 ); | |
- if ( hIndex ( "ngram_chars" ) ) | |
+ if(hIndex("charset_debug")) | |
+ tSettings.m_iDebug = hIndex["charset_debug"].intval(); | |
+ | |
+ if ( hIndex ( "ngram_chars" ) ) | |
{ | |
if ( tSettings.m_iNgramLen ) | |
tSettings.m_iType = TOKENIZER_NGRAM; | |
@@ -1141,6 +1146,15 @@ void sphConfTokenizer ( const CSphConfigSection & hIndex, CSphTokenizerSettings | |
sphWarning ( "ngram_chars specified, but ngram_len=0; IGNORED" ); | |
} | |
+#if USE_MMSEG | |
+ //XXX:fixme : sphinx changes tokenizer create process | |
+ if (hIndex("charset_dictpath") && CSphString(hIndex.GetStr("charset_type")) =="zh_cn.utf-8" ) | |
+ { | |
+ tSettings.m_sDictPath = hIndex.GetStr("charset_dictpath"); | |
+ tSettings.m_iType = TOKENIZER_ZHCN_UTF8; | |
+ } | |
+#endif | |
+ | |
tSettings.m_sCaseFolding = hIndex.GetStr ( "charset_table" ); | |
tSettings.m_iMinWordLen = Max ( hIndex.GetInt ( "min_word_len", 1 ), 1 ); | |
tSettings.m_sNgramChars = hIndex.GetStr ( "ngram_chars" ); | |
@@ -1274,6 +1288,7 @@ bool sphConfIndex ( const CSphConfigSection & hIndex, CSphIndexSettings & tSetti | |
tSettings.m_iEmbeddedLimit = hIndex.GetSize ( "embedded_limit", 16384 ); | |
tSettings.m_bIndexFieldLens = hIndex.GetInt ( "index_field_lengths" )!=0; | |
tSettings.m_sIndexTokenFilter = hIndex.GetStr ( "index_token_filter" ); | |
+ tSettings.m_bDebugDump = hIndex.GetInt ( "charset_debug" )!=0; | |
// prefix/infix fields | |
CSphString sFields; | |
diff --git a/src/sphinxutils.h b/src/sphinxutils.h | |
index 5b433f2d..047997df 100644 | |
--- a/src/sphinxutils.h | |
+++ b/src/sphinxutils.h | |
@@ -146,6 +146,7 @@ enum | |
// where was TOKENIZER_SBCS=1 once | |
TOKENIZER_UTF8 = 2, | |
TOKENIZER_NGRAM = 3 | |
+ , TOKENIZER_ZHCN_UTF8 = 4 | |
}; | |
/// load config file | |
From cf043274c0b5ca3700b50ecd14c500d7570800d1 Mon Sep 17 00:00:00 2001 | |
From: fffonion <fffonion@gmail.com> | |
Date: Tue, 15 Mar 2016 00:05:47 -0400 | |
Subject: [PATCH 1/2] add hiragana and katagana into dRemaps | |
--- | |
src/sphinx.cpp | 12 ++++++++---- | |
1 file changed, 8 insertions(+), 4 deletions(-) | |
diff --git a/src/sphinx.cpp b/src/sphinx.cpp | |
index 70dceaf6..6b4c3159 100644 | |
--- a/src/sphinx.cpp | |
+++ b/src/sphinx.cpp | |
@@ -6077,10 +6077,14 @@ CSphTokenizer_UTF8<IS_QUERY>::CSphTokenizer_UTF8 () | |
// BEGIN CJK There is no case folding, should do this in remote tokenizer. | |
// Here just make CJK Charactor will remain. --coreseek | |
+ // 4e00 - 9fff CJK unified ideographs | |
+ // 3000 - 303f CJK symbols and punctuation | |
+ // 3040 - 30ff Hiragana/Katagana | |
+ // ff00 - ffff half/fullwidth forms | |
CSphVector<CSphRemapRange> dRemaps; | |
- dRemaps.Add ( CSphRemapRange ( 0x4e00, 0x9fff, 0x4e00 ) ); | |
+ dRemaps.Add ( CSphRemapRange ( 0x4e00, 0x9FFF, 0x4e00 ) ); | |
dRemaps.Add ( CSphRemapRange ( 0xFF00, 0xFFFF, 0xFF00 ) ); | |
- dRemaps.Add ( CSphRemapRange ( 0x3000, 0x303F, 0x3000 ) ); | |
+ dRemaps.Add ( CSphRemapRange ( 0x3000, 0x30FF, 0x3000 ) ); | |
m_tLC.AddRemaps ( dRemaps, | |
FLAG_CODEPOINT_NGRAM ); // !COMMIT support other n-gram lengths than 1 | |
@@ -6562,9 +6566,9 @@ CSphTokenizer_UTF8MMSeg<IS_QUERY>::CSphTokenizer_UTF8MMSeg () | |
{ | |
//over ride charmap | |
CSphVector<CSphRemapRange> dRemaps; | |
- dRemaps.Add ( CSphRemapRange ( 0x4e00, 0x9fff, 0x4e00 ) ); | |
+ dRemaps.Add ( CSphRemapRange ( 0x4e00, 0x9FFF, 0x4e00 ) ); | |
dRemaps.Add ( CSphRemapRange ( 0xFF00, 0xFFFF, 0xFF00 ) ); | |
- dRemaps.Add ( CSphRemapRange ( 0x3000, 0x303F, 0x3000 ) ); | |
+ dRemaps.Add ( CSphRemapRange ( 0x3000, 0x30FF, 0x3000 ) ); | |
CSphTokenizer_UTF8<IS_QUERY>::m_tLC.AddRemaps ( dRemaps, | |
FLAG_CODEPOINT_NGRAM | FLAG_CODEPOINT_SPECIAL); // !COMMIT support other n-gram lengths than 1 | |
From f1e1b00d312bc267d9baaa472daebee831cd298a Mon Sep 17 00:00:00 2001 | |
From: fffonion <fffonion@gmail.com> | |
Date: Mon, 28 Mar 2016 00:47:19 -0400 | |
Subject: [PATCH 2/2] remove symbols and punctuation | |
--- | |
src/sphinx.cpp | 12 ++++++------ | |
1 file changed, 6 insertions(+), 6 deletions(-) | |
diff --git a/src/sphinx.cpp b/src/sphinx.cpp | |
index 6b4c3159..ec9d7902 100644 | |
--- a/src/sphinx.cpp | |
+++ b/src/sphinx.cpp | |
@@ -6082,9 +6082,9 @@ CSphTokenizer_UTF8<IS_QUERY>::CSphTokenizer_UTF8 () | |
// 3040 - 30ff Hiragana/Katagana | |
// ff00 - ffff half/fullwidth forms | |
CSphVector<CSphRemapRange> dRemaps; | |
- dRemaps.Add ( CSphRemapRange ( 0x4e00, 0x9FFF, 0x4e00 ) ); | |
- dRemaps.Add ( CSphRemapRange ( 0xFF00, 0xFFFF, 0xFF00 ) ); | |
- dRemaps.Add ( CSphRemapRange ( 0x3000, 0x30FF, 0x3000 ) ); | |
+ dRemaps.Add ( CSphRemapRange ( 0x4E00, 0x9FFF, 0x4E00 ) ); | |
+ dRemaps.Add ( CSphRemapRange ( 0xFF10, 0xFFFF, 0xFF10 ) ); | |
+ dRemaps.Add ( CSphRemapRange ( 0x3040, 0x30FF, 0x3040 ) ); | |
m_tLC.AddRemaps ( dRemaps, | |
FLAG_CODEPOINT_NGRAM ); // !COMMIT support other n-gram lengths than 1 | |
@@ -6566,9 +6566,9 @@ CSphTokenizer_UTF8MMSeg<IS_QUERY>::CSphTokenizer_UTF8MMSeg () | |
{ | |
//over ride charmap | |
CSphVector<CSphRemapRange> dRemaps; | |
- dRemaps.Add ( CSphRemapRange ( 0x4e00, 0x9FFF, 0x4e00 ) ); | |
- dRemaps.Add ( CSphRemapRange ( 0xFF00, 0xFFFF, 0xFF00 ) ); | |
- dRemaps.Add ( CSphRemapRange ( 0x3000, 0x30FF, 0x3000 ) ); | |
+ dRemaps.Add ( CSphRemapRange ( 0x4E00, 0x9FFF, 0x4E00 ) ); | |
+ dRemaps.Add ( CSphRemapRange ( 0xFF10, 0xFFFF, 0xFF10 ) ); | |
+ dRemaps.Add ( CSphRemapRange ( 0x3040, 0x30FF, 0x3040 ) ); | |
CSphTokenizer_UTF8<IS_QUERY>::m_tLC.AddRemaps ( dRemaps, | |
FLAG_CODEPOINT_NGRAM | FLAG_CODEPOINT_SPECIAL); // !COMMIT support other n-gram lengths than 1 | |
From 68b1d8e74cdf602734d0540820429955a24ad322 Mon Sep 17 00:00:00 2001 | |
From: nzinfo <limn@coreseek.com> | |
Date: Tue, 11 Aug 2015 22:01:16 +0800 | |
Subject: [PATCH] add branding | |
--- | |
.gitignore | 8 ++++++++ | |
src/indexer.cpp | 6 +++--- | |
src/searchd.cpp | 14 +++++++------- | |
src/sphinx.h | 6 +++++- | |
src/sphinxutils.cpp | 8 ++++---- | |
5 files changed, 27 insertions(+), 15 deletions(-) | |
diff --git a/.gitignore b/.gitignore | |
index 66aad130..e11b3285 100644 | |
--- a/.gitignore | |
+++ b/.gitignore | |
@@ -86,3 +86,11 @@ | |
/autom4te.cache/ | |
/config/ar-lib | |
/config/compile | |
+ | |
+# for qt-creator | |
+/*.user | |
+ | |
+# for patch | |
+*.rej | |
+*.orig | |
+ | |
diff --git a/src/indexer.cpp b/src/indexer.cpp | |
index 6bb1d05c..c2cee31d 100644 | |
--- a/src/indexer.cpp | |
+++ b/src/indexer.cpp | |
@@ -1766,7 +1766,7 @@ int main ( int argc, char ** argv ) | |
"\n" | |
"Options are:\n" | |
"--config <file>\t\tread configuration from specified file\n" | |
- "\t\t\t(default is sphinx.conf)\n" | |
+ "\t\t\t(default is csft.conf)\n" | |
"--all\t\t\treindex all configured indexes\n" | |
"--quiet\t\t\tbe quiet, only print errors\n" | |
"--verbose\t\tverbose indexing issues report\n" | |
@@ -1795,8 +1795,8 @@ int main ( int argc, char ** argv ) | |
"--keep-attrs\t\tretain attributes from the old index" | |
"\n" | |
"Examples:\n" | |
- "indexer --quiet myidx1\treindex 'myidx1' defined in 'sphinx.conf'\n" | |
- "indexer --all\t\treindex all indexes defined in 'sphinx.conf'\n" ); | |
+ "indexer --quiet myidx1\treindex 'myidx1' defined in 'csft.conf'\n" | |
+ "indexer --all\t\treindex all indexes defined in 'csft.conf'\n" ); | |
} | |
return 1; | |
diff --git a/src/searchd.cpp b/src/searchd.cpp | |
index d961eb61..2e647bc6 100644 | |
--- a/src/searchd.cpp | |
+++ b/src/searchd.cpp | |
@@ -21583,7 +21583,7 @@ void ShowHelp () | |
"Options are:\n" | |
"-h, --help\t\tdisplay this help message\n" | |
"-c, --config <file>\tread configuration from specified file\n" | |
- "\t\t\t(default is sphinx.conf)\n" | |
+ "\t\t\t(default is csft.conf)\n" | |
"--stop\t\t\tsend SIGTERM to currently running searchd\n" | |
"--stopwait\t\tsend SIGTERM and wait until actual exit\n" | |
"--status\t\tget ant print status variables\n" | |
@@ -21620,9 +21620,9 @@ void ShowHelp () | |
"--safetrace\t\tonly use system backtrace() call in crash reports\n" | |
"\n" | |
"Examples:\n" | |
- "searchd --config /usr/local/sphinx/etc/sphinx.conf\n" | |
+ "searchd --config /usr/local/sphinx/etc/csft.conf\n" | |
#if USE_WINDOWS | |
- "searchd --install --config c:\\sphinx\\sphinx.conf\n" | |
+ "searchd --install --config c:\\sphinx\\csft.conf\n" | |
#endif | |
); | |
} | |
@@ -22888,12 +22888,12 @@ int WINAPI ServiceMain ( int argc, char **argv ) | |
while ( !g_sConfigFile.cstr() ) | |
{ | |
#ifdef SYSCONFDIR | |
- g_sConfigFile = SYSCONFDIR "/sphinx.conf"; | |
+ g_sConfigFile = SYSCONFDIR "/"; | |
if ( sphIsReadable ( g_sConfigFile.cstr () ) ) | |
break; | |
#endif | |
- g_sConfigFile = "./sphinx.conf"; | |
+ g_sConfigFile = "./"; | |
if ( sphIsReadable ( g_sConfigFile.cstr () ) ) | |
break; | |
@@ -22904,9 +22904,9 @@ int WINAPI ServiceMain ( int argc, char **argv ) | |
if ( !g_sConfigFile.cstr () ) | |
sphFatal ( "no readable config file (looked in " | |
#ifdef SYSCONFDIR | |
- SYSCONFDIR "/sphinx.conf, " | |
+ SYSCONFDIR "/csft.conf, " | |
#endif | |
- "./sphinx.conf)." ); | |
+ "./csft.conf)." ); | |
sphInfo ( "using config file '%s'...", g_sConfigFile.cstr () ); | |
diff --git a/src/sphinx.h b/src/sphinx.h | |
index 99a98de1..0e10bae0 100644 | |
--- a/src/sphinx.h | |
+++ b/src/sphinx.h | |
@@ -200,7 +200,7 @@ inline const DWORD * STATIC2DOCINFO ( const DWORD * pAttrs ) { return STATIC2DOC | |
#endif | |
#define SPHINX_VERSION "2.2.11" SPHINX_BITS_TAG SPHINX_TAG " (" SPH_SVN_TAGREV ")" | |
-#define SPHINX_BANNER "Sphinx " SPHINX_VERSION "\nCopyright (c) 2001-2016, Andrew Aksyonoff\nCopyright (c) 2008-2015, Sphinx Technologies Inc (http://sphinxsearch.com)\n\n" | |
+#define SPHINX_BANNER_ORIG "Sphinx " SPHINX_VERSION "\nCopyright (c) 2001-2016, Andrew Aksyonoff\nCopyright (c) 2008-2015, Sphinx Technologies Inc (http://sphinxsearch.com)\n\n" | |
#define SPHINX_SEARCHD_PROTO 1 | |
#define SPHINX_CLIENT_VERSION 1 | |
@@ -208,6 +208,10 @@ inline const DWORD * STATIC2DOCINFO ( const DWORD * pAttrs ) { return STATIC2DOC | |
#define SPH_MAX_FILENAME_LEN 512 | |
#define SPH_MAX_FIELDS 256 | |
+#define CORESEEK_BANNER "Coreseek FullText Search 5.1 \nCopyright (c) 2008-2015, Beijing Choice Software Technologies Inc (http://www.coreseek.com)\n\n" | |
+#define SPHINX_BANNER2 "" CORESEEK_BANNER "" SPHINX_BANNER_ORIG | |
+#define SPHINX_BANNER SPHINX_BANNER2 | |
+ | |
///////////////////////////////////////////////////////////////////////////// | |
extern int64_t g_iIndexerCurrentDocID; | |
diff --git a/src/sphinxutils.cpp b/src/sphinxutils.cpp | |
index 966c64af..f02d2bbf 100644 | |
--- a/src/sphinxutils.cpp | |
+++ b/src/sphinxutils.cpp | |
@@ -1574,12 +1574,12 @@ const char * sphLoadConfig ( const char * sOptConfig, bool bQuiet, CSphConfigPar | |
while ( !sOptConfig ) | |
{ | |
#ifdef SYSCONFDIR | |
- sOptConfig = SYSCONFDIR "/sphinx.conf"; | |
+ sOptConfig = SYSCONFDIR "/csft.conf"; | |
if ( sphIsReadable ( sOptConfig ) ) | |
break; | |
#endif | |
- sOptConfig = "./sphinx.conf"; | |
+ sOptConfig = "./csft.conf"; | |
if ( sphIsReadable ( sOptConfig ) ) | |
break; | |
@@ -1590,9 +1590,9 @@ const char * sphLoadConfig ( const char * sOptConfig, bool bQuiet, CSphConfigPar | |
if ( !sOptConfig ) | |
sphDie ( "no readable config file (looked in " | |
#ifdef SYSCONFDIR | |
- SYSCONFDIR "/sphinx.conf, " | |
+ SYSCONFDIR "/csft.conf, " | |
#endif | |
- "./sphinx.conf)" ); | |
+ "./csft.conf)" ); | |
if ( !bQuiet ) | |
fprintf ( stdout, "using config file '%s'...\n", sOptConfig ); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment