Created
September 21, 2017 21:44
-
-
Save fffonion/5b2b561efdde56abcfa61102f67385bb to your computer and use it in GitHub Desktop.
Coreseek 5.1 patch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
From c7db4c68750b17532e78da313c5b5d25a1d00a3e Mon Sep 17 00:00:00 2001 | |
From: nzinfo <limn@coreseek.com> | |
Date: Thu, 13 Aug 2015 23:16:20 +0800 | |
Subject: [PATCH] add mmseg support, begin fix pre-token | |
--- | |
acinclude.m4 | 89 ++++++++++++++ | |
configure.ac | 21 +++- | |
src/Makefile.am | 4 + | |
src/sphinx.cpp | 347 ++++++++++++++++++++++++++++++++++++++++++++++++++-- | |
src/sphinx.h | 19 ++- | |
src/sphinxutils.cpp | 17 ++- | |
src/sphinxutils.h | 1 + | |
7 files changed, 486 insertions(+), 12 deletions(-) | |
diff --git a/acinclude.m4 b/acinclude.m4 | |
index e09697ea..3ae78b01 100644 | |
--- a/acinclude.m4 | |
+++ b/acinclude.m4 | |
@@ -298,6 +298,95 @@ fi | |
]) | |
dnl --------------------------------------------------------------------------- | |
+dnl Macro: AC_CHECK_MMSEG | |
+dnl --------------------------------------------------------------------------- | |
+ | |
+AC_DEFUN([AC_CHECK_MMSEG],[ | |
+ | |
+if test [ -z "$MMSEG_CFLAGS" ] | |
+then | |
+ for CANDIDATE in "$user_mmseg_includes" "/usr/local/include/mmseg" "/usr/include/mmseg" | |
+ do | |
+ if test [ -n "$CANDIDATE" -a -r "$CANDIDATE/Segmenter.h" ] | |
+ then | |
+ MMSEG_CFLAGS="-I$CANDIDATE" | |
+ break | |
+ fi | |
+ done | |
+fi | |
+ | |
+# explicit overrides will be applied later | |
+if test [ -z "$MMSEG_LIBS" ] | |
+then | |
+ for CANDIDATE in "$user_mmseg_libs" "/usr/lib64" \ | |
+ "/usr/local/lib" "/usr/local/mmseg/lib" \ | |
+ "/usr/local/lib/mmseg" "/usr/lib" \ | |
+ "/opt/mmseg/lib" | |
+ do | |
+ if test [ -n "$CANDIDATE" -a -d "$CANDIDATE" ] | |
+ then | |
+ MMSEG_LIBS="-L$CANDIDATE -lmmseg" | |
+ break | |
+ fi | |
+ done | |
+fi | |
+ | |
+# apply explicit include path overrides | |
+AC_ARG_WITH([mmseg-includes], | |
+ AC_HELP_STRING([--with-mmseg-includes], [path to libmmseg header files]), | |
+ [ac_cv_mmseg_includes=$withval]) | |
+if test [ -n "$ac_cv_mmseg_includes" ] | |
+then | |
+ MMSEG_CFLAGS="-I$ac_cv_mmseg_includes" | |
+fi | |
+ | |
+ | |
+# apply explicit lib path overrides | |
+AC_ARG_WITH([mmseg-libs], | |
+ AC_HELP_STRING([--with-mmseg-libs], [path to libmmseg libraries]), | |
+ [ac_cv_mmseg_libs=$withval]) | |
+if test [ -n "$ac_cv_mmseg_libs" ] | |
+then | |
+ # Trim trailing '.libs' if user passed it in --with-mmseg-libs option | |
+ ac_cv_mmseg_libs=`echo ${ac_cv_mmseg_libs} | sed -e 's/.libs$//' \ | |
+ -e 's+.libs/$++'` | |
+ MMSEG_LIBS="-L$ac_cv_mmseg_libs -lmmseg" | |
+fi | |
+ | |
+# now that we did all we could, perform final checks | |
+AC_MSG_CHECKING([libmmseg include files]) | |
+if test [ -z "$MMSEG_CFLAGS" ] | |
+then | |
+ AC_MSG_ERROR([missing include files. | |
+ | |
+****************************************************************************** | |
+ERROR: cannot find libmmseg include files. | |
+ | |
+To disable libmmseg support, use --without-mmseg option. | |
+****************************************************************************** | |
+]) | |
+else | |
+ AC_MSG_RESULT([$MMSEG_CFLAGS]) | |
+fi | |
+ | |
+AC_MSG_CHECKING([libmmseg libraries]) | |
+if test [ -z "$MMSEG_LIBS" ] | |
+then | |
+ AC_MSG_ERROR([missing libraries. | |
+ | |
+****************************************************************************** | |
+ERROR: cannot find libmmseg libraries. | |
+ | |
+To disable libmmseg support, use --without-mmseg option. | |
+****************************************************************************** | |
+]) | |
+else | |
+ AC_MSG_RESULT([$MMSEG_LIBS]) | |
+fi | |
+ | |
+]) | |
+ | |
+dnl --------------------------------------------------------------------------- | |
dnl Macro: AC_CHECK_LIBSTEMMER | |
dnl Check the libstemmer first in custom include path in --with-libstemmer=* | |
dnl If not given, try to guess common shared libs, and finally fall back into | |
diff --git a/configure.ac b/configure.ac | |
index d56fbd95..e08dc886 100644 | |
--- a/configure.ac | |
+++ b/configure.ac | |
@@ -69,6 +69,7 @@ fi | |
AC_PROG_CC | |
AC_PROG_CXX | |
+AM_PROG_AR | |
AC_PROG_RANLIB | |
AC_COMPILE_IFELSE([AC_LANG_SOURCE([ | |
@@ -429,6 +430,24 @@ else | |
fi | |
AM_CONDITIONAL(USE_PGSQL, test x$ac_cv_use_pgsql != xno -o x$ac_cv_use_satic_pgsql != xno ) | |
+dnl --- | |
+# check if we should compile with libmmseg (a mmseg Chinese Segmenter) support | |
+AC_ARG_WITH([mmseg], | |
+ AC_HELP_STRING([--with-mmseg], [compile with libmmseg, a mmseg Chinese Segmenter support (default is enabled)]), | |
+ [ac_cv_use_mmseg=$withval], [ac_cv_use_mmseg=yes] | |
+) | |
+AC_MSG_CHECKING([whether to compile with libmmseg support]) | |
+if test x$ac_cv_use_mmseg != xno; then | |
+ AC_MSG_RESULT([yes]) | |
+ AC_CHECK_MMSEG([$ac_cv_use_mmseg]) | |
+ AC_DEFINE(USE_MMSEG,1,[Define to 1 if you want to compile with libmmseg support]) | |
+ AC_SUBST([MMSEG_LIBS]) | |
+ AC_SUBST([MMSEG_CFLAGS]) | |
+else | |
+ AC_MSG_RESULT([no]) | |
+fi | |
+AM_CONDITIONAL(USE_MMSEG, test x$ac_cv_use_mmseg != xno) | |
+ | |
# add macports include directory | |
if (echo $MYSQL_LIBS | grep -q -- -L/opt/local/lib); then | |
MYSQL_CFLAGS="$MYSQL_CFLAGS -I/opt/local/include" | |
@@ -483,7 +502,7 @@ AM_CONDITIONAL(USE_INTERNAL_LIBSTEMMER, test x$ac_cv_use_internal_libstemmer != | |
dnl --- | |
# we can now set preprocessor flags for both C and C++ compilers | |
-CPPFLAGS="$CPPFLAGS $MYSQL_CFLAGS $PGSQL_CFLAGS $LIBSTEMMER_CFLAGS" | |
+CPPFLAGS="$CPPFLAGS $MYSQL_CFLAGS $PGSQL_CFLAGS $LIBSTEMMER_CFLAGS $MMSEG_CFLAGS" | |
AC_ARG_WITH([libexpat], | |
diff --git a/src/Makefile.am b/src/Makefile.am | |
index 3129f594..8e696075 100644 | |
--- a/src/Makefile.am | |
+++ b/src/Makefile.am | |
@@ -30,5 +30,9 @@ RLP_INC = | |
endif | |
AM_CPPFLAGS = $(LIBRE2_CFLAGS) $(RLP_INC) -DSYSCONFDIR="\"$(sysconfdir)\"" -DDATADIR="\"$(localstatedir)/data\"" | |
+if USE_MMSEG | |
+COMMON_LIBS = libsphinx.a $(LIBSTEMMER_LIBS) $(MYSQL_LIBS) $(PGSQL_LIBS) $(LIBRE2_LIBS) $(RLP_LIBS) $(MMSEG_LIBS) | |
+else | |
COMMON_LIBS = libsphinx.a $(LIBSTEMMER_LIBS) $(MYSQL_LIBS) $(PGSQL_LIBS) $(LIBRE2_LIBS) $(RLP_LIBS) | |
+endif | |
LDADD = $(COMMON_LIBS) | |
diff --git a/src/sphinx.cpp b/src/sphinx.cpp | |
index 6c4a4097..70dceaf6 100644 | |
--- a/src/sphinx.cpp | |
+++ b/src/sphinx.cpp | |
@@ -141,6 +141,16 @@ | |
#pragma message("Automatically linking with btutils.lib") | |
#endif | |
+#if ( USE_WINDOWS && USE_MMSEG ) | |
+ #if _DEBUG | |
+ #pragma comment(linker, "/defaultlib:libcss_d.lib") | |
+ #else | |
+ #pragma comment(linker, "/defaultlib:libcss.lib") | |
+ #endif | |
+ #pragma message("Automatically linking with libcss.lib") | |
+ #pragma warning(disable:4530) // for ugly mmseg | |
+#endif | |
+ | |
///////////////////////////////////////////////////////////////////////////// | |
// logf() is not there sometimes (eg. Solaris 9) | |
@@ -2552,10 +2562,14 @@ class CSphTokenizer_UTF8 : public CSphTokenizerBase2 | |
public: | |
CSphTokenizer_UTF8 (); | |
virtual void SetBuffer ( const BYTE * sBuffer, int iLength ); | |
+ virtual bool IsPreTokenized() { return m_bPreTokenized; } | |
virtual BYTE * GetToken (); | |
virtual ISphTokenizer * Clone ( ESphTokenizerClone eMode ) const; | |
virtual int GetCodepointLength ( int iCode ) const; | |
virtual int GetMaxCodepointLength () const { return m_tLC.GetMaxCodepointLength(); } | |
+ | |
+protected: | |
+ bool m_bPreTokenized; | |
}; | |
@@ -2576,6 +2590,78 @@ class CSphTokenizer_UTF8Ngram : public CSphTokenizer_UTF8<IS_QUERY> | |
CSphString m_sNgramCharsStr; | |
}; | |
+#if USE_MMSEG | |
+ | |
+#include "SegmenterManager.h" | |
+#include "Segmenter.h" | |
+ | |
+typedef CSR_Singleton<css::SegmenterManager> SegmenterManagerSingleInstance; | |
+ | |
+template < bool IS_QUERY > | |
+class CSphTokenizer_UTF8MMSeg : public CSphTokenizer_UTF8<IS_QUERY> | |
+{ | |
+public: | |
+ CSphTokenizer_UTF8MMSeg (); | |
+ ~CSphTokenizer_UTF8MMSeg() { | |
+ if(m_seg){ | |
+ SafeDelete ( m_seg ); | |
+ } | |
+ } | |
+ | |
+ virtual void SetBuffer ( const BYTE * sBuffer, int iLength ); | |
+ virtual BYTE * GetToken (); | |
+ virtual ISphTokenizer * Clone ( ESphTokenizerClone eMode ) const; | |
+ virtual const BYTE* GetThesaurus(BYTE * sBuffer, int iLength ); | |
+ bool IsSegment(const BYTE * pCur); | |
+ | |
+ CSphTokenizerBase* SetDictPath(const char* path) { m_dictpath = path; return this; } | |
+ | |
+ virtual const char * GetBufferPtr () const { return (const char *) CSphTokenizer_UTF8MMSeg<IS_QUERY>::m_pCur; } | |
+ | |
+ virtual const char * GetTokenStart () const { return m_segToken; } | |
+ | |
+ virtual int GetLastTokenLen () const { return m_iLastTokenLenMMSeg; } | |
+ | |
+ virtual void ReloadSegDictionary() { | |
+ if(m_seg){ | |
+ SafeDelete ( m_seg ); | |
+ } | |
+ m_seg = NULL; | |
+ | |
+ if(m_mgr) { | |
+ SegmenterManagerSingleInstance::Free(); // free preexist instance. | |
+ m_mgr = NULL; | |
+ } | |
+ } | |
+protected: | |
+ char* m_segToken; | |
+ size_t m_segoffset; | |
+ int m_iLastTokenLenMMSeg; | |
+ BYTE m_sAccumSeg [ 3*SPH_MAX_WORD_LEN+3 ]; ///< folded token accumulator | |
+ BYTE * m_pAccumSeg; ///< current accumulator position | |
+ CSphVector<u2> m_tokenlens; | |
+ int m_tokenpos; | |
+protected: | |
+ // virtual bool IsSegment(const BYTE * pCur); | |
+ CSphString m_dictpath; | |
+ | |
+ // mmseg related | |
+ css::Segmenter* m_seg; | |
+ css::SegmenterManager* m_mgr; | |
+ css::Segmenter* GetSegmenter(const char* dict_path){ | |
+ int nRet = 0; | |
+ if(!m_mgr) { | |
+ m_mgr = SegmenterManagerSingleInstance::Get(); | |
+ if(dict_path) | |
+ nRet = m_mgr->init(dict_path); | |
+ } | |
+ if(nRet == 0 && !m_seg) | |
+ m_seg = m_mgr->getSegmenter(false); | |
+ return m_seg; | |
+ } | |
+}; | |
+ | |
+#endif | |
struct CSphNormalForm | |
{ | |
@@ -3795,6 +3881,15 @@ ISphTokenizer * sphCreateUTF8NgramTokenizer () | |
return new CSphTokenizer_UTF8Ngram<false> (); | |
} | |
+#if USE_MMSEG | |
+ISphTokenizer * sphCreateUTF8ChineseTokenizer ( const char* dict_path ) | |
+{ | |
+ CSphTokenizer_UTF8MMSeg<false>* tokenizer = new CSphTokenizer_UTF8MMSeg<false> (); | |
+ tokenizer->SetDictPath(dict_path); | |
+ return tokenizer; | |
+} | |
+#endif | |
+ | |
///////////////////////////////////////////////////////////////////////////// | |
enum | |
@@ -4380,6 +4475,7 @@ CSphTokenizerSettings::CSphTokenizerSettings () | |
: m_iType ( TOKENIZER_UTF8 ) | |
, m_iMinWordLen ( 1 ) | |
, m_iNgramLen ( 0 ) | |
+ , m_iDebug ( 0 ) | |
{ | |
} | |
@@ -4391,7 +4487,11 @@ bool LoadTokenizerSettings ( CSphReader & tReader, CSphTokenizerSettings & tSett | |
return true; | |
tSettings.m_iType = tReader.GetByte (); | |
- if ( tSettings.m_iType!=TOKENIZER_UTF8 && tSettings.m_iType!=TOKENIZER_NGRAM ) | |
+#if USE_MMSEG | |
+ if ( tSettings.m_iType!=TOKENIZER_UTF8 && tSettings.m_iType!=TOKENIZER_NGRAM && tSettings.m_iType!=TOKENIZER_ZHCN_UTF8) | |
+#else | |
+ if ( tSettings.m_iType!=TOKENIZER_UTF8 && tSettings.m_iType!=TOKENIZER_NGRAM) | |
+#endif | |
{ | |
sWarning = "can't load an old index with SBCS tokenizer"; | |
return false; | |
@@ -4419,7 +4519,11 @@ bool LoadTokenizerSettings ( CSphReader & tReader, CSphTokenizerSettings & tSett | |
tSettings.m_sIgnoreChars = tReader.GetString (); | |
tSettings.m_iNgramLen = tReader.GetDword (); | |
tSettings.m_sNgramChars = tReader.GetString (); | |
- if ( uVersion>=15 ) | |
+#if USE_MMSEG | |
+ //mmseg --coreseek: the mmseg option makes coreseek's and sphinx's index formats incompatible. | |
+ tSettings.m_sDictPath = tReader.GetString (); | |
+#endif | |
+ if ( uVersion>=15 ) | |
tSettings.m_sBlendChars = tReader.GetString (); | |
if ( uVersion>=24 ) | |
tSettings.m_sBlendMode = tReader.GetString(); | |
@@ -4450,6 +4554,10 @@ void SaveTokenizerSettings ( CSphWriter & tWriter, ISphTokenizer * pTokenizer, i | |
tWriter.PutString ( tSettings.m_sIgnoreChars.cstr () ); | |
tWriter.PutDword ( tSettings.m_iNgramLen ); | |
tWriter.PutString ( tSettings.m_sNgramChars.cstr () ); | |
+#if USE_MMSEG | |
+ // if turn mmseg off, the index(s) are compat again. | |
+ tWriter.PutString ( tSettings.m_sDictPath.cstr () ); | |
+#endif | |
tWriter.PutString ( tSettings.m_sBlendChars.cstr () ); | |
tWriter.PutString ( tSettings.m_sBlendMode.cstr () ); | |
} | |
@@ -4724,6 +4832,9 @@ ISphTokenizer * ISphTokenizer::Create ( const CSphTokenizerSettings & tSettings, | |
{ | |
case TOKENIZER_UTF8: pTokenizer = sphCreateUTF8Tokenizer (); break; | |
case TOKENIZER_NGRAM: pTokenizer = sphCreateUTF8NgramTokenizer (); break; | |
+#if USE_MMSEG | |
+ case TOKENIZER_ZHCN_UTF8: pTokenizer = sphCreateUTF8ChineseTokenizer(tSettings.m_sDictPath.cstr()); break; | |
+#endif | |
default: | |
sError.SetSprintf ( "failed to create tokenizer (unknown charset type '%d')", tSettings.m_iType ); | |
return NULL; | |
@@ -5963,7 +6074,20 @@ CSphTokenizer_UTF8<IS_QUERY>::CSphTokenizer_UTF8 () | |
{ | |
CSphString sTmp; | |
SetCaseFolding ( SPHINX_DEFAULT_UTF8_TABLE, sTmp ); | |
- m_bHasBlend = false; | |
+ | |
+ // BEGIN CJK There is no case folding, should do this in remote tokenizer. | |
+ // Here just make CJK Charactor will remain. --coreseek | |
+ CSphVector<CSphRemapRange> dRemaps; | |
+ dRemaps.Add ( CSphRemapRange ( 0x4e00, 0x9fff, 0x4e00 ) ); | |
+ dRemaps.Add ( CSphRemapRange ( 0xFF00, 0xFFFF, 0xFF00 ) ); | |
+ dRemaps.Add ( CSphRemapRange ( 0x3000, 0x303F, 0x3000 ) ); | |
+ | |
+ m_tLC.AddRemaps ( dRemaps, | |
+ FLAG_CODEPOINT_NGRAM ); // !COMMIT support other n-gram lengths than 1 | |
+ // ENDCJK | |
+ m_bPreTokenized = false; // by default use original route. | |
+ | |
+ m_bHasBlend = false; | |
} | |
@@ -5973,10 +6097,29 @@ void CSphTokenizer_UTF8<IS_QUERY>::SetBuffer ( const BYTE * sBuffer, int iLength | |
// check that old one is over and that new length is sane | |
assert ( iLength>=0 ); | |
- // set buffer | |
+ // set buffer | |
m_pBuffer = sBuffer; | |
+ // check is pre-segment buffer, with prefix 0xFFFA | |
+ // if True, the following should be 0xFFA, 0x41, [ctx] --coreseek | |
+ m_bPreTokenized = false; | |
+ if(iLength > 4) | |
+ { | |
+ // there is a ' ' (space, 32) as padding. might not true | |
+ unsigned char mask[] = {32, 239, 191, 186, 65}; | |
+ unsigned char mask_bare[] = {239, 191, 186, 65}; | |
+ if(strncmp( (const char *)mask, (const char *)sBuffer, 5) == 0) { | |
+ // 0xFFFA is a magic number , if it's in head, mark this buffer pre-tokenized. | |
+ m_bPreTokenized = true; | |
+ m_pBuffer += 5; | |
+ }else | |
+ if(strncmp( (const char *)mask_bare, (const char *)sBuffer, 4) == 0) { | |
+ m_bPreTokenized = true; | |
+ m_pBuffer += 4; | |
+ } | |
+ } | |
+ | |
m_pBufferMax = sBuffer + iLength; | |
- m_pCur = sBuffer; | |
+ m_pCur = m_pBuffer; | |
m_pTokenStart = m_pTokenEnd = NULL; | |
m_pBlendStart = m_pBlendEnd = NULL; | |
@@ -5994,7 +6137,7 @@ BYTE * CSphTokenizer_UTF8<IS_QUERY>::GetToken () | |
m_bTokenBoundary = false; | |
m_bWasSynonym = false; | |
- return m_bHasBlend | |
+ return m_bHasBlend | |
? DoGetToken<IS_QUERY,true>() | |
: DoGetToken<IS_QUERY,false>(); | |
} | |
@@ -6409,6 +6552,152 @@ BYTE * CSphTokenizer_UTF8Ngram<IS_QUERY>::GetToken () | |
assert ( m_iNgramLen==1 ); | |
return CSphTokenizer_UTF8<IS_QUERY>::GetToken (); | |
} | |
+////////////////////////////////////////////////////////////////////////// | |
+#if USE_MMSEG | |
+////////////////////////////////////////////////////////////////////////// | |
+template < bool IS_QUERY > | |
+CSphTokenizer_UTF8MMSeg<IS_QUERY>::CSphTokenizer_UTF8MMSeg () | |
+ :CSphTokenizer_UTF8<IS_QUERY>() | |
+ , m_segoffset(0) | |
+{ | |
+ //over ride charmap | |
+ CSphVector<CSphRemapRange> dRemaps; | |
+ dRemaps.Add ( CSphRemapRange ( 0x4e00, 0x9fff, 0x4e00 ) ); | |
+ dRemaps.Add ( CSphRemapRange ( 0xFF00, 0xFFFF, 0xFF00 ) ); | |
+ dRemaps.Add ( CSphRemapRange ( 0x3000, 0x303F, 0x3000 ) ); | |
+ | |
+ CSphTokenizer_UTF8<IS_QUERY>::m_tLC.AddRemaps ( dRemaps, | |
+ FLAG_CODEPOINT_NGRAM | FLAG_CODEPOINT_SPECIAL); // !COMMIT support other n-gram lengths than 1 | |
+ m_pAccumSeg = m_sAccumSeg; | |
+ //m_iLastTokenBufferLen = 0; | |
+ m_iLastTokenLenMMSeg = 0; | |
+ | |
+ m_mgr = NULL; | |
+ m_seg = NULL; | |
+ m_tokenlens.Reserve(1024*512); // resize to 512K | |
+} | |
+ | |
+template < bool IS_QUERY > | |
+void CSphTokenizer_UTF8MMSeg<IS_QUERY>::SetBuffer ( const BYTE * sBuffer, int iLength ) | |
+{ | |
+ CSphTokenizer_UTF8<IS_QUERY>::SetBuffer(sBuffer, iLength); | |
+ css::Segmenter* seg = GetSegmenter(m_dictpath.cstr()); | |
+ if(seg) | |
+ seg->setBuffer((u1*)CSphTokenizer_UTF8MMSeg<IS_QUERY>::m_pBuffer, iLength); | |
+ else | |
+ sphDie ( " Tokenizer initialization failure. " ); | |
+ m_segoffset = 0; | |
+ m_segToken = (char*)CSphTokenizer_UTF8MMSeg<IS_QUERY>::m_pCur; | |
+ | |
+ m_tokenlens.Reset(); | |
+ m_tokenpos = 0; | |
+ { | |
+ u2 len = 0, symlen = 0; | |
+ while(1){ | |
+ len = 0; | |
+ char* tok = (char*)seg->peekToken(len,symlen); | |
+ if(!tok || !*tok || !len) | |
+ break; | |
+ seg->popToken(len); | |
+ | |
+ m_tokenlens.Add(len); | |
+ //printf("%*.*s/p ",symlen,symlen,tok); | |
+ } | |
+ } | |
+} | |
+ | |
+template < bool IS_QUERY > | |
+bool CSphTokenizer_UTF8MMSeg<IS_QUERY>::IsSegment(const BYTE * pCur) | |
+{ | |
+ // this code might have bug, but as it will removed in next release... | |
+ size_t offset = pCur - CSphTokenizer_UTF8<IS_QUERY>::m_pBuffer; | |
+ //if(offset == 0) return false; | |
+ //printf("pcur: %s\n", pCur); | |
+ | |
+ //css::Segmenter* seg = GetSegmenter(m_dictpath.cstr()); //TODO fill blank here | |
+ { | |
+ u2 len = 0, symlen = 0; | |
+ while(m_segoffset < offset) { | |
+ //tok = (const char*)seg->peekToken(len, symlen); | |
+ //seg->popToken(len); | |
+ len = m_tokenlens[m_tokenpos]; | |
+ m_tokenpos ++; | |
+ m_segoffset += len; | |
+ //printf("tok: %*.*s, len=%d\t ",len,len,tok, len); | |
+ if(m_tokenpos >= m_tokenlens.GetLength() || len==0){ | |
+ //break? | |
+ break; | |
+ } | |
+ } | |
+ /* | |
+ printf("\n"); | |
+ printf("seg_off %d vs off %d\n", m_segoffset, offset); | |
+ if(m_segoffset != offset) | |
+ printf("seg_pcur: %s\n", pCur); | |
+ */ | |
+ return (m_segoffset == offset); | |
+ } //end if seg | |
+ return true; | |
+} | |
+ | |
+template < bool IS_QUERY > | |
+BYTE * CSphTokenizer_UTF8MMSeg<IS_QUERY>::GetToken () | |
+{ | |
+ //return CSphTokenizer_UTF8<IS_QUERY>::GetToken(); | |
+ m_iLastTokenLenMMSeg = 0; | |
+ //BYTE* tok = CSphTokenizer_UTF8::GetToken(); | |
+ while(!IsSegment(CSphTokenizer_UTF8<IS_QUERY>::m_pCur) || m_pAccumSeg == m_sAccumSeg) | |
+ { | |
+ BYTE* tok = CSphTokenizer_UTF8<IS_QUERY>::GetToken(); | |
+ //printf("utf8_token: %s \t ", tok); | |
+ if(!tok){ | |
+ m_iLastTokenLenMMSeg = 0; | |
+ return NULL; | |
+ } | |
+ | |
+ int token_buf_len = strlen((const char*)tok); | |
+ | |
+ if(m_pAccumSeg == m_sAccumSeg) | |
+ m_segToken = (char*)CSphTokenizer_UTF8<IS_QUERY>::m_pTokenStart; | |
+ | |
+ if ( (m_pAccumSeg - m_sAccumSeg)<SPH_MAX_WORD_LEN ) { | |
+ ::memcpy(m_pAccumSeg, tok, token_buf_len); | |
+ m_pAccumSeg += token_buf_len; | |
+ m_iLastTokenLenMMSeg += CSphTokenizer_UTF8<IS_QUERY>::GetLastTokenLen(); | |
+ } | |
+ } | |
+ { | |
+ *m_pAccumSeg = 0; | |
+ //m_iLastTokenBufferLen = m_pAccumSeg - m_sAccumSeg; | |
+ m_pAccumSeg = m_sAccumSeg; | |
+ | |
+ return m_sAccumSeg; | |
+ } | |
+} | |
+ | |
+template < bool IS_QUERY > | |
+ISphTokenizer * CSphTokenizer_UTF8MMSeg<IS_QUERY>::Clone ( ESphTokenizerClone eMode ) const | |
+{ | |
+ CSphTokenizerBase * pClone; | |
+ if ( eMode!=SPH_CLONE_INDEX ) { | |
+ pClone = (new CSphTokenizer_UTF8MMSeg<true>())->SetDictPath(m_dictpath.cstr()); | |
+ }else{ | |
+ pClone = (new CSphTokenizer_UTF8MMSeg<false>())->SetDictPath(m_dictpath.cstr()); | |
+ } | |
+ pClone->CloneBase ( this, eMode ); | |
+ return pClone; | |
+} | |
+ | |
+template < bool IS_QUERY > | |
+const BYTE* CSphTokenizer_UTF8MMSeg<IS_QUERY>::GetThesaurus(BYTE * sBuffer, int iLength ) | |
+{ | |
+ css::Segmenter* seg = GetSegmenter(m_dictpath.cstr()); | |
+ if(seg) | |
+ return (const BYTE*)seg->thesaurus((const char*)sBuffer, iLength); | |
+ return NULL; | |
+} | |
+ | |
+#endif | |
////////////////////////////////////////////////////////////////////////// | |
@@ -9658,6 +9947,7 @@ void CSphIndex::SetupQueryTokenizer() | |
// create and setup a master copy of query time tokenizer | |
// that we can then use to create lightweight clones | |
SafeDelete ( m_pQueryTokenizer ); | |
+ m_pTokenizer->ReloadSegDictionary(); | |
m_pQueryTokenizer = m_pTokenizer->Clone ( SPH_CLONE_QUERY ); | |
if ( IsStarDict() ) | |
{ | |
@@ -25810,6 +26100,7 @@ void CSphSource::Setup ( const CSphSourceSettings & tSettings ) | |
m_bIndexExactWords = tSettings.m_bIndexExactWords; | |
m_iOvershortStep = Min ( Max ( tSettings.m_iOvershortStep, 0 ), 1 ); | |
m_iStopwordStep = Min ( Max ( tSettings.m_iStopwordStep, 0 ), 1 ); | |
+ m_bDebugDump = tSettings.m_bDebugDump; //coreseek: assign debug charset setting | |
m_bIndexSP = tSettings.m_bIndexSP; | |
m_dPrefixFields = tSettings.m_dPrefixFields; | |
m_dInfixFields = tSettings.m_dInfixFields; | |
@@ -26414,11 +26705,28 @@ void CSphSource_Document::BuildRegularHits ( SphDocID_t uDocid, bool bPayload, b | |
while ( ( m_iMaxHits==0 || m_tHits.m_dData.GetLength()+BUILD_REGULAR_HITS_COUNT<m_iMaxHits ) | |
&& ( sWord = m_pTokenizer->GetToken() )!=NULL ) | |
{ | |
+ //FIXME: dump token to console --coreseek | |
+ //debug dump | |
+ if(m_pTokenizer->DumpToken()) { | |
+ printf("%s_x ", sWord); // make the same as pre-tokenized text. | |
+ } | |
+ | |
+ // fix sWord if in pre-tokenized mode. | |
+ int iBytes = strlen ( (const char*)sWord ); | |
+ bool bAdvancePos = true; | |
+ if(m_pTokenizer->IsPreTokenized()) { | |
+ // m_tState.m_iHitPos should not be 0; guards against a token lacking the '_x' suffix at the very beginning. | |
+ if(sWord[iBytes-1] != 'x' && m_tState.m_iHitPos) | |
+ bAdvancePos = false; // not an advance token. | |
+ sWord[iBytes-2] = '\0'; // change token_x -> token\0x | |
+ iBytes -= 2; // decrease length | |
+ } | |
+ | |
m_pDict->SetApplyMorph ( m_pTokenizer->GetMorphFlag() ); | |
int iLastBlendedStart = TrackBlendedStart ( m_pTokenizer, iBlendedHitsStart, m_tHits.Length() ); | |
- if ( !bPayload ) | |
+ if ( !bPayload && bAdvancePos) | |
{ | |
HITMAN::AddPos ( &m_tState.m_iHitPos, m_tState.m_iBuildLastStep + m_pTokenizer->GetOvershortCount()*m_iOvershortStep ); | |
if ( m_pTokenizer->GetBoundary() ) | |
@@ -26430,7 +26738,7 @@ void CSphSource_Document::BuildRegularHits ( SphDocID_t uDocid, bool bPayload, b | |
if ( bGlobalPartialMatch ) | |
{ | |
- int iBytes = strlen ( (const char*)sWord ); | |
+ //int iBytes = strlen ( (const char*)sWord ); | |
memcpy ( sBuf + 1, sWord, iBytes ); | |
sBuf[0] = MAGIC_WORD_HEAD; | |
sBuf[iBytes+1] = '\0'; | |
@@ -26440,7 +26748,7 @@ void CSphSource_Document::BuildRegularHits ( SphDocID_t uDocid, bool bPayload, b | |
ESphTokenMorph eMorph = m_pTokenizer->GetTokenMorph(); | |
if ( m_bIndexExactWords && eMorph!=SPH_TOKEN_MORPH_GUESS ) | |
{ | |
- int iBytes = strlen ( (const char*)sWord ); | |
+ //int iBytes = strlen ( (const char*)sWord ); | |
memcpy ( sBuf + 1, sWord, iBytes ); | |
sBuf[0] = MAGIC_WORD_HEAD_NONSTEMMED; | |
sBuf[iBytes+1] = '\0'; | |
@@ -26476,6 +26784,27 @@ void CSphSource_Document::BuildRegularHits ( SphDocID_t uDocid, bool bPayload, b | |
m_tHits.AddHit ( uDocid, m_pDict->GetWordIDNonStemmed ( sBuf ), m_tState.m_iHitPos ); | |
} else | |
m_tState.m_iBuildLastStep = m_iStopwordStep; | |
+#if USE_MMSEG | |
+ // works only when mmseg is on. | |
+ // zh_cn only GetThesaurus | |
+ { | |
+ int iBytes = strlen ( (const char*)sWord ); | |
+ const BYTE* tbuf_ptr = m_pTokenizer->GetThesaurus(sWord, iBytes); | |
+ if(tbuf_ptr) { | |
+ while(*tbuf_ptr) { | |
+ size_t len = strlen((const char*)tbuf_ptr); | |
+ SphWordID_t iWord = m_pDict->GetWordID ( tbuf_ptr ,len , true); | |
+ if ( iWord ) { | |
+ m_tHits.AddHit ( uDocid, iWord, m_tState.m_iHitPos ); | |
+ // mmseg; do not inc step for we are in 'one' hit. | |
+ //m_tState.m_iBuildLastStep = m_pTokenizer->TokenIsBlended() ? 0 : 1; | |
+ } | |
+ tbuf_ptr += len + 1; //move next | |
+ } | |
+ } | |
+ //end if buf | |
+ }//end GetThesaurus | |
+#endif | |
} | |
m_tState.m_bProcessingHits = ( sWord!=NULL ); | |
diff --git a/src/sphinx.h b/src/sphinx.h | |
index 0e10bae0..160c1677 100644 | |
--- a/src/sphinx.h | |
+++ b/src/sphinx.h | |
@@ -28,6 +28,7 @@ | |
#define USE_RE2 0 /// whether to compile RE2 support | |
#define USE_RLP 0 /// whether to compile RLP support | |
#define USE_WINDOWS 1 /// whether to compile for Windows | |
+ #define USE_MMSEG 1 /// enable mmseg | |
#define USE_SYSLOG 0 /// whether to use syslog for logging | |
#define UNALIGNED_RAM_ACCESS 1 | |
@@ -495,7 +496,10 @@ struct CSphTokenizerSettings | |
CSphString m_sBlendChars; | |
CSphString m_sBlendMode; | |
CSphString m_sIndexingPlugin; ///< this tokenizer wants an external plugin to process its raw output | |
- | |
+ int m_iDebug; ///< is in tokenizer debug mode. | |
+#if USE_MMSEG | |
+ CSphString m_sDictPath; ///coreseek: where to find segmentor's dict. | |
+#endif | |
CSphTokenizerSettings (); | |
}; | |
@@ -606,11 +610,16 @@ class ISphTokenizer | |
/// get synonym file info | |
virtual const CSphSavedFile & GetSynFileInfo () const { return m_tSynFileInfo; } | |
+ /// mark as debug tokenizer's output --coreseek -mmseg | |
+ virtual int DumpToken () { return m_tSettings.m_iDebug; } | |
public: | |
/// pass next buffer | |
virtual void SetBuffer ( const BYTE * sBuffer, int iLength ) = 0; | |
+ /// is pre-tokenized --coreseek | |
+ virtual bool IsPreTokenized() { return false; } | |
+ | |
/// set current index schema (only intended for the token filter plugins) | |
virtual bool SetFilterSchema ( const CSphSchema &, CSphString & ) { return true; } | |
@@ -693,6 +702,10 @@ class ISphTokenizer | |
/// set new buffer ptr (must be within current bounds) | |
virtual void SetBufferPtr ( const char * sNewPtr ) = 0; | |
+#if USE_MMSEG | |
+ virtual const BYTE* GetThesaurus(BYTE * , int ) { return NULL; } | |
+ virtual void ReloadSegDictionary() { return; } // reload mmseg's dictionary. | |
+#endif | |
/// get settings hash | |
virtual uint64_t GetSettingsFNV () const; | |
@@ -717,6 +730,9 @@ class ISphTokenizer | |
CSphLowercaser m_tLC; ///< my lowercaser | |
int m_iLastTokenLen; ///< last token length, in codepoints | |
bool m_bTokenBoundary; ///< last token boundary flag (true after boundary codepoint followed by separator) | |
+#if USE_MMSEG | |
+ int m_iLastTokenBufferLen; ///< the buffer length -- coreseek; use in mmseg patch. | |
+#endif | |
bool m_bBoundary; ///< boundary flag (true immediately after boundary codepoint) | |
int m_iBoundaryOffset; ///< boundary character offset (in bytes) | |
bool m_bWasSpecial; ///< special token flag | |
@@ -1822,6 +1838,7 @@ struct CSphSourceSettings | |
int m_iStopwordStep; ///< position step on stopword token (default is 1) | |
bool m_bIndexSP; ///< whether to index sentence and paragraph delimiters | |
bool m_bIndexFieldLens; ///< whether to index field lengths | |
+ int m_bDebugDump; ///< mmseg charset debug output feature | |
CSphVector<CSphString> m_dPrefixFields; ///< list of prefix fields | |
CSphVector<CSphString> m_dInfixFields; ///< list of infix fields | |
diff --git a/src/sphinxutils.cpp b/src/sphinxutils.cpp | |
index f02d2bbf..a8ddbc03 100644 | |
--- a/src/sphinxutils.cpp | |
+++ b/src/sphinxutils.cpp | |
@@ -407,6 +407,8 @@ static KeyDesc_t g_dKeysIndex[] = | |
{ "min_word_len", 0, NULL }, | |
{ "charset_type", KEY_REMOVED, NULL }, | |
{ "charset_table", 0, NULL }, | |
+ { "charset_dictpath", 0, NULL }, //coreseek: mmseg's dictionary path | |
+ { "charset_debug", 0, NULL }, //coreseek: debug output tokens | |
{ "ignore_chars", 0, NULL }, | |
{ "min_prefix_len", 0, NULL }, | |
{ "min_infix_len", 0, NULL }, | |
@@ -1133,7 +1135,10 @@ void sphConfTokenizer ( const CSphConfigSection & hIndex, CSphTokenizerSettings | |
{ | |
tSettings.m_iNgramLen = Max ( hIndex.GetInt ( "ngram_len" ), 0 ); | |
- if ( hIndex ( "ngram_chars" ) ) | |
+ if(hIndex("charset_debug")) | |
+ tSettings.m_iDebug = hIndex["charset_debug"].intval(); | |
+ | |
+ if ( hIndex ( "ngram_chars" ) ) | |
{ | |
if ( tSettings.m_iNgramLen ) | |
tSettings.m_iType = TOKENIZER_NGRAM; | |
@@ -1141,6 +1146,15 @@ void sphConfTokenizer ( const CSphConfigSection & hIndex, CSphTokenizerSettings | |
sphWarning ( "ngram_chars specified, but ngram_len=0; IGNORED" ); | |
} | |
+#if USE_MMSEG | |
+ //XXX:fixme : sphinx changes tokenizer create process | |
+ if (hIndex("charset_dictpath") && CSphString(hIndex.GetStr("charset_type")) =="zh_cn.utf-8" ) | |
+ { | |
+ tSettings.m_sDictPath = hIndex.GetStr("charset_dictpath"); | |
+ tSettings.m_iType = TOKENIZER_ZHCN_UTF8; | |
+ } | |
+#endif | |
+ | |
tSettings.m_sCaseFolding = hIndex.GetStr ( "charset_table" ); | |
tSettings.m_iMinWordLen = Max ( hIndex.GetInt ( "min_word_len", 1 ), 1 ); | |
tSettings.m_sNgramChars = hIndex.GetStr ( "ngram_chars" ); | |
@@ -1274,6 +1288,7 @@ bool sphConfIndex ( const CSphConfigSection & hIndex, CSphIndexSettings & tSetti | |
tSettings.m_iEmbeddedLimit = hIndex.GetSize ( "embedded_limit", 16384 ); | |
tSettings.m_bIndexFieldLens = hIndex.GetInt ( "index_field_lengths" )!=0; | |
tSettings.m_sIndexTokenFilter = hIndex.GetStr ( "index_token_filter" ); | |
+ tSettings.m_bDebugDump = hIndex.GetInt ( "charset_debug" )!=0; | |
// prefix/infix fields | |
CSphString sFields; | |
diff --git a/src/sphinxutils.h b/src/sphinxutils.h | |
index 5b433f2d..047997df 100644 | |
--- a/src/sphinxutils.h | |
+++ b/src/sphinxutils.h | |
@@ -146,6 +146,7 @@ enum | |
// where was TOKENIZER_SBCS=1 once | |
TOKENIZER_UTF8 = 2, | |
TOKENIZER_NGRAM = 3 | |
+ , TOKENIZER_ZHCN_UTF8 = 4 | |
}; | |
/// load config file | |
From cf043274c0b5ca3700b50ecd14c500d7570800d1 Mon Sep 17 00:00:00 2001 | |
From: fffonion <fffonion@gmail.com> | |
Date: Tue, 15 Mar 2016 00:05:47 -0400 | |
Subject: [PATCH 1/2] add hiragana and katagana into dRemaps | |
--- | |
src/sphinx.cpp | 12 ++++++++---- | |
1 file changed, 8 insertions(+), 4 deletions(-) | |
diff --git a/src/sphinx.cpp b/src/sphinx.cpp | |
index 70dceaf6..6b4c3159 100644 | |
--- a/src/sphinx.cpp | |
+++ b/src/sphinx.cpp | |
@@ -6077,10 +6077,14 @@ CSphTokenizer_UTF8<IS_QUERY>::CSphTokenizer_UTF8 () | |
// BEGIN CJK There is no case folding, should do this in remote tokenizer. | |
// Here just make CJK Charactor will remain. --coreseek | |
+ // 4e00 - 9fff CJK unified ideographs | |
+ // 3000 - 303f CJK symbols and punctuation | |
+ // 3040 - 30ff Hiragana/Katagana | |
+ // ff00 - ffff half/fullwidth forms | |
CSphVector<CSphRemapRange> dRemaps; | |
- dRemaps.Add ( CSphRemapRange ( 0x4e00, 0x9fff, 0x4e00 ) ); | |
+ dRemaps.Add ( CSphRemapRange ( 0x4e00, 0x9FFF, 0x4e00 ) ); | |
dRemaps.Add ( CSphRemapRange ( 0xFF00, 0xFFFF, 0xFF00 ) ); | |
- dRemaps.Add ( CSphRemapRange ( 0x3000, 0x303F, 0x3000 ) ); | |
+ dRemaps.Add ( CSphRemapRange ( 0x3000, 0x30FF, 0x3000 ) ); | |
m_tLC.AddRemaps ( dRemaps, | |
FLAG_CODEPOINT_NGRAM ); // !COMMIT support other n-gram lengths than 1 | |
@@ -6562,9 +6566,9 @@ CSphTokenizer_UTF8MMSeg<IS_QUERY>::CSphTokenizer_UTF8MMSeg () | |
{ | |
//over ride charmap | |
CSphVector<CSphRemapRange> dRemaps; | |
- dRemaps.Add ( CSphRemapRange ( 0x4e00, 0x9fff, 0x4e00 ) ); | |
+ dRemaps.Add ( CSphRemapRange ( 0x4e00, 0x9FFF, 0x4e00 ) ); | |
dRemaps.Add ( CSphRemapRange ( 0xFF00, 0xFFFF, 0xFF00 ) ); | |
- dRemaps.Add ( CSphRemapRange ( 0x3000, 0x303F, 0x3000 ) ); | |
+ dRemaps.Add ( CSphRemapRange ( 0x3000, 0x30FF, 0x3000 ) ); | |
CSphTokenizer_UTF8<IS_QUERY>::m_tLC.AddRemaps ( dRemaps, | |
FLAG_CODEPOINT_NGRAM | FLAG_CODEPOINT_SPECIAL); // !COMMIT support other n-gram lengths than 1 | |
From f1e1b00d312bc267d9baaa472daebee831cd298a Mon Sep 17 00:00:00 2001 | |
From: fffonion <fffonion@gmail.com> | |
Date: Mon, 28 Mar 2016 00:47:19 -0400 | |
Subject: [PATCH 2/2] remove symbols and punctuation | |
--- | |
src/sphinx.cpp | 12 ++++++------ | |
1 file changed, 6 insertions(+), 6 deletions(-) | |
diff --git a/src/sphinx.cpp b/src/sphinx.cpp | |
index 6b4c3159..ec9d7902 100644 | |
--- a/src/sphinx.cpp | |
+++ b/src/sphinx.cpp | |
@@ -6082,9 +6082,9 @@ CSphTokenizer_UTF8<IS_QUERY>::CSphTokenizer_UTF8 () | |
// 3040 - 30ff Hiragana/Katagana | |
// ff00 - ffff half/fullwidth forms | |
CSphVector<CSphRemapRange> dRemaps; | |
- dRemaps.Add ( CSphRemapRange ( 0x4e00, 0x9FFF, 0x4e00 ) ); | |
- dRemaps.Add ( CSphRemapRange ( 0xFF00, 0xFFFF, 0xFF00 ) ); | |
- dRemaps.Add ( CSphRemapRange ( 0x3000, 0x30FF, 0x3000 ) ); | |
+ dRemaps.Add ( CSphRemapRange ( 0x4E00, 0x9FFF, 0x4E00 ) ); | |
+ dRemaps.Add ( CSphRemapRange ( 0xFF10, 0xFFFF, 0xFF10 ) ); | |
+ dRemaps.Add ( CSphRemapRange ( 0x3040, 0x30FF, 0x3040 ) ); | |
m_tLC.AddRemaps ( dRemaps, | |
FLAG_CODEPOINT_NGRAM ); // !COMMIT support other n-gram lengths than 1 | |
@@ -6566,9 +6566,9 @@ CSphTokenizer_UTF8MMSeg<IS_QUERY>::CSphTokenizer_UTF8MMSeg () | |
{ | |
//over ride charmap | |
CSphVector<CSphRemapRange> dRemaps; | |
- dRemaps.Add ( CSphRemapRange ( 0x4e00, 0x9FFF, 0x4e00 ) ); | |
- dRemaps.Add ( CSphRemapRange ( 0xFF00, 0xFFFF, 0xFF00 ) ); | |
- dRemaps.Add ( CSphRemapRange ( 0x3000, 0x30FF, 0x3000 ) ); | |
+ dRemaps.Add ( CSphRemapRange ( 0x4E00, 0x9FFF, 0x4E00 ) ); | |
+ dRemaps.Add ( CSphRemapRange ( 0xFF10, 0xFFFF, 0xFF10 ) ); | |
+ dRemaps.Add ( CSphRemapRange ( 0x3040, 0x30FF, 0x3040 ) ); | |
CSphTokenizer_UTF8<IS_QUERY>::m_tLC.AddRemaps ( dRemaps, | |
FLAG_CODEPOINT_NGRAM | FLAG_CODEPOINT_SPECIAL); // !COMMIT support other n-gram lengths than 1 | |
From 68b1d8e74cdf602734d0540820429955a24ad322 Mon Sep 17 00:00:00 2001 | |
From: nzinfo <limn@coreseek.com> | |
Date: Tue, 11 Aug 2015 22:01:16 +0800 | |
Subject: [PATCH] add branding | |
--- | |
.gitignore | 8 ++++++++ | |
src/indexer.cpp | 6 +++--- | |
src/searchd.cpp | 14 +++++++------- | |
src/sphinx.h | 6 +++++- | |
src/sphinxutils.cpp | 8 ++++---- | |
5 files changed, 27 insertions(+), 15 deletions(-) | |
diff --git a/.gitignore b/.gitignore | |
index 66aad130..e11b3285 100644 | |
--- a/.gitignore | |
+++ b/.gitignore | |
@@ -86,3 +86,11 @@ | |
/autom4te.cache/ | |
/config/ar-lib | |
/config/compile | |
+ | |
+# for qt-creator | |
+/*.user | |
+ | |
+# for patch | |
+*.rej | |
+*.orig | |
+ | |
diff --git a/src/indexer.cpp b/src/indexer.cpp | |
index 6bb1d05c..c2cee31d 100644 | |
--- a/src/indexer.cpp | |
+++ b/src/indexer.cpp | |
@@ -1766,7 +1766,7 @@ int main ( int argc, char ** argv ) | |
"\n" | |
"Options are:\n" | |
"--config <file>\t\tread configuration from specified file\n" | |
- "\t\t\t(default is sphinx.conf)\n" | |
+ "\t\t\t(default is csft.conf)\n" | |
"--all\t\t\treindex all configured indexes\n" | |
"--quiet\t\t\tbe quiet, only print errors\n" | |
"--verbose\t\tverbose indexing issues report\n" | |
@@ -1795,8 +1795,8 @@ int main ( int argc, char ** argv ) | |
"--keep-attrs\t\tretain attributes from the old index" | |
"\n" | |
"Examples:\n" | |
- "indexer --quiet myidx1\treindex 'myidx1' defined in 'sphinx.conf'\n" | |
- "indexer --all\t\treindex all indexes defined in 'sphinx.conf'\n" ); | |
+ "indexer --quiet myidx1\treindex 'myidx1' defined in 'csft.conf'\n" | |
+ "indexer --all\t\treindex all indexes defined in 'csft.conf'\n" ); | |
} | |
return 1; | |
diff --git a/src/searchd.cpp b/src/searchd.cpp | |
index d961eb61..2e647bc6 100644 | |
--- a/src/searchd.cpp | |
+++ b/src/searchd.cpp | |
@@ -21583,7 +21583,7 @@ void ShowHelp () | |
"Options are:\n" | |
"-h, --help\t\tdisplay this help message\n" | |
"-c, --config <file>\tread configuration from specified file\n" | |
- "\t\t\t(default is sphinx.conf)\n" | |
+ "\t\t\t(default is csft.conf)\n" | |
"--stop\t\t\tsend SIGTERM to currently running searchd\n" | |
"--stopwait\t\tsend SIGTERM and wait until actual exit\n" | |
"--status\t\tget ant print status variables\n" | |
@@ -21620,9 +21620,9 @@ void ShowHelp () | |
"--safetrace\t\tonly use system backtrace() call in crash reports\n" | |
"\n" | |
"Examples:\n" | |
- "searchd --config /usr/local/sphinx/etc/sphinx.conf\n" | |
+ "searchd --config /usr/local/sphinx/etc/csft.conf\n" | |
#if USE_WINDOWS | |
- "searchd --install --config c:\\sphinx\\sphinx.conf\n" | |
+ "searchd --install --config c:\\sphinx\\csft.conf\n" | |
#endif | |
); | |
} | |
@@ -22888,12 +22888,12 @@ int WINAPI ServiceMain ( int argc, char **argv ) | |
while ( !g_sConfigFile.cstr() ) | |
{ | |
#ifdef SYSCONFDIR | |
- g_sConfigFile = SYSCONFDIR "/sphinx.conf"; | |
+ g_sConfigFile = SYSCONFDIR "/"; | |
if ( sphIsReadable ( g_sConfigFile.cstr () ) ) | |
break; | |
#endif | |
- g_sConfigFile = "./sphinx.conf"; | |
+ g_sConfigFile = "./"; | |
if ( sphIsReadable ( g_sConfigFile.cstr () ) ) | |
break; | |
@@ -22904,9 +22904,9 @@ int WINAPI ServiceMain ( int argc, char **argv ) | |
if ( !g_sConfigFile.cstr () ) | |
sphFatal ( "no readable config file (looked in " | |
#ifdef SYSCONFDIR | |
- SYSCONFDIR "/sphinx.conf, " | |
+ SYSCONFDIR "/csft.conf, " | |
#endif | |
- "./sphinx.conf)." ); | |
+ "./csft.conf)." ); | |
sphInfo ( "using config file '%s'...", g_sConfigFile.cstr () ); | |
diff --git a/src/sphinx.h b/src/sphinx.h | |
index 99a98de1..0e10bae0 100644 | |
--- a/src/sphinx.h | |
+++ b/src/sphinx.h | |
@@ -200,7 +200,7 @@ inline const DWORD * STATIC2DOCINFO ( const DWORD * pAttrs ) { return STATIC2DOC | |
#endif | |
#define SPHINX_VERSION "2.2.11" SPHINX_BITS_TAG SPHINX_TAG " (" SPH_SVN_TAGREV ")" | |
-#define SPHINX_BANNER "Sphinx " SPHINX_VERSION "\nCopyright (c) 2001-2016, Andrew Aksyonoff\nCopyright (c) 2008-2015, Sphinx Technologies Inc (http://sphinxsearch.com)\n\n" | |
+#define SPHINX_BANNER_ORIG "Sphinx " SPHINX_VERSION "\nCopyright (c) 2001-2016, Andrew Aksyonoff\nCopyright (c) 2008-2015, Sphinx Technologies Inc (http://sphinxsearch.com)\n\n" | |
#define SPHINX_SEARCHD_PROTO 1 | |
#define SPHINX_CLIENT_VERSION 1 | |
@@ -208,6 +208,10 @@ inline const DWORD * STATIC2DOCINFO ( const DWORD * pAttrs ) { return STATIC2DOC | |
#define SPH_MAX_FILENAME_LEN 512 | |
#define SPH_MAX_FIELDS 256 | |
+#define CORESEEK_BANNER "Coreseek FullText Search 5.1 \nCopyright (c) 2008-2015, Beijing Choice Software Technologies Inc (http://www.coreseek.com)\n\n" | |
+#define SPHINX_BANNER2 "" CORESEEK_BANNER "" SPHINX_BANNER_ORIG | |
+#define SPHINX_BANNER SPHINX_BANNER2 | |
+ | |
///////////////////////////////////////////////////////////////////////////// | |
extern int64_t g_iIndexerCurrentDocID; | |
diff --git a/src/sphinxutils.cpp b/src/sphinxutils.cpp | |
index 966c64af..f02d2bbf 100644 | |
--- a/src/sphinxutils.cpp | |
+++ b/src/sphinxutils.cpp | |
@@ -1574,12 +1574,12 @@ const char * sphLoadConfig ( const char * sOptConfig, bool bQuiet, CSphConfigPar | |
while ( !sOptConfig ) | |
{ | |
#ifdef SYSCONFDIR | |
- sOptConfig = SYSCONFDIR "/sphinx.conf"; | |
+ sOptConfig = SYSCONFDIR "/csft.conf"; | |
if ( sphIsReadable ( sOptConfig ) ) | |
break; | |
#endif | |
- sOptConfig = "./sphinx.conf"; | |
+ sOptConfig = "./csft.conf"; | |
if ( sphIsReadable ( sOptConfig ) ) | |
break; | |
@@ -1590,9 +1590,9 @@ const char * sphLoadConfig ( const char * sOptConfig, bool bQuiet, CSphConfigPar | |
if ( !sOptConfig ) | |
sphDie ( "no readable config file (looked in " | |
#ifdef SYSCONFDIR | |
- SYSCONFDIR "/sphinx.conf, " | |
+ SYSCONFDIR "/csft.conf, " | |
#endif | |
- "./sphinx.conf)" ); | |
+ "./csft.conf)" ); | |
if ( !bQuiet ) | |
fprintf ( stdout, "using config file '%s'...\n", sOptConfig ); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment