Skip to content

Instantly share code, notes, and snippets.

@fffonion
Created September 21, 2017 21:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save fffonion/5b2b561efdde56abcfa61102f67385bb to your computer and use it in GitHub Desktop.
Save fffonion/5b2b561efdde56abcfa61102f67385bb to your computer and use it in GitHub Desktop.
Coreseek 5.1 patch
From c7db4c68750b17532e78da313c5b5d25a1d00a3e Mon Sep 17 00:00:00 2001
From: nzinfo <limn@coreseek.com>
Date: Thu, 13 Aug 2015 23:16:20 +0800
Subject: [PATCH] add mmseg support, begin fix pre-token
---
acinclude.m4 | 89 ++++++++++++++
configure.ac | 21 +++-
src/Makefile.am | 4 +
src/sphinx.cpp | 347 ++++++++++++++++++++++++++++++++++++++++++++++++++--
src/sphinx.h | 19 ++-
src/sphinxutils.cpp | 17 ++-
src/sphinxutils.h | 1 +
7 files changed, 486 insertions(+), 12 deletions(-)
diff --git a/acinclude.m4 b/acinclude.m4
index e09697ea..3ae78b01 100644
--- a/acinclude.m4
+++ b/acinclude.m4
@@ -298,6 +298,95 @@ fi
])
dnl ---------------------------------------------------------------------------
+dnl Macro: AC_CHECK_MMSEG
+dnl ---------------------------------------------------------------------------
+
+AC_DEFUN([AC_CHECK_MMSEG],[
+
+if test [ -z "$MMSEG_CFLAGS" ]
+then
+ for CANDIDATE in "$user_mmseg_includes" "/usr/local/include/mmseg" "/usr/include/mmseg"
+ do
+ if test [ -n "$CANDIDATE" -a -r "$CANDIDATE/Segmenter.h" ]
+ then
+ MMSEG_CFLAGS="-I$CANDIDATE"
+ break
+ fi
+ done
+fi
+
+# explicit overrides will be applied later
+if test [ -z "$MMSEG_LIBS" ]
+then
+ for CANDIDATE in "$user_mmseg_libs" "/usr/lib64" \
+ "/usr/local/lib" "/usr/local/mmseg/lib" \
+ "/usr/local/lib/mmseg" "/usr/lib" \
+ "/opt/mmseg/lib"
+ do
+ if test [ -n "$CANDIDATE" -a -d "$CANDIDATE" ]
+ then
+ MMSEG_LIBS="-L$CANDIDATE -lmmseg"
+ break
+ fi
+ done
+fi
+
+# apply explicit include path overrides
+AC_ARG_WITH([mmseg-includes],
+ AC_HELP_STRING([--with-mmseg-includes], [path to libmmseg header files]),
+ [ac_cv_mmseg_includes=$withval])
+if test [ -n "$ac_cv_mmseg_includes" ]
+then
+ MMSEG_CFLAGS="-I$ac_cv_mmseg_includes"
+fi
+
+
+# apply explicit lib path overrides
+AC_ARG_WITH([mmseg-libs],
+ AC_HELP_STRING([--with-mmseg-libs], [path to libmmseg libraries]),
+ [ac_cv_mmseg_libs=$withval])
+if test [ -n "$ac_cv_mmseg_libs" ]
+then
+ # Trim trailing '.libs' if user passed it in --with-mmseg-libs option
+ ac_cv_mmseg_libs=`echo ${ac_cv_mmseg_libs} | sed -e 's/.libs$//' \
+ -e 's+.libs/$++'`
+ MMSEG_LIBS="-L$ac_cv_mmseg_libs -lmmseg"
+fi
+
+# now that we did all we could, perform final checks
+AC_MSG_CHECKING([libmmseg include files])
+if test [ -z "$MMSEG_CFLAGS" ]
+then
+ AC_MSG_ERROR([missing include files.
+
+******************************************************************************
+ERROR: cannot find libmmseg include files.
+
+To disable libmmseg support, use --without-mmseg option.
+******************************************************************************
+])
+else
+ AC_MSG_RESULT([$MMSEG_CFLAGS])
+fi
+
+AC_MSG_CHECKING([libmmseg libraries])
+if test [ -z "$MMSEG_LIBS" ]
+then
+ AC_MSG_ERROR([missing libraries.
+
+******************************************************************************
+ERROR: cannot find libmmseg libraries.
+
+To disable libmmseg support, use --without-mmseg option.
+******************************************************************************
+])
+else
+ AC_MSG_RESULT([$MMSEG_LIBS])
+fi
+
+])
+
+dnl ---------------------------------------------------------------------------
dnl Macro: AC_CHECK_LIBSTEMMER
dnl Check the libstemmer first in custom include path in --with-libstemmer=*
dnl If not given, try to guess common shared libs, and finally fall back into
diff --git a/configure.ac b/configure.ac
index d56fbd95..e08dc886 100644
--- a/configure.ac
+++ b/configure.ac
@@ -69,6 +69,7 @@ fi
AC_PROG_CC
AC_PROG_CXX
+AM_PROG_AR
AC_PROG_RANLIB
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
@@ -429,6 +430,24 @@ else
fi
AM_CONDITIONAL(USE_PGSQL, test x$ac_cv_use_pgsql != xno -o x$ac_cv_use_satic_pgsql != xno )
+dnl ---
+# check if we should compile with libmmseg (a mmseg Chinese Segmenter) support
+AC_ARG_WITH([mmseg],
+ AC_HELP_STRING([--with-mmseg], [compile with libmmseg, a mmseg Chinese Segmenter support (default is enabled)]),
+ [ac_cv_use_mmseg=$withval], [ac_cv_use_mmseg=yes]
+)
+AC_MSG_CHECKING([whether to compile with libmmseg support])
+if test x$ac_cv_use_mmseg != xno; then
+ AC_MSG_RESULT([yes])
+ AC_CHECK_MMSEG([$ac_cv_use_mmseg])
+ AC_DEFINE(USE_MMSEG,1,[Define to 1 if you want to compile with libmmseg support])
+ AC_SUBST([MMSEG_LIBS])
+ AC_SUBST([MMSEG_CFLAGS])
+else
+ AC_MSG_RESULT([no])
+fi
+AM_CONDITIONAL(USE_MMSEG, test x$ac_cv_use_mmseg != xno)
+
# add macports include directory
if (echo $MYSQL_LIBS | grep -q -- -L/opt/local/lib); then
MYSQL_CFLAGS="$MYSQL_CFLAGS -I/opt/local/include"
@@ -483,7 +502,7 @@ AM_CONDITIONAL(USE_INTERNAL_LIBSTEMMER, test x$ac_cv_use_internal_libstemmer !=
dnl ---
# we can now set preprocessor flags for both C and C++ compilers
-CPPFLAGS="$CPPFLAGS $MYSQL_CFLAGS $PGSQL_CFLAGS $LIBSTEMMER_CFLAGS"
+CPPFLAGS="$CPPFLAGS $MYSQL_CFLAGS $PGSQL_CFLAGS $LIBSTEMMER_CFLAGS $MMSEG_CFLAGS"
AC_ARG_WITH([libexpat],
diff --git a/src/Makefile.am b/src/Makefile.am
index 3129f594..8e696075 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -30,5 +30,9 @@ RLP_INC =
endif
AM_CPPFLAGS = $(LIBRE2_CFLAGS) $(RLP_INC) -DSYSCONFDIR="\"$(sysconfdir)\"" -DDATADIR="\"$(localstatedir)/data\""
+if USE_MMSEG
+COMMON_LIBS = libsphinx.a $(LIBSTEMMER_LIBS) $(MYSQL_LIBS) $(PGSQL_LIBS) $(LIBRE2_LIBS) $(RLP_LIBS) $(MMSEG_LIBS)
+else
COMMON_LIBS = libsphinx.a $(LIBSTEMMER_LIBS) $(MYSQL_LIBS) $(PGSQL_LIBS) $(LIBRE2_LIBS) $(RLP_LIBS)
+endif
LDADD = $(COMMON_LIBS)
diff --git a/src/sphinx.cpp b/src/sphinx.cpp
index 6c4a4097..70dceaf6 100644
--- a/src/sphinx.cpp
+++ b/src/sphinx.cpp
@@ -141,6 +141,16 @@
#pragma message("Automatically linking with btutils.lib")
#endif
+#if ( USE_WINDOWS && USE_MMSEG )
+ #if _DEBUG
+ #pragma comment(linker, "/defaultlib:libcss_d.lib")
+ #else
+ #pragma comment(linker, "/defaultlib:libcss.lib")
+ #endif
+ #pragma message("Automatically linking with libcss.lib")
+ #pragma warning(disable:4530) // for ugly mmseg
+#endif
+
/////////////////////////////////////////////////////////////////////////////
// logf() is not there sometimes (eg. Solaris 9)
@@ -2552,10 +2562,14 @@ class CSphTokenizer_UTF8 : public CSphTokenizerBase2
public:
CSphTokenizer_UTF8 ();
virtual void SetBuffer ( const BYTE * sBuffer, int iLength );
+ virtual bool IsPreTokenized() { return m_bPreTokenized; }
virtual BYTE * GetToken ();
virtual ISphTokenizer * Clone ( ESphTokenizerClone eMode ) const;
virtual int GetCodepointLength ( int iCode ) const;
virtual int GetMaxCodepointLength () const { return m_tLC.GetMaxCodepointLength(); }
+
+protected:
+ bool m_bPreTokenized;
};
@@ -2576,6 +2590,78 @@ class CSphTokenizer_UTF8Ngram : public CSphTokenizer_UTF8<IS_QUERY>
CSphString m_sNgramCharsStr;
};
+#if USE_MMSEG
+
+#include "SegmenterManager.h"
+#include "Segmenter.h"
+
+typedef CSR_Singleton<css::SegmenterManager> SegmenterManagerSingleInstance;
+
+template < bool IS_QUERY >
+class CSphTokenizer_UTF8MMSeg : public CSphTokenizer_UTF8<IS_QUERY>
+{
+public:
+ CSphTokenizer_UTF8MMSeg ();
+ ~CSphTokenizer_UTF8MMSeg() {
+ if(m_seg){
+ SafeDelete ( m_seg );
+ }
+ }
+
+ virtual void SetBuffer ( const BYTE * sBuffer, int iLength );
+ virtual BYTE * GetToken ();
+ virtual ISphTokenizer * Clone ( ESphTokenizerClone eMode ) const;
+ virtual const BYTE* GetThesaurus(BYTE * sBuffer, int iLength );
+ bool IsSegment(const BYTE * pCur);
+
+ CSphTokenizerBase* SetDictPath(const char* path) { m_dictpath = path; return this; }
+
+ virtual const char * GetBufferPtr () const { return (const char *) CSphTokenizer_UTF8MMSeg<IS_QUERY>::m_pCur; }
+
+ virtual const char * GetTokenStart () const { return m_segToken; }
+
+ virtual int GetLastTokenLen () const { return m_iLastTokenLenMMSeg; }
+
+ virtual void ReloadSegDictionary() {
+ if(m_seg){
+ SafeDelete ( m_seg );
+ }
+ m_seg = NULL;
+
+ if(m_mgr) {
+ SegmenterManagerSingleInstance::Free(); // free preexist instance.
+ m_mgr = NULL;
+ }
+ }
+protected:
+ char* m_segToken;
+ size_t m_segoffset;
+ int m_iLastTokenLenMMSeg;
+ BYTE m_sAccumSeg [ 3*SPH_MAX_WORD_LEN+3 ]; ///< folded token accumulator
+ BYTE * m_pAccumSeg; ///< current accumulator position
+ CSphVector<u2> m_tokenlens;
+ int m_tokenpos;
+protected:
+ // virtual bool IsSegment(const BYTE * pCur);
+ CSphString m_dictpath;
+
+ // mmseg related
+ css::Segmenter* m_seg;
+ css::SegmenterManager* m_mgr;
+ css::Segmenter* GetSegmenter(const char* dict_path){
+ int nRet = 0;
+ if(!m_mgr) {
+ m_mgr = SegmenterManagerSingleInstance::Get();
+ if(dict_path)
+ nRet = m_mgr->init(dict_path);
+ }
+ if(nRet == 0 && !m_seg)
+ m_seg = m_mgr->getSegmenter(false);
+ return m_seg;
+ }
+};
+
+#endif
struct CSphNormalForm
{
@@ -3795,6 +3881,15 @@ ISphTokenizer * sphCreateUTF8NgramTokenizer ()
return new CSphTokenizer_UTF8Ngram<false> ();
}
+#if USE_MMSEG
+ISphTokenizer * sphCreateUTF8ChineseTokenizer ( const char* dict_path )
+{
+ CSphTokenizer_UTF8MMSeg<false>* tokenizer = new CSphTokenizer_UTF8MMSeg<false> ();
+ tokenizer->SetDictPath(dict_path);
+ return tokenizer;
+}
+#endif
+
/////////////////////////////////////////////////////////////////////////////
enum
@@ -4380,6 +4475,7 @@ CSphTokenizerSettings::CSphTokenizerSettings ()
: m_iType ( TOKENIZER_UTF8 )
, m_iMinWordLen ( 1 )
, m_iNgramLen ( 0 )
+ , m_iDebug ( 0 )
{
}
@@ -4391,7 +4487,11 @@ bool LoadTokenizerSettings ( CSphReader & tReader, CSphTokenizerSettings & tSett
return true;
tSettings.m_iType = tReader.GetByte ();
- if ( tSettings.m_iType!=TOKENIZER_UTF8 && tSettings.m_iType!=TOKENIZER_NGRAM )
+#if USE_MMSEG
+ if ( tSettings.m_iType!=TOKENIZER_UTF8 && tSettings.m_iType!=TOKENIZER_NGRAM && tSettings.m_iType!=TOKENIZER_ZHCN_UTF8)
+#else
+ if ( tSettings.m_iType!=TOKENIZER_UTF8 && tSettings.m_iType!=TOKENIZER_NGRAM)
+#endif
{
sWarning = "can't load an old index with SBCS tokenizer";
return false;
@@ -4419,7 +4519,11 @@ bool LoadTokenizerSettings ( CSphReader & tReader, CSphTokenizerSettings & tSett
tSettings.m_sIgnoreChars = tReader.GetString ();
tSettings.m_iNgramLen = tReader.GetDword ();
tSettings.m_sNgramChars = tReader.GetString ();
- if ( uVersion>=15 )
+#if USE_MMSEG
+ //mmseg --coreseek, mmseg option make coreseek & sphinx's index is NOT the same.
+ tSettings.m_sDictPath = tReader.GetString ();
+#endif
+ if ( uVersion>=15 )
tSettings.m_sBlendChars = tReader.GetString ();
if ( uVersion>=24 )
tSettings.m_sBlendMode = tReader.GetString();
@@ -4450,6 +4554,10 @@ void SaveTokenizerSettings ( CSphWriter & tWriter, ISphTokenizer * pTokenizer, i
tWriter.PutString ( tSettings.m_sIgnoreChars.cstr () );
tWriter.PutDword ( tSettings.m_iNgramLen );
tWriter.PutString ( tSettings.m_sNgramChars.cstr () );
+#if USE_MMSEG
+ // if turn mmseg off, the index(s) are compat again.
+ tWriter.PutString ( tSettings.m_sDictPath.cstr () );
+#endif
tWriter.PutString ( tSettings.m_sBlendChars.cstr () );
tWriter.PutString ( tSettings.m_sBlendMode.cstr () );
}
@@ -4724,6 +4832,9 @@ ISphTokenizer * ISphTokenizer::Create ( const CSphTokenizerSettings & tSettings,
{
case TOKENIZER_UTF8: pTokenizer = sphCreateUTF8Tokenizer (); break;
case TOKENIZER_NGRAM: pTokenizer = sphCreateUTF8NgramTokenizer (); break;
+#if USE_MMSEG
+ case TOKENIZER_ZHCN_UTF8: pTokenizer = sphCreateUTF8ChineseTokenizer(tSettings.m_sDictPath.cstr()); break;
+#endif
default:
sError.SetSprintf ( "failed to create tokenizer (unknown charset type '%d')", tSettings.m_iType );
return NULL;
@@ -5963,7 +6074,20 @@ CSphTokenizer_UTF8<IS_QUERY>::CSphTokenizer_UTF8 ()
{
CSphString sTmp;
SetCaseFolding ( SPHINX_DEFAULT_UTF8_TABLE, sTmp );
- m_bHasBlend = false;
+
+ // BEGIN CJK There is no case folding, should do this in remote tokenizer.
+ // Here just make CJK Charactor will remain. --coreseek
+ CSphVector<CSphRemapRange> dRemaps;
+ dRemaps.Add ( CSphRemapRange ( 0x4e00, 0x9fff, 0x4e00 ) );
+ dRemaps.Add ( CSphRemapRange ( 0xFF00, 0xFFFF, 0xFF00 ) );
+ dRemaps.Add ( CSphRemapRange ( 0x3000, 0x303F, 0x3000 ) );
+
+ m_tLC.AddRemaps ( dRemaps,
+ FLAG_CODEPOINT_NGRAM ); // !COMMIT support other n-gram lengths than 1
+ // ENDCJK
+ m_bPreTokenized = false; // by default use original route.
+
+ m_bHasBlend = false;
}
@@ -5973,10 +6097,29 @@ void CSphTokenizer_UTF8<IS_QUERY>::SetBuffer ( const BYTE * sBuffer, int iLength
// check that old one is over and that new length is sane
assert ( iLength>=0 );
- // set buffer
+ // set buffer
m_pBuffer = sBuffer;
+ // check is pre-segment buffer, with prefix 0xFFFA
+ // if True, the following should be 0xFFFA (EF BF BA), 0x41 ('A'), [ctx] --coreseek
+ m_bPreTokenized = false;
+ if(iLength > 4)
+ {
+ // there is a ' ' (space, 32) as padding. might not true
+ unsigned char mask[] = {32, 239, 191, 186, 65};
+ unsigned char mask_bare[] = {239, 191, 186, 65};
+ if(strncmp( (const char *)mask, (const char *)sBuffer, 5) == 0) {
+ // 0xFFFA is a magic number , if it's in head, mark this buffer pre-tokenized.
+ m_bPreTokenized = true;
+ m_pBuffer += 5;
+ }else
+ if(strncmp( (const char *)mask_bare, (const char *)sBuffer, 4) == 0) {
+ m_bPreTokenized = true;
+ m_pBuffer += 4;
+ }
+ }
+
m_pBufferMax = sBuffer + iLength;
- m_pCur = sBuffer;
+ m_pCur = m_pBuffer;
m_pTokenStart = m_pTokenEnd = NULL;
m_pBlendStart = m_pBlendEnd = NULL;
@@ -5994,7 +6137,7 @@ BYTE * CSphTokenizer_UTF8<IS_QUERY>::GetToken ()
m_bTokenBoundary = false;
m_bWasSynonym = false;
- return m_bHasBlend
+ return m_bHasBlend
? DoGetToken<IS_QUERY,true>()
: DoGetToken<IS_QUERY,false>();
}
@@ -6409,6 +6552,152 @@ BYTE * CSphTokenizer_UTF8Ngram<IS_QUERY>::GetToken ()
assert ( m_iNgramLen==1 );
return CSphTokenizer_UTF8<IS_QUERY>::GetToken ();
}
+//////////////////////////////////////////////////////////////////////////
+#if USE_MMSEG
+//////////////////////////////////////////////////////////////////////////
+template < bool IS_QUERY >
+CSphTokenizer_UTF8MMSeg<IS_QUERY>::CSphTokenizer_UTF8MMSeg ()
+ :CSphTokenizer_UTF8<IS_QUERY>()
+ , m_segoffset(0)
+{
+ //over ride charmap
+ CSphVector<CSphRemapRange> dRemaps;
+ dRemaps.Add ( CSphRemapRange ( 0x4e00, 0x9fff, 0x4e00 ) );
+ dRemaps.Add ( CSphRemapRange ( 0xFF00, 0xFFFF, 0xFF00 ) );
+ dRemaps.Add ( CSphRemapRange ( 0x3000, 0x303F, 0x3000 ) );
+
+ CSphTokenizer_UTF8<IS_QUERY>::m_tLC.AddRemaps ( dRemaps,
+ FLAG_CODEPOINT_NGRAM | FLAG_CODEPOINT_SPECIAL); // !COMMIT support other n-gram lengths than 1
+ m_pAccumSeg = m_sAccumSeg;
+ //m_iLastTokenBufferLen = 0;
+ m_iLastTokenLenMMSeg = 0;
+
+ m_mgr = NULL;
+ m_seg = NULL;
+ m_tokenlens.Reserve(1024*512); // resize to 512K
+}
+
+template < bool IS_QUERY >
+void CSphTokenizer_UTF8MMSeg<IS_QUERY>::SetBuffer ( const BYTE * sBuffer, int iLength )
+{
+ CSphTokenizer_UTF8<IS_QUERY>::SetBuffer(sBuffer, iLength);
+ css::Segmenter* seg = GetSegmenter(m_dictpath.cstr());
+ if(seg)
+ seg->setBuffer((u1*)CSphTokenizer_UTF8MMSeg<IS_QUERY>::m_pBuffer, iLength);
+ else
+ sphDie ( " Tokenizer initialization failure. " );
+ m_segoffset = 0;
+ m_segToken = (char*)CSphTokenizer_UTF8MMSeg<IS_QUERY>::m_pCur;
+
+ m_tokenlens.Reset();
+ m_tokenpos = 0;
+ {
+ u2 len = 0, symlen = 0;
+ while(1){
+ len = 0;
+ char* tok = (char*)seg->peekToken(len,symlen);
+ if(!tok || !*tok || !len)
+ break;
+ seg->popToken(len);
+
+ m_tokenlens.Add(len);
+ //printf("%*.*s/p ",symlen,symlen,tok);
+ }
+ }
+}
+
+template < bool IS_QUERY >
+bool CSphTokenizer_UTF8MMSeg<IS_QUERY>::IsSegment(const BYTE * pCur)
+{
+ // this code might have bug, but as it will removed in next release...
+ size_t offset = pCur - CSphTokenizer_UTF8<IS_QUERY>::m_pBuffer;
+ //if(offset == 0) return false;
+ //printf("pcur: %s\n", pCur);
+
+ //css::Segmenter* seg = GetSegmenter(m_dictpath.cstr()); //TODO fill blank here
+ {
+ u2 len = 0, symlen = 0;
+ while(m_segoffset < offset) {
+ //tok = (const char*)seg->peekToken(len, symlen);
+ //seg->popToken(len);
+ len = m_tokenlens[m_tokenpos];
+ m_tokenpos ++;
+ m_segoffset += len;
+ //printf("tok: %*.*s, len=%d\t ",len,len,tok, len);
+ if(m_tokenpos >= m_tokenlens.GetLength() || len==0){
+ //break?
+ break;
+ }
+ }
+ /*
+ printf("\n");
+ printf("seg_off %d vs off %d\n", m_segoffset, offset);
+ if(m_segoffset != offset)
+ printf("seg_pcur: %s\n", pCur);
+ */
+ return (m_segoffset == offset);
+ } //end if seg
+ return true;
+}
+
+template < bool IS_QUERY >
+BYTE * CSphTokenizer_UTF8MMSeg<IS_QUERY>::GetToken ()
+{
+ //return CSphTokenizer_UTF8<IS_QUERY>::GetToken();
+ m_iLastTokenLenMMSeg = 0;
+ //BYTE* tok = CSphTokenizer_UTF8::GetToken();
+ while(!IsSegment(CSphTokenizer_UTF8<IS_QUERY>::m_pCur) || m_pAccumSeg == m_sAccumSeg)
+ {
+ BYTE* tok = CSphTokenizer_UTF8<IS_QUERY>::GetToken();
+ //printf("utf8_token: %s \t ", tok);
+ if(!tok){
+ m_iLastTokenLenMMSeg = 0;
+ return NULL;
+ }
+
+ int token_buf_len = strlen((const char*)tok);
+
+ if(m_pAccumSeg == m_sAccumSeg)
+ m_segToken = (char*)CSphTokenizer_UTF8<IS_QUERY>::m_pTokenStart;
+
+ if ( (m_pAccumSeg - m_sAccumSeg)<SPH_MAX_WORD_LEN ) {
+ ::memcpy(m_pAccumSeg, tok, token_buf_len);
+ m_pAccumSeg += token_buf_len;
+ m_iLastTokenLenMMSeg += CSphTokenizer_UTF8<IS_QUERY>::GetLastTokenLen();
+ }
+ }
+ {
+ *m_pAccumSeg = 0;
+ //m_iLastTokenBufferLen = m_pAccumSeg - m_sAccumSeg;
+ m_pAccumSeg = m_sAccumSeg;
+
+ return m_sAccumSeg;
+ }
+}
+
+template < bool IS_QUERY >
+ISphTokenizer * CSphTokenizer_UTF8MMSeg<IS_QUERY>::Clone ( ESphTokenizerClone eMode ) const
+{
+ CSphTokenizerBase * pClone;
+ if ( eMode!=SPH_CLONE_INDEX ) {
+ pClone = (new CSphTokenizer_UTF8MMSeg<true>())->SetDictPath(m_dictpath.cstr());
+ }else{
+ pClone = (new CSphTokenizer_UTF8MMSeg<false>())->SetDictPath(m_dictpath.cstr());
+ }
+ pClone->CloneBase ( this, eMode );
+ return pClone;
+}
+
+template < bool IS_QUERY >
+const BYTE* CSphTokenizer_UTF8MMSeg<IS_QUERY>::GetThesaurus(BYTE * sBuffer, int iLength )
+{
+ css::Segmenter* seg = GetSegmenter(m_dictpath.cstr());
+ if(seg)
+ return (const BYTE*)seg->thesaurus((const char*)sBuffer, iLength);
+ return NULL;
+}
+
+#endif
//////////////////////////////////////////////////////////////////////////
@@ -9658,6 +9947,7 @@ void CSphIndex::SetupQueryTokenizer()
// create and setup a master copy of query time tokenizer
// that we can then use to create lightweight clones
SafeDelete ( m_pQueryTokenizer );
+ m_pTokenizer->ReloadSegDictionary();
m_pQueryTokenizer = m_pTokenizer->Clone ( SPH_CLONE_QUERY );
if ( IsStarDict() )
{
@@ -25810,6 +26100,7 @@ void CSphSource::Setup ( const CSphSourceSettings & tSettings )
m_bIndexExactWords = tSettings.m_bIndexExactWords;
m_iOvershortStep = Min ( Max ( tSettings.m_iOvershortStep, 0 ), 1 );
m_iStopwordStep = Min ( Max ( tSettings.m_iStopwordStep, 0 ), 1 );
+ m_bDebugDump = tSettings.m_bDebugDump; //coreseek: assign debug charset setting
m_bIndexSP = tSettings.m_bIndexSP;
m_dPrefixFields = tSettings.m_dPrefixFields;
m_dInfixFields = tSettings.m_dInfixFields;
@@ -26414,11 +26705,28 @@ void CSphSource_Document::BuildRegularHits ( SphDocID_t uDocid, bool bPayload, b
while ( ( m_iMaxHits==0 || m_tHits.m_dData.GetLength()+BUILD_REGULAR_HITS_COUNT<m_iMaxHits )
&& ( sWord = m_pTokenizer->GetToken() )!=NULL )
{
+ //FIXME: dump token to console --coreseek
+ //debug dump
+ if(m_pTokenizer->DumpToken()) {
+ printf("%s_x ", sWord); // make the same as pre-tokenized text.
+ }
+
+ // fix sWord if in pre-tokenized mode.
+ int iBytes = strlen ( (const char*)sWord );
+ bool bAdvancePos = true;
+ if(m_pTokenizer->IsPreTokenized()) {
+ // m_tState.m_iHitPos should not be 0, add for some stupid pass a none _x token at the very beginning.
+ if(sWord[iBytes-1] != 'x' && m_tState.m_iHitPos)
+ bAdvancePos = false; // not an advance token.
+ sWord[iBytes-2] = '\0'; // change token_x -> token\0x
+ iBytes -= 2; // decrease length
+ }
+
m_pDict->SetApplyMorph ( m_pTokenizer->GetMorphFlag() );
int iLastBlendedStart = TrackBlendedStart ( m_pTokenizer, iBlendedHitsStart, m_tHits.Length() );
- if ( !bPayload )
+ if ( !bPayload && bAdvancePos)
{
HITMAN::AddPos ( &m_tState.m_iHitPos, m_tState.m_iBuildLastStep + m_pTokenizer->GetOvershortCount()*m_iOvershortStep );
if ( m_pTokenizer->GetBoundary() )
@@ -26430,7 +26738,7 @@ void CSphSource_Document::BuildRegularHits ( SphDocID_t uDocid, bool bPayload, b
if ( bGlobalPartialMatch )
{
- int iBytes = strlen ( (const char*)sWord );
+ //int iBytes = strlen ( (const char*)sWord );
memcpy ( sBuf + 1, sWord, iBytes );
sBuf[0] = MAGIC_WORD_HEAD;
sBuf[iBytes+1] = '\0';
@@ -26440,7 +26748,7 @@ void CSphSource_Document::BuildRegularHits ( SphDocID_t uDocid, bool bPayload, b
ESphTokenMorph eMorph = m_pTokenizer->GetTokenMorph();
if ( m_bIndexExactWords && eMorph!=SPH_TOKEN_MORPH_GUESS )
{
- int iBytes = strlen ( (const char*)sWord );
+ //int iBytes = strlen ( (const char*)sWord );
memcpy ( sBuf + 1, sWord, iBytes );
sBuf[0] = MAGIC_WORD_HEAD_NONSTEMMED;
sBuf[iBytes+1] = '\0';
@@ -26476,6 +26784,27 @@ void CSphSource_Document::BuildRegularHits ( SphDocID_t uDocid, bool bPayload, b
m_tHits.AddHit ( uDocid, m_pDict->GetWordIDNonStemmed ( sBuf ), m_tState.m_iHitPos );
} else
m_tState.m_iBuildLastStep = m_iStopwordStep;
+#if USE_MMSEG
+ // works only when mmseg is on.
+ // zh_cn only GetThesaurus
+ {
+ int iBytes = strlen ( (const char*)sWord );
+ const BYTE* tbuf_ptr = m_pTokenizer->GetThesaurus(sWord, iBytes);
+ if(tbuf_ptr) {
+ while(*tbuf_ptr) {
+ size_t len = strlen((const char*)tbuf_ptr);
+ SphWordID_t iWord = m_pDict->GetWordID ( tbuf_ptr ,len , true);
+ if ( iWord ) {
+ m_tHits.AddHit ( uDocid, iWord, m_tState.m_iHitPos );
+ // mmseg; do not inc step for we are in 'one' hit.
+ //m_tState.m_iBuildLastStep = m_pTokenizer->TokenIsBlended() ? 0 : 1;
+ }
+ tbuf_ptr += len + 1; //move next
+ }
+ }
+ //end if buf
+ }//end GetThesaurus
+#endif
}
m_tState.m_bProcessingHits = ( sWord!=NULL );
diff --git a/src/sphinx.h b/src/sphinx.h
index 0e10bae0..160c1677 100644
--- a/src/sphinx.h
+++ b/src/sphinx.h
@@ -28,6 +28,7 @@
#define USE_RE2 0 /// whether to compile RE2 support
#define USE_RLP 0 /// whether to compile RLP support
#define USE_WINDOWS 1 /// whether to compile for Windows
+ #define USE_MMSEG 1 /// enable mmseg
#define USE_SYSLOG 0 /// whether to use syslog for logging
#define UNALIGNED_RAM_ACCESS 1
@@ -495,7 +496,10 @@ struct CSphTokenizerSettings
CSphString m_sBlendChars;
CSphString m_sBlendMode;
CSphString m_sIndexingPlugin; ///< this tokenizer wants an external plugin to process its raw output
-
+ int m_iDebug; ///< is in tokenizer debug mode.
+#if USE_MMSEG
+ CSphString m_sDictPath; ///coreseek: where to find segmentor's dict.
+#endif
CSphTokenizerSettings ();
};
@@ -606,11 +610,16 @@ class ISphTokenizer
/// get synonym file info
virtual const CSphSavedFile & GetSynFileInfo () const { return m_tSynFileInfo; }
+ /// mark as debug tokenizer's output --coreseek -mmseg
+ virtual int DumpToken () { return m_tSettings.m_iDebug; }
public:
/// pass next buffer
virtual void SetBuffer ( const BYTE * sBuffer, int iLength ) = 0;
+ /// is pre-tokenized --coreseek
+ virtual bool IsPreTokenized() { return false; }
+
/// set current index schema (only intended for the token filter plugins)
virtual bool SetFilterSchema ( const CSphSchema &, CSphString & ) { return true; }
@@ -693,6 +702,10 @@ class ISphTokenizer
/// set new buffer ptr (must be within current bounds)
virtual void SetBufferPtr ( const char * sNewPtr ) = 0;
+#if USE_MMSEG
+ virtual const BYTE* GetThesaurus(BYTE * , int ) { return NULL; }
+ virtual void ReloadSegDictionary() { return; } // reload mmseg's dictionary.
+#endif
/// get settings hash
virtual uint64_t GetSettingsFNV () const;
@@ -717,6 +730,9 @@ class ISphTokenizer
CSphLowercaser m_tLC; ///< my lowercaser
int m_iLastTokenLen; ///< last token length, in codepoints
bool m_bTokenBoundary; ///< last token boundary flag (true after boundary codepoint followed by separator)
+#if USE_MMSEG
+ int m_iLastTokenBufferLen; ///< the buffer length -- coreseek; use in mmseg patch.
+#endif
bool m_bBoundary; ///< boundary flag (true immediately after boundary codepoint)
int m_iBoundaryOffset; ///< boundary character offset (in bytes)
bool m_bWasSpecial; ///< special token flag
@@ -1822,6 +1838,7 @@ struct CSphSourceSettings
int m_iStopwordStep; ///< position step on stopword token (default is 1)
bool m_bIndexSP; ///< whether to index sentence and paragraph delimiters
bool m_bIndexFieldLens; ///< whether to index field lengths
+ int m_bDebugDump; ///< mmseg charset debug output feature
CSphVector<CSphString> m_dPrefixFields; ///< list of prefix fields
CSphVector<CSphString> m_dInfixFields; ///< list of infix fields
diff --git a/src/sphinxutils.cpp b/src/sphinxutils.cpp
index f02d2bbf..a8ddbc03 100644
--- a/src/sphinxutils.cpp
+++ b/src/sphinxutils.cpp
@@ -407,6 +407,8 @@ static KeyDesc_t g_dKeysIndex[] =
{ "min_word_len", 0, NULL },
{ "charset_type", KEY_REMOVED, NULL },
{ "charset_table", 0, NULL },
+ { "charset_dictpath", 0, NULL }, //coreseek: mmseg's dictionary path
+ { "charset_debug", 0, NULL }, //coreseek: debug output tokens
{ "ignore_chars", 0, NULL },
{ "min_prefix_len", 0, NULL },
{ "min_infix_len", 0, NULL },
@@ -1133,7 +1135,10 @@ void sphConfTokenizer ( const CSphConfigSection & hIndex, CSphTokenizerSettings
{
tSettings.m_iNgramLen = Max ( hIndex.GetInt ( "ngram_len" ), 0 );
- if ( hIndex ( "ngram_chars" ) )
+ if(hIndex("charset_debug"))
+ tSettings.m_iDebug = hIndex["charset_debug"].intval();
+
+ if ( hIndex ( "ngram_chars" ) )
{
if ( tSettings.m_iNgramLen )
tSettings.m_iType = TOKENIZER_NGRAM;
@@ -1141,6 +1146,15 @@ void sphConfTokenizer ( const CSphConfigSection & hIndex, CSphTokenizerSettings
sphWarning ( "ngram_chars specified, but ngram_len=0; IGNORED" );
}
+#if USE_MMSEG
+ //XXX:fixme : sphinx changes tokenizer create process
+ if (hIndex("charset_dictpath") && CSphString(hIndex.GetStr("charset_type")) =="zh_cn.utf-8" )
+ {
+ tSettings.m_sDictPath = hIndex.GetStr("charset_dictpath");
+ tSettings.m_iType = TOKENIZER_ZHCN_UTF8;
+ }
+#endif
+
tSettings.m_sCaseFolding = hIndex.GetStr ( "charset_table" );
tSettings.m_iMinWordLen = Max ( hIndex.GetInt ( "min_word_len", 1 ), 1 );
tSettings.m_sNgramChars = hIndex.GetStr ( "ngram_chars" );
@@ -1274,6 +1288,7 @@ bool sphConfIndex ( const CSphConfigSection & hIndex, CSphIndexSettings & tSetti
tSettings.m_iEmbeddedLimit = hIndex.GetSize ( "embedded_limit", 16384 );
tSettings.m_bIndexFieldLens = hIndex.GetInt ( "index_field_lengths" )!=0;
tSettings.m_sIndexTokenFilter = hIndex.GetStr ( "index_token_filter" );
+ tSettings.m_bDebugDump = hIndex.GetInt ( "charset_debug" )!=0;
// prefix/infix fields
CSphString sFields;
diff --git a/src/sphinxutils.h b/src/sphinxutils.h
index 5b433f2d..047997df 100644
--- a/src/sphinxutils.h
+++ b/src/sphinxutils.h
@@ -146,6 +146,7 @@ enum
// where was TOKENIZER_SBCS=1 once
TOKENIZER_UTF8 = 2,
TOKENIZER_NGRAM = 3
+ , TOKENIZER_ZHCN_UTF8 = 4
};
/// load config file
From cf043274c0b5ca3700b50ecd14c500d7570800d1 Mon Sep 17 00:00:00 2001
From: fffonion <fffonion@gmail.com>
Date: Tue, 15 Mar 2016 00:05:47 -0400
Subject: [PATCH 1/2] add hiragana and katagana into dRemaps
---
src/sphinx.cpp | 12 ++++++++----
1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/src/sphinx.cpp b/src/sphinx.cpp
index 70dceaf6..6b4c3159 100644
--- a/src/sphinx.cpp
+++ b/src/sphinx.cpp
@@ -6077,10 +6077,14 @@ CSphTokenizer_UTF8<IS_QUERY>::CSphTokenizer_UTF8 ()
// BEGIN CJK There is no case folding, should do this in remote tokenizer.
// Here just make CJK Charactor will remain. --coreseek
+ // 4e00 - 9fff CJK unified ideographs
+ // 3000 - 303f CJK symbols and punctuation
+ // 3040 - 30ff Hiragana/Katagana
+ // ff00 - ffff half/fullwidth forms
CSphVector<CSphRemapRange> dRemaps;
- dRemaps.Add ( CSphRemapRange ( 0x4e00, 0x9fff, 0x4e00 ) );
+ dRemaps.Add ( CSphRemapRange ( 0x4e00, 0x9FFF, 0x4e00 ) );
dRemaps.Add ( CSphRemapRange ( 0xFF00, 0xFFFF, 0xFF00 ) );
- dRemaps.Add ( CSphRemapRange ( 0x3000, 0x303F, 0x3000 ) );
+ dRemaps.Add ( CSphRemapRange ( 0x3000, 0x30FF, 0x3000 ) );
m_tLC.AddRemaps ( dRemaps,
FLAG_CODEPOINT_NGRAM ); // !COMMIT support other n-gram lengths than 1
@@ -6562,9 +6566,9 @@ CSphTokenizer_UTF8MMSeg<IS_QUERY>::CSphTokenizer_UTF8MMSeg ()
{
//over ride charmap
CSphVector<CSphRemapRange> dRemaps;
- dRemaps.Add ( CSphRemapRange ( 0x4e00, 0x9fff, 0x4e00 ) );
+ dRemaps.Add ( CSphRemapRange ( 0x4e00, 0x9FFF, 0x4e00 ) );
dRemaps.Add ( CSphRemapRange ( 0xFF00, 0xFFFF, 0xFF00 ) );
- dRemaps.Add ( CSphRemapRange ( 0x3000, 0x303F, 0x3000 ) );
+ dRemaps.Add ( CSphRemapRange ( 0x3000, 0x30FF, 0x3000 ) );
CSphTokenizer_UTF8<IS_QUERY>::m_tLC.AddRemaps ( dRemaps,
FLAG_CODEPOINT_NGRAM | FLAG_CODEPOINT_SPECIAL); // !COMMIT support other n-gram lengths than 1
From f1e1b00d312bc267d9baaa472daebee831cd298a Mon Sep 17 00:00:00 2001
From: fffonion <fffonion@gmail.com>
Date: Mon, 28 Mar 2016 00:47:19 -0400
Subject: [PATCH 2/2] remove symbols and punctuation
---
src/sphinx.cpp | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/src/sphinx.cpp b/src/sphinx.cpp
index 6b4c3159..ec9d7902 100644
--- a/src/sphinx.cpp
+++ b/src/sphinx.cpp
@@ -6082,9 +6082,9 @@ CSphTokenizer_UTF8<IS_QUERY>::CSphTokenizer_UTF8 ()
// 3040 - 30ff Hiragana/Katagana
// ff00 - ffff half/fullwidth forms
CSphVector<CSphRemapRange> dRemaps;
- dRemaps.Add ( CSphRemapRange ( 0x4e00, 0x9FFF, 0x4e00 ) );
- dRemaps.Add ( CSphRemapRange ( 0xFF00, 0xFFFF, 0xFF00 ) );
- dRemaps.Add ( CSphRemapRange ( 0x3000, 0x30FF, 0x3000 ) );
+ dRemaps.Add ( CSphRemapRange ( 0x4E00, 0x9FFF, 0x4E00 ) );
+ dRemaps.Add ( CSphRemapRange ( 0xFF10, 0xFFFF, 0xFF10 ) );
+ dRemaps.Add ( CSphRemapRange ( 0x3040, 0x30FF, 0x3040 ) );
m_tLC.AddRemaps ( dRemaps,
FLAG_CODEPOINT_NGRAM ); // !COMMIT support other n-gram lengths than 1
@@ -6566,9 +6566,9 @@ CSphTokenizer_UTF8MMSeg<IS_QUERY>::CSphTokenizer_UTF8MMSeg ()
{
//over ride charmap
CSphVector<CSphRemapRange> dRemaps;
- dRemaps.Add ( CSphRemapRange ( 0x4e00, 0x9FFF, 0x4e00 ) );
- dRemaps.Add ( CSphRemapRange ( 0xFF00, 0xFFFF, 0xFF00 ) );
- dRemaps.Add ( CSphRemapRange ( 0x3000, 0x30FF, 0x3000 ) );
+ dRemaps.Add ( CSphRemapRange ( 0x4E00, 0x9FFF, 0x4E00 ) );
+ dRemaps.Add ( CSphRemapRange ( 0xFF10, 0xFFFF, 0xFF10 ) );
+ dRemaps.Add ( CSphRemapRange ( 0x3040, 0x30FF, 0x3040 ) );
CSphTokenizer_UTF8<IS_QUERY>::m_tLC.AddRemaps ( dRemaps,
FLAG_CODEPOINT_NGRAM | FLAG_CODEPOINT_SPECIAL); // !COMMIT support other n-gram lengths than 1
From 68b1d8e74cdf602734d0540820429955a24ad322 Mon Sep 17 00:00:00 2001
From: nzinfo <limn@coreseek.com>
Date: Tue, 11 Aug 2015 22:01:16 +0800
Subject: [PATCH] add branding
---
.gitignore | 8 ++++++++
src/indexer.cpp | 6 +++---
src/searchd.cpp | 14 +++++++-------
src/sphinx.h | 6 +++++-
src/sphinxutils.cpp | 8 ++++----
5 files changed, 27 insertions(+), 15 deletions(-)
diff --git a/.gitignore b/.gitignore
index 66aad130..e11b3285 100644
--- a/.gitignore
+++ b/.gitignore
@@ -86,3 +86,11 @@
/autom4te.cache/
/config/ar-lib
/config/compile
+
+# for qt-creator
+/*.user
+
+# for patch
+*.rej
+*.orig
+
diff --git a/src/indexer.cpp b/src/indexer.cpp
index 6bb1d05c..c2cee31d 100644
--- a/src/indexer.cpp
+++ b/src/indexer.cpp
@@ -1766,7 +1766,7 @@ int main ( int argc, char ** argv )
"\n"
"Options are:\n"
"--config <file>\t\tread configuration from specified file\n"
- "\t\t\t(default is sphinx.conf)\n"
+ "\t\t\t(default is csft.conf)\n"
"--all\t\t\treindex all configured indexes\n"
"--quiet\t\t\tbe quiet, only print errors\n"
"--verbose\t\tverbose indexing issues report\n"
@@ -1795,8 +1795,8 @@ int main ( int argc, char ** argv )
"--keep-attrs\t\tretain attributes from the old index"
"\n"
"Examples:\n"
- "indexer --quiet myidx1\treindex 'myidx1' defined in 'sphinx.conf'\n"
- "indexer --all\t\treindex all indexes defined in 'sphinx.conf'\n" );
+ "indexer --quiet myidx1\treindex 'myidx1' defined in 'csft.conf'\n"
+ "indexer --all\t\treindex all indexes defined in 'csft.conf'\n" );
}
return 1;
diff --git a/src/searchd.cpp b/src/searchd.cpp
index d961eb61..2e647bc6 100644
--- a/src/searchd.cpp
+++ b/src/searchd.cpp
@@ -21583,7 +21583,7 @@ void ShowHelp ()
"Options are:\n"
"-h, --help\t\tdisplay this help message\n"
"-c, --config <file>\tread configuration from specified file\n"
- "\t\t\t(default is sphinx.conf)\n"
+ "\t\t\t(default is csft.conf)\n"
"--stop\t\t\tsend SIGTERM to currently running searchd\n"
"--stopwait\t\tsend SIGTERM and wait until actual exit\n"
"--status\t\tget ant print status variables\n"
@@ -21620,9 +21620,9 @@ void ShowHelp ()
"--safetrace\t\tonly use system backtrace() call in crash reports\n"
"\n"
"Examples:\n"
- "searchd --config /usr/local/sphinx/etc/sphinx.conf\n"
+ "searchd --config /usr/local/sphinx/etc/csft.conf\n"
#if USE_WINDOWS
- "searchd --install --config c:\\sphinx\\sphinx.conf\n"
+ "searchd --install --config c:\\sphinx\\csft.conf\n"
#endif
);
}
@@ -22888,12 +22888,12 @@ int WINAPI ServiceMain ( int argc, char **argv )
while ( !g_sConfigFile.cstr() )
{
#ifdef SYSCONFDIR
- g_sConfigFile = SYSCONFDIR "/sphinx.conf";
+ g_sConfigFile = SYSCONFDIR "/csft.conf";
if ( sphIsReadable ( g_sConfigFile.cstr () ) )
break;
#endif
- g_sConfigFile = "./sphinx.conf";
+ g_sConfigFile = "./csft.conf";
if ( sphIsReadable ( g_sConfigFile.cstr () ) )
break;
@@ -22904,9 +22904,9 @@ int WINAPI ServiceMain ( int argc, char **argv )
if ( !g_sConfigFile.cstr () )
sphFatal ( "no readable config file (looked in "
#ifdef SYSCONFDIR
- SYSCONFDIR "/sphinx.conf, "
+ SYSCONFDIR "/csft.conf, "
#endif
- "./sphinx.conf)." );
+ "./csft.conf)." );
sphInfo ( "using config file '%s'...", g_sConfigFile.cstr () );
diff --git a/src/sphinx.h b/src/sphinx.h
index 99a98de1..0e10bae0 100644
--- a/src/sphinx.h
+++ b/src/sphinx.h
@@ -200,7 +200,7 @@ inline const DWORD * STATIC2DOCINFO ( const DWORD * pAttrs ) { return STATIC2DOC
#endif
#define SPHINX_VERSION "2.2.11" SPHINX_BITS_TAG SPHINX_TAG " (" SPH_SVN_TAGREV ")"
-#define SPHINX_BANNER "Sphinx " SPHINX_VERSION "\nCopyright (c) 2001-2016, Andrew Aksyonoff\nCopyright (c) 2008-2015, Sphinx Technologies Inc (http://sphinxsearch.com)\n\n"
+#define SPHINX_BANNER_ORIG "Sphinx " SPHINX_VERSION "\nCopyright (c) 2001-2016, Andrew Aksyonoff\nCopyright (c) 2008-2015, Sphinx Technologies Inc (http://sphinxsearch.com)\n\n"
#define SPHINX_SEARCHD_PROTO 1
#define SPHINX_CLIENT_VERSION 1
@@ -208,6 +208,10 @@ inline const DWORD * STATIC2DOCINFO ( const DWORD * pAttrs ) { return STATIC2DOC
#define SPH_MAX_FILENAME_LEN 512
#define SPH_MAX_FIELDS 256
+#define CORESEEK_BANNER "Coreseek FullText Search 5.1 \nCopyright (c) 2008-2015, Beijing Choice Software Technologies Inc (http://www.coreseek.com)\n\n"
+#define SPHINX_BANNER2 "" CORESEEK_BANNER "" SPHINX_BANNER_ORIG
+#define SPHINX_BANNER SPHINX_BANNER2
+
/////////////////////////////////////////////////////////////////////////////
extern int64_t g_iIndexerCurrentDocID;
diff --git a/src/sphinxutils.cpp b/src/sphinxutils.cpp
index 966c64af..f02d2bbf 100644
--- a/src/sphinxutils.cpp
+++ b/src/sphinxutils.cpp
@@ -1574,12 +1574,12 @@ const char * sphLoadConfig ( const char * sOptConfig, bool bQuiet, CSphConfigPar
while ( !sOptConfig )
{
#ifdef SYSCONFDIR
- sOptConfig = SYSCONFDIR "/sphinx.conf";
+ sOptConfig = SYSCONFDIR "/csft.conf";
if ( sphIsReadable ( sOptConfig ) )
break;
#endif
- sOptConfig = "./sphinx.conf";
+ sOptConfig = "./csft.conf";
if ( sphIsReadable ( sOptConfig ) )
break;
@@ -1590,9 +1590,9 @@ const char * sphLoadConfig ( const char * sOptConfig, bool bQuiet, CSphConfigPar
if ( !sOptConfig )
sphDie ( "no readable config file (looked in "
#ifdef SYSCONFDIR
- SYSCONFDIR "/sphinx.conf, "
+ SYSCONFDIR "/csft.conf, "
#endif
- "./sphinx.conf)" );
+ "./csft.conf)" );
if ( !bQuiet )
fprintf ( stdout, "using config file '%s'...\n", sOptConfig );
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment