# PyLucene custom char Tokenizer and Analyzer
from org.apache.pylucene.analysis import PythonAnalyzer, PythonCharTokenizer
from org.apache.lucene.analysis import Analyzer
from org.apache.lucene.analysis.core import LowerCaseTokenizer, LowerCaseFilter, StopAnalyzer, StopFilter
class LucTokenizer(PythonCharTokenizer):
    """Character tokenizer that keeps ASCII letters and digits only.

    Every other character acts as a token separator, and upper-case
    ASCII letters are folded to lower case while tokens are built.
    """

    def __init__(self, version, input):
        """Forward the version and the input reader to the base tokenizer."""
        PythonCharTokenizer.__init__(self, version, input)

    def isTokenChar(self, c):
        """Return True when the integer character code `c` belongs in a token.

        Accepted ranges are 48-57 ('0'-'9'), 65-90 ('A'-'Z') and
        97-122 ('a'-'z').
        """
        is_digit = 48 <= c <= 57
        is_upper = 65 <= c <= 90
        is_lower = 97 <= c <= 122
        return is_digit or is_upper or is_lower

    def normalize(self, c):
        """Lower-case the character code `c`.

        'A'-'Z' (65-90) is shifted by 32 onto 'a'-'z'; any other code is
        returned unchanged.
        """
        return c + 32 if 65 <= c <= 90 else c
class LucAnalyzer(PythonAnalyzer):
    """Analyzer that tokenizes with LucTokenizer and removes English stop words.

    The version given to the constructor is used for every component the
    analyzer creates.
    """

    def __init__(self, version):
        """Initialise the base analyzer and remember `version`.

        `version` is the Lucene compatibility version (for example
        Version.LUCENE_CURRENT) that is later handed to the tokenizer and
        to the stop filter.
        """
        PythonAnalyzer.__init__(self, version)
        # Keep the caller's version so createComponents() does not depend on
        # a module-level `Version` name (which this file never imports) and
        # so the requested version is actually honoured.
        self._version = version

    def createComponents(self, field, reader):
        """Build the analysis chain for `field`, reading text from `reader`.

        The chain is LucTokenizer followed by a StopFilter using
        StopAnalyzer.ENGLISH_STOP_WORDS_SET. `field` is not used: every
        field gets the same chain.

        Returns an Analyzer.TokenStreamComponents wrapping the tokenizer
        (source) and the stop filter (final stream).
        """
        tokenizer = LucTokenizer(self._version, reader)
        stop_filtered = StopFilter(
            self._version, tokenizer, StopAnalyzer.ENGLISH_STOP_WORDS_SET
        )
        return Analyzer.TokenStreamComponents(tokenizer, stop_filtered)
