Skip to content

Instantly share code, notes, and snippets.

@Sennahoi
Created Oct 2, 2014
Embed
What would you like to do?
PyLucene custom char Tokenizer, Analyzer
from org.apache.pylucene.analysis import PythonAnalyzer, PythonCharTokenizer
from org.apache.lucene.analysis import Analyzer
from org.apache.lucene.analysis.core import LowerCaseTokenizer, LowerCaseFilter, StopAnalyzer, StopFilter
class LucTokenizer(PythonCharTokenizer):
    """Char tokenizer restricted to ASCII letters and digits, lowercased.

    Overrides the ``isTokenChar`` and ``normalize`` hooks of
    ``PythonCharTokenizer``; characters are handled as integer codes.
    """

    def __init__(self, version, input):
        """Forward ``version`` and ``input`` unchanged to the base tokenizer."""
        PythonCharTokenizer.__init__(self, version, input)

    def isTokenChar(self, c):
        """Return True when character code ``c`` is an ASCII digit or letter."""
        is_digit = 48 <= c <= 57    # '0'..'9'
        is_upper = 65 <= c <= 90    # 'A'..'Z'
        is_lower = 97 <= c <= 122   # 'a'..'z'
        return is_digit or is_upper or is_lower

    def normalize(self, c):
        """Map an ASCII uppercase code to lowercase; return others unchanged."""
        # 32 is the offset between 'A' (65) and 'a' (97).
        return c + 32 if 65 <= c <= 90 else c
class LucAnalyzer(PythonAnalyzer):
    """Analyzer chaining ``LucTokenizer`` with an English stop-word filter."""

    def __init__(self, version):
        """Initialise the analyzer.

        Args:
            version: Lucene compatibility version; forwarded to the base
                class and reused for every component built later.
        """
        PythonAnalyzer.__init__(self, version)
        # Keep the caller's version: the original referenced an un-imported
        # ``Version`` name in createComponents (NameError) and ignored this
        # argument entirely.
        self._version = version

    def createComponents(self, field, reader):
        """Build the token stream components for one field.

        Args:
            field: Field name; not used when choosing the components.
            reader: Character source handed to the tokenizer.

        Returns:
            ``Analyzer.TokenStreamComponents`` built from the tokenizer and
            the stop filter wrapped around it.
        """
        tokenizer = LucTokenizer(self._version, reader)
        stop_filtered = StopFilter(
            self._version, tokenizer, StopAnalyzer.ENGLISH_STOP_WORDS_SET
        )
        return Analyzer.TokenStreamComponents(tokenizer, stop_filtered)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment