Skip to content

Instantly share code, notes, and snippets.

@Sennahoi
Created October 2, 2014 06:39
Show Gist options
  • Save Sennahoi/740753384999add46fc1 to your computer and use it in GitHub Desktop.
PyLucene custom char Tokenizer, Analyzer
from org.apache.lucene.analysis import Analyzer
from org.apache.lucene.analysis.core import LowerCaseTokenizer, LowerCaseFilter, StopAnalyzer, StopFilter
from org.apache.lucene.util import Version
from org.apache.pylucene.analysis import PythonAnalyzer, PythonCharTokenizer
class LucTokenizer(PythonCharTokenizer):
    """Character tokenizer that keeps ASCII letters and digits, lowercased.

    Both hooks receive ``c`` as an integer character code and compare it
    against the ASCII ranges for ``0-9`` (48-57), ``A-Z`` (65-90) and
    ``a-z`` (97-122).
    """

    def __init__(self, version, input):
        """Forward ``version`` and ``input`` unchanged to the base tokenizer."""
        super(LucTokenizer, self).__init__(version, input)

    def isTokenChar(self, c):
        """Return True when ``c`` is the code of an ASCII digit or letter."""
        is_digit = 48 <= c <= 57
        is_upper = 65 <= c <= 90
        is_lower = 97 <= c <= 122
        return is_digit or is_upper or is_lower

    def normalize(self, c):
        """Map ``A-Z`` codes to their ``a-z`` counterparts; pass others through."""
        # 'a' - 'A' == 32, so adding 32 lowercases an ASCII uppercase letter.
        return c + 32 if 65 <= c <= 90 else c
class LucAnalyzer(PythonAnalyzer):
    """Analyzer chaining ``LucTokenizer`` with an English stop-word filter."""

    def __init__(self, version):
        """Forward ``version`` to ``PythonAnalyzer.__init__``.

        NOTE(review): ``version`` is not stored; ``createComponents`` always
        builds its components with ``Version.LUCENE_CURRENT``.
        """
        PythonAnalyzer.__init__(self, version)

    def createComponents(self, field, reader):
        """Build the token stream components for ``reader``.

        ``field`` is accepted but not used: every field gets the same chain.

        Returns:
            ``Analyzer.TokenStreamComponents`` whose source is a
            ``LucTokenizer`` over ``reader`` and whose result is that
            tokenizer wrapped in a ``StopFilter`` using
            ``StopAnalyzer.ENGLISH_STOP_WORDS_SET``.
        """
        # ``Version`` must be importable at module level
        # (``org.apache.lucene.util.Version``); without it this method
        # fails with NameError on first use.
        tokenizer = LucTokenizer(Version.LUCENE_CURRENT, reader)
        stop_filtered = StopFilter(
            Version.LUCENE_CURRENT,
            tokenizer,
            StopAnalyzer.ENGLISH_STOP_WORDS_SET,
        )
        return Analyzer.TokenStreamComponents(tokenizer, stop_filtered)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment