Skip to content

Instantly share code, notes, and snippets.

@iceout
Created December 8, 2014 13:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save iceout/03a6334be8d7fcfaf44c to your computer and use it in GitHub Desktop.
test scorePayload of PythonDefaultSimilarity in pylucene
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import lucene
lucene.initVM(vmargs=['-Djava.awt.headless=true'])
from org.apache.lucene.util import Version
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.index import IndexWriterConfig
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.index import Term
from org.apache.lucene.analysis.payloads import FloatEncoder
from org.apache.lucene.analysis.payloads import PayloadHelper
from org.apache.lucene.analysis.payloads import DelimitedPayloadTokenFilter
from org.apache.lucene.search.payloads import AveragePayloadFunction
from org.apache.lucene.search.payloads import PayloadTermQuery
from org.apache.lucene.analysis.core import WhitespaceTokenizer
from org.apache.lucene.analysis.core import LowerCaseFilter
from org.apache.lucene.index import IndexWriter
from org.apache.lucene.store import RAMDirectory
from org.apache.pylucene.analysis import PythonAnalyzer
from org.apache.pylucene.search.similarities import PythonDefaultSimilarity
from org.apache.pylucene.queryparser.classic import PythonQueryParser
from org.apache.lucene.document import Document
from org.apache.lucene.document import Field
# Corpus for the payload demo.  Tokens may carry a "|<float>" suffix; the
# DelimitedPayloadTokenFilter in PayloadAnalyzer strips it and stores the
# float as the token's payload.  Unsuffixed docs (indexes 1 and 4) are
# plain-text controls with no payloads.
docs = [
"The quick|2.0 red|2.0 fox|10.0 jumped|5.0 over the lazy|2.0 brown|2.0 dogs|10.0",
"The quick red fox jumped over the lazy brown dogs",
"The quick|2.0 red|2.0 fox|10.0 jumped|5.0 over the old|2.0 brown|2.0 box|10.0",
"Mary|10.0 had a little|2.0 lamb|10.0 whose fleece|10.0 was|5.0 white|2.0 as snow|10.0",
"Mary had a little lamb whose fleece was white as snow",
"Mary|10.0 takes on Wolf|10.0 Restoration|10.0 project|10.0 despite ties|10.0 to sheep|10.0 farming|10.0",
"Mary|10.0 who lives|5.0 on a farm|10.0 is|5.0 happy|2.0 that she|10.0 takes|5.0 a walk|10.0 every day|10.0",
"Moby|10.0 Dick|10.0 is|5.0 a story|10.0 of a whale|10.0 and a man|10.0 obsessed|10.0",
"The robber|10.0 wore|5.0 a black|2.0 fleece|10.0 jacket|10.0 and a baseball|10.0 cap|10.0",
"The English|10.0 Springer|10.0 Spaniel|10.0 is|5.0 the best|2.0 of all dogs|10.0"
]
# Module-level state shared between setup() and testpay().
# NOTE(review): `dir` shadows the builtin of the same name; it is kept because
# setup()/testpay() reference it via `global dir`.
encoder = None
dir = None
parser = None
payloadSimilarity = None
class PayloadQueryParser(PythonQueryParser):
    """Query parser that builds payload-aware term queries.

    Overrides newTermQuery so every parsed term becomes a PayloadTermQuery
    whose per-document score averages the payloads seen for the term.
    """
    def __init__(self, matchVersion, f, a):
        super(PayloadQueryParser, self).__init__(matchVersion, f, a)

    def newTermQuery(self, term):
        # Bug fix: PayloadTermQuery expects a PayloadFunction *instance*;
        # the original passed the AveragePayloadFunction class object.
        # (testpay() shows the correct instantiated usage.)
        return PayloadTermQuery(term, AveragePayloadFunction())
class PayloadSimilarity(PythonDefaultSimilarity):
def scorePayload(self, docId, start, end, payload):
print start, end
return PayloadHelper.decodeFloat(payload.bytes, end)
class PayloadAnalyzer(PythonAnalyzer):
encoder = None
def __init__(self, encoder):
print 'init analyzer'
super(PayloadAnalyzer, self).__init__()
self.encoder = encoder
def createComponents(self, fieldName, reader):
source = WhitespaceTokenizer(Version.LUCENE_44, reader)
result = LowerCaseFilter(Version.LUCENE_44, source)
result = DelimitedPayloadTokenFilter(result, u'|', self.encoder)
return self.TokenStreamComponents(source, result)
def printResults(searcher, query, topDocs):
for sdoc in topDocs.scoreDocs:
print sdoc.toString()
print searcher.explain(query, sdoc.doc).toString()
def setup():
    """Index `docs` into an in-memory directory using the payload pipeline.

    Populates the module globals: `encoder` (FloatEncoder for '|' payloads),
    `dir` (RAMDirectory holding the index) and `payloadSimilarity` (used both
    at index time here and at search time in testpay()).
    """
    global dir, payloadSimilarity, encoder
    encoder = FloatEncoder()
    dir = RAMDirectory()
    planaly = PayloadAnalyzer(encoder)
    config = IndexWriterConfig(Version.LUCENE_44, planaly)
    payloadSimilarity = PayloadSimilarity()
    config.setSimilarity(payloadSimilarity)
    writer = IndexWriter(dir, config)
    try:
        for i, string in enumerate(docs):
            doc = Document()
            # Stored id so a hit can be traced back to docs[i];
            # renamed from `id` to avoid shadowing the builtin.
            id_field = Field("id", "doc_" + str(i), Field.Store.YES,
                             Field.Index.NOT_ANALYZED_NO_NORMS)
            doc.add(id_field)
            text = Field("body", string, Field.Store.NO, Field.Index.ANALYZED)
            doc.add(text)
            writer.addDocument(doc)
    finally:
        # Ensure the writer (and its lock) is released even if indexing fails.
        writer.close()
def testpay():
    """Run a payload term query for body:"fox" and print scoring explanations.

    Requires setup() to have populated `dir` and `payloadSimilarity`.
    """
    reader = DirectoryReader.open(dir)
    searcher = IndexSearcher(reader)
    searcher.setSimilarity(payloadSimilarity)
    query = PayloadTermQuery(Term("body", "fox"), AveragePayloadFunction())
    hits = searcher.search(query, 10)
    printResults(searcher, query, hits)
if __name__ == "__main__":
    # Script entry point: build the in-memory index, then run the demo query.
    # Guarded so importing this module does not trigger indexing/searching.
    setup()
    testpay()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment