Created
December 8, 2014 13:09
-
-
Save iceout/03a6334be8d7fcfaf44c to your computer and use it in GitHub Desktop.
Test the scorePayload method of PythonDefaultSimilarity in PyLucene.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import lucene | |
lucene.initVM(vmargs=['-Djava.awt.headless=true']) | |
from org.apache.lucene.util import Version | |
from org.apache.lucene.index import DirectoryReader | |
from org.apache.lucene.index import IndexWriterConfig | |
from org.apache.lucene.search import IndexSearcher | |
from org.apache.lucene.index import Term | |
from org.apache.lucene.analysis.payloads import FloatEncoder | |
from org.apache.lucene.analysis.payloads import PayloadHelper | |
from org.apache.lucene.analysis.payloads import DelimitedPayloadTokenFilter | |
from org.apache.lucene.search.payloads import AveragePayloadFunction | |
from org.apache.lucene.search.payloads import PayloadTermQuery | |
from org.apache.lucene.analysis.core import WhitespaceTokenizer | |
from org.apache.lucene.analysis.core import LowerCaseFilter | |
from org.apache.lucene.index import IndexWriter | |
from org.apache.lucene.store import RAMDirectory | |
from org.apache.pylucene.analysis import PythonAnalyzer | |
from org.apache.pylucene.search.similarities import PythonDefaultSimilarity | |
from org.apache.pylucene.queryparser.classic import PythonQueryParser | |
from org.apache.lucene.document import Document | |
from org.apache.lucene.document import Field | |
# Test corpus: a token may carry a '|'-delimited float payload (e.g.
# "fox|10.0") that DelimitedPayloadTokenFilter strips off and FloatEncoder
# encodes into the index; plain tokens carry no explicit payload.
# (Scrape artifacts " | |" removed from every line — they made the file
# invalid Python.)
docs = [
    "The quick|2.0 red|2.0 fox|10.0 jumped|5.0 over the lazy|2.0 brown|2.0 dogs|10.0",
    "The quick red fox jumped over the lazy brown dogs",
    "The quick|2.0 red|2.0 fox|10.0 jumped|5.0 over the old|2.0 brown|2.0 box|10.0",
    "Mary|10.0 had a little|2.0 lamb|10.0 whose fleece|10.0 was|5.0 white|2.0 as snow|10.0",
    "Mary had a little lamb whose fleece was white as snow",
    "Mary|10.0 takes on Wolf|10.0 Restoration|10.0 project|10.0 despite ties|10.0 to sheep|10.0 farming|10.0",
    "Mary|10.0 who lives|5.0 on a farm|10.0 is|5.0 happy|2.0 that she|10.0 takes|5.0 a walk|10.0 every day|10.0",
    "Moby|10.0 Dick|10.0 is|5.0 a story|10.0 of a whale|10.0 and a man|10.0 obsessed|10.0",
    "The robber|10.0 wore|5.0 a black|2.0 fleece|10.0 jacket|10.0 and a baseball|10.0 cap|10.0",
    "The English|10.0 Springer|10.0 Spaniel|10.0 is|5.0 the best|2.0 of all dogs|10.0",
]

# Module-level handles populated by setup() and read by testpay().
# NOTE(review): `dir` shadows the builtin; kept as-is because the other
# functions reference it through this global name.
encoder = None
dir = None
parser = None
payloadSimilarity = None
class PayloadQueryParser(PythonQueryParser):
    """Query parser that builds PayloadTermQuery for every term.

    This makes parsed queries participate in payload scoring via
    Similarity.scorePayload().
    """

    def __init__(self, matchVersion, f, a):
        super(PayloadQueryParser, self).__init__(matchVersion, f, a)

    def newTermQuery(self, term):
        # BUG FIX: the original passed the AveragePayloadFunction *class*;
        # PayloadTermQuery expects a PayloadFunction *instance* (compare
        # the direct construction in testpay()).
        return PayloadTermQuery(term, AveragePayloadFunction())
class PayloadSimilarity(PythonDefaultSimilarity):
    """DefaultSimilarity that scores a term occurrence by its stored
    float payload value (as encoded by FloatEncoder at index time)."""

    def scorePayload(self, docId, start, end, payload):
        # start/end are term positions; printed only for debugging.
        print("scorePayload positions: %s %s" % (start, end))
        # BUG FIX: decodeFloat takes an offset into the byte array; the
        # original passed `end` (a term position), reading the wrong
        # bytes.  Use the payload's own offset into its backing array.
        return PayloadHelper.decodeFloat(payload.bytes, payload.offset)
class PayloadAnalyzer(PythonAnalyzer):
    """Analysis chain: whitespace tokenize, lowercase, then strip the
    '|'-delimited payload from each token and encode it with the
    encoder supplied at construction time."""

    encoder = None  # set per-instance in __init__

    def __init__(self, encoder):
        print('init analyzer')
        super(PayloadAnalyzer, self).__init__()
        self.encoder = encoder

    def createComponents(self, fieldName, reader):
        tokenizer = WhitespaceTokenizer(Version.LUCENE_44, reader)
        stream = LowerCaseFilter(Version.LUCENE_44, tokenizer)
        stream = DelimitedPayloadTokenFilter(stream, u'|', self.encoder)
        return self.TokenStreamComponents(tokenizer, stream)
def printResults(searcher, query, topDocs):
    """Print each hit and the searcher's score explanation for it."""
    for hit in topDocs.scoreDocs:
        print(hit.toString())
        explanation = searcher.explain(query, hit.doc)
        print(explanation.toString())
def setup():
    """Build an in-memory index over `docs` with payload-aware analysis.

    Side effects: populates the module globals `encoder`, `dir` and
    `payloadSimilarity`; `dir` holds the RAMDirectory that testpay()
    searches afterwards.
    """
    global dir, payloadSimilarity, encoder
    encoder = FloatEncoder()
    dir = RAMDirectory()
    analyzer = PayloadAnalyzer(encoder)
    config = IndexWriterConfig(Version.LUCENE_44, analyzer)
    payloadSimilarity = PayloadSimilarity()
    # Index-time similarity must match the search-time one so payload
    # scoring behaves consistently.
    config.setSimilarity(payloadSimilarity)
    writer = IndexWriter(dir, config)
    try:
        # enumerate() replaces the original's hand-rolled counter;
        # `id_field` avoids shadowing the builtin `id`.
        for doc_num, text in enumerate(docs):
            doc = Document()
            id_field = Field("id", "doc_" + str(doc_num), Field.Store.YES,
                             Field.Index.NOT_ANALYZED_NO_NORMS)
            doc.add(id_field)
            body = Field("body", text, Field.Store.NO, Field.Index.ANALYZED)
            doc.add(body)
            writer.addDocument(doc)
    finally:
        # Close even if addDocument raises, so the directory isn't left
        # with a dangling write lock.
        writer.close()
def testpay():
    """Search the payload index for 'fox' and print scored results
    with their explanations."""
    reader = DirectoryReader.open(dir)
    searcher = IndexSearcher(reader)
    searcher.setSimilarity(payloadSimilarity)
    query = PayloadTermQuery(Term("body", "fox"), AveragePayloadFunction())
    hits = searcher.search(query, 10)
    printResults(searcher, query, hits)
if __name__ == "__main__":
    # Build the in-memory index, then run the payload-scored query.
    # Guarded so importing this module doesn't trigger indexing.
    setup()
    testpay()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment