Skip to content

Instantly share code, notes, and snippets.

@jorgehatccrma
Last active April 12, 2019 09:33
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jorgehatccrma/c90fcd4c873eb8bf8933e99a20c0819e to your computer and use it in GitHub Desktop.
Save jorgehatccrma/c90fcd4c873eb8bf8933e99a20c0819e to your computer and use it in GitHub Desktop.
Lucene + Jython
"""
This simple script shows how to use Apache Lucene
directly from Jython.
"""
# your usual Python imports
import sys
from contextlib import contextmanager
# Lucene jars this script depends on; the paths are relative to the
# current working directory.
jars = [
    "lucene-7.1.0/core/lucene-core-7.1.0.jar",
    "lucene-7.1.0/queryparser/lucene-queryparser-7.1.0.jar",
]
# Put every jar on sys.path so the Java imports below can be resolved.
sys.path.extend(jars)
# Now that the jars are on the path, we can import Java classes as if
# they were regular Python modules!
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document
from org.apache.lucene.document import Field
from org.apache.lucene.document import StringField
from org.apache.lucene.document import TextField
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.index import IndexWriter
from org.apache.lucene.index import IndexWriterConfig
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.store import RAMDirectory
@contextmanager
def closing(thing):
    """Context manager that hands back ``thing`` and closes it on exit.

    Lets any object exposing a ``close()`` method (such as the Lucene
    writer and reader used in this script) be driven by a ``with``
    statement. ``thing.close()`` is called when the block is left,
    whether it finished normally or raised.
    """
    try:
        yield thing
    finally:
        # Runs on both the normal and the exceptional exit path.
        thing.close()
def make_index(analyzer, docs=None):
    """Create an in-memory inverted index to power the search.

    Args:
        analyzer: Lucene analyzer handed to ``IndexWriterConfig``; it is
            used to tokenize the "title" field while indexing.
        docs: optional iterable of ``(title, isbn)`` string pairs to
            index. When ``None`` (the default), the four sample books
            below are indexed.

    Returns:
        The ``RAMDirectory`` holding the index.
    """
    if docs is None:
        docs = (
            ("Lucene in Action", "193398817"),
            ("Lucene for Dummies", "55320055Z"),
            ("Managing Gigabytes", "55063554A"),
            ("The Art of Computer Science", "9900333X"),
        )

    def add_doc(w, title, isbn):
        """Add one (title, isbn) "document" to the index via writer ``w``."""
        doc = Document()
        doc.add(TextField("title", title, Field.Store.YES))
        # use a string field for isbn because we don't
        # want it tokenized
        doc.add(StringField("isbn", isbn, Field.Store.YES))
        w.addDocument(doc)

    # create the index
    index = RAMDirectory()
    config = IndexWriterConfig(analyzer)
    # The writer is closed before the directory is returned; with the
    # default writer config, closing also commits the added documents.
    with closing(IndexWriter(index, config)) as w:
        for title, isbn in docs:
            add_doc(w, title, isbn)
    return index
def query(querystr, index, analyzer, hits_per_page=10):
    """Search the index for ``querystr`` and print the matching documents.

    Args:
        querystr: query text, parsed with Lucene's classic ``QueryParser``.
        index: the Lucene directory containing the index to search.
        analyzer: analyzer used to parse the query; it should be the same
            one that was used for indexing.
        hits_per_page: maximum number of hits to retrieve and print.
            Defaults to 10.

    Prints a "Found N hits." line followed by one numbered line per hit
    with its stored "isbn" and "title" fields. N is the number of hits
    returned, so it never exceeds ``hits_per_page``.
    """
    # the "title" arg specifies the default field to use
    # when no field is explicitly specified in the query.
    parsed = QueryParser("title", analyzer).parse(querystr)

    with closing(DirectoryReader.open(index)) as reader:
        searcher = IndexSearcher(reader)
        hits = searcher.search(parsed, hits_per_page).scoreDocs
        # Stored fields are fetched through the searcher, which needs the
        # reader to still be open, so results are printed inside the block.
        print("Found {:d} hits.".format(len(hits)))
        for rank, hit in enumerate(hits, 1):
            stored = searcher.doc(hit.doc)
            print("{:d}. {}\t{}".format(rank, stored.get("isbn"), stored.get("title")))
if __name__ == "__main__":
    # The query comes from the first command-line argument; fall back to
    # "lucene" when none is given.
    querystr = sys.argv[1] if len(sys.argv) > 1 else "lucene"
    # Specify the analyzer for tokenizing text.
    # The same analyzer should be used for indexing and searching.
    # Both the analyzer and the index directory are closeable Lucene
    # resources, so release them once the search has been printed.
    with closing(StandardAnalyzer()) as analyzer:
        # create the index to search
        with closing(make_index(analyzer)) as index:
            # perform a search
            query(querystr, index, analyzer)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment