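"""Index a JSON-lines book catalog with PyLucene (Lucene 4.7) and query it.

Index-time analysis runs tokens through a WordNet synonym filter (wn_s.pl),
so a search for one synonym can also match documents indexed with another.
"""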
import lucene

from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.analysis.core import LowerCaseFilter
from org.apache.lucene.analysis.synonym import SynonymFilter, \
    WordnetSynonymParser
from org.apache.lucene.analysis.standard import StandardAnalyzer, \
    ClassicTokenizer, StandardFilter
from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer
from org.apache.lucene.document import Document, Field
from org.apache.lucene.search import BooleanClause, BooleanQuery, TermQuery, \
    IndexSearcher, PhraseQuery
from org.apache.lucene.index import IndexWriter, IndexWriterConfig, \
    DirectoryReader, Term
from org.apache.lucene.util import Version
# PythonAnalyzer is PyLucene's extension point that lets Lucene's Java code
# call back into a Python-defined createComponents().
from org.apache.pylucene.analysis import PythonAnalyzer
from java.io import File, FileInputStream, InputStreamReader

import sys
import os
import json

from prettytable import PrettyTable
class SynonymAnalyzer(PythonAnalyzer):
    """ClassicTokenizer -> StandardFilter -> LowerCaseFilter -> SynonymFilter."""

    def createComponents(self, fieldName, reader):
        source = ClassicTokenizer(Version.LUCENE_47, reader)
        token_filter = StandardFilter(Version.LUCENE_47, source)
        token_filter = LowerCaseFilter(Version.LUCENE_47, token_filter)
        # ignoreCase=False: tokens are already lowercased by the filter above.
        synonym_filter = SynonymFilter(token_filter, self.buildSynonym(), False)
        return self.TokenStreamComponents(source, synonym_filter)

    def buildSynonym(self):
        stream = FileInputStream(File("wn_s.pl"))
        rulesReader = InputStreamReader(stream)
        # dedup=True, expand=True: emit every word of a synset, not just
        # the first one.
        parser = WordnetSynonymParser(
            True, True, StandardAnalyzer(Version.LUCENE_47))
        parser.add(rulesReader)
        return parser.build()
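# buildSynonym() reads the WordNet prolog synonym database. A rough sketch of
# what wn_s.pl entries look like (fields: synset id, word number, word, part
# of speech, sense number, tag count; these ids are made up for illustration):
#
#   s(100001740,1,'entity',n,1,11).
#   s(102084071,1,'dog',n,1,42).
#   s(102084071,2,'domestic_dog',n,1,0).
#
# Words that share a synset id (here 102084071) are treated as synonyms.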
class AmazonIndexer:

    def __init__(self, override=True):
        self.lucene = lucene.initVM()
        self.indexDir = "/tmp/bookindex.index-dir/"
        self.analyzer = SynonymAnalyzer()
        # With override=True, start from an empty index directory.
        if override:
            if os.path.isdir(self.indexDir):
                for fileName in os.listdir(self.indexDir):
                    os.remove(os.path.join(self.indexDir, fileName))
            else:
                os.mkdir(self.indexDir)
    def index(self, path):
        index = SimpleFSDirectory(File(self.indexDir))
        # Cap analysis at 512 tokens per field.
        indexWriterConfig = IndexWriterConfig(
            Version.LUCENE_47,
            LimitTokenCountAnalyzer(self.analyzer, 512))
        writer = IndexWriter(index, indexWriterConfig)
        print >> sys.stderr, \
            "Currently there are %d documents in the index" % writer.numDocs()
        print >> sys.stderr, "Reading lines from %s..." % path
        for line in open(path):
            doc_json = json.loads(line)
            doc = Document()
            # author and title are analyzed (and synonym-expanded) for
            # search; url and price are stored only.
            doc.add(Field(
                'author',
                doc_json['author'],
                Field.Store.YES,
                Field.Index.ANALYZED))
            doc.add(Field(
                'title',
                doc_json['title'],
                Field.Store.YES,
                Field.Index.ANALYZED))
            doc.add(Field(
                'url',
                doc_json['link'],
                Field.Store.YES,
                Field.Index.NO))
            doc.add(Field(
                'price',
                doc_json['price'],
                Field.Store.YES,
                Field.Index.NO))
            writer.addDocument(doc)
        print >> sys.stderr, \
            "Indexed lines from %s (%d documents in index)" % \
            (path, writer.numDocs())
        print >> sys.stderr, \
            "About to optimize index of %d documents..." % writer.numDocs()
        writer.forceMerge(1)
        print >> sys.stderr, \
            "...done optimizing index of %d documents" % writer.numDocs()
        n_docs = writer.numDocs()
        print >> sys.stderr, "Closing index of %d documents..." % n_docs
        writer.close()
        index.close()
        print >> sys.stderr, "...done closing index of %d documents" % n_docs
    def query_index(self, query, max_hit=20):
        query_array = query.split(' ')
        index = SimpleFSDirectory(File(self.indexDir))
        reader = DirectoryReader.open(index)
        searcher = IndexSearcher(reader)
        n_docs = reader.numDocs()
        print "Index contains %d documents." % n_docs
        # For each field, OR together one TermQuery per word, plus a
        # PhraseQuery over all the words when the query has more than one.
        booleanQuery = BooleanQuery()
        for field in ['title', 'author']:
            pq = PhraseQuery()
            for value in query_array:
                term = Term(field, value)
                pq.add(term)
                booleanQuery.add(TermQuery(term), BooleanClause.Occur.SHOULD)
            if len(query_array) > 1:
                booleanQuery.add(pq, BooleanClause.Occur.SHOULD)
        hits = searcher.search(booleanQuery, max_hit)
        print "Found %d document(s) that matched query '%s':" % \
            (hits.totalHits, booleanQuery)
        result = []
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            result.append({
                'author': doc.get("author").encode("utf-8"),
                'title': doc.get("title").encode("utf-8"),
                'url': doc.get("url").encode("utf-8"),
                'price': doc.get("price").encode("utf-8"),
            })
        reader.close()
        index.close()
        return result
if __name__ == "__main__":
    if sys.argv[1] == 'index':
        indexer = AmazonIndexer()
        indexer.index(sys.argv[2])
    elif sys.argv[1] == 'query':
        indexer = AmazonIndexer(override=False)
        query = ' '.join(str(arg) for arg in sys.argv[2:]).lower()
        search_result = indexer.query_index(query)
        result_table = PrettyTable(["Price", "Author", "Title"])
        result_table.align['Price'] = 'r'
        result_table.align['Title'] = 'l'
        result_table.align['Author'] = 'r'
        for sr in search_result:
            result_table.add_row(
                ['$' + sr['price'], sr['author'], sr['title']])
        print result_table
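# Usage sketch (assuming this gist is saved as indexer.py and wn_s.pl sits in
# the working directory):
#
#   python indexer.py index books.json
#   python indexer.py query harry potter
#
# where books.json holds one JSON object per line, e.g. (a made-up record):
#
#   {"author": "J. K. Rowling", "title": "Harry Potter",
#    "link": "http://example.com/book", "price": "9.99"}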