import lucene

from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.analysis import Analyzer
from org.apache.lucene.analysis.core import LowerCaseFilter
from org.apache.lucene.analysis.synonym import SynonymFilter, \
    WordnetSynonymParser
from org.apache.lucene.analysis.standard import StandardAnalyzer, \
    ClassicTokenizer, StandardFilter
from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer
from org.apache.lucene.document import Document, Field
from org.apache.lucene.search import BooleanClause, BooleanQuery, TermQuery, \
    IndexSearcher, PhraseQuery
from org.apache.lucene.index import IndexWriter, IndexWriterConfig, \
    DirectoryReader, Term
from org.apache.lucene.util import Version
# PythonAnalyzer is PyLucene's extension point for overriding
# Analyzer.createComponents() from Python code.
from org.apache.pylucene.analysis import PythonAnalyzer
from java.io import File, FileInputStream, InputStreamReader

import sys
import os
import json
from prettytable import PrettyTable
class SynonymAnalyzer(PythonAnalyzer):
    """Tokenizes with ClassicTokenizer, lowercases, and expands tokens
    with WordNet synonyms. Subclasses PythonAnalyzer because a plain
    Java Analyzer subclass cannot be overridden from Python."""

    def createComponents(self, fieldName, reader):
        source = ClassicTokenizer(Version.LUCENE_47, reader)
        token_filter = StandardFilter(Version.LUCENE_47, source)
        token_filter = LowerCaseFilter(Version.LUCENE_47, token_filter)
        synonym_filter = SynonymFilter(token_filter, self.buildSynonym(), False)
        return Analyzer.TokenStreamComponents(source, synonym_filter)

    def buildSynonym(self):
        # Expects the WordNet prolog synonym file (wn_s.pl) in the
        # current working directory.
        stream = FileInputStream(File("wn_s.pl"))
        rulesReader = InputStreamReader(stream)
        parser = WordnetSynonymParser(
            True, True, StandardAnalyzer(Version.LUCENE_47))
        # SynonymMap.Parser renamed add() to parse() as of Lucene 4.6.
        parser.parse(rulesReader)
        return parser.build()
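
# For reference, wn_s.pl is WordNet's prolog synset file. Each line is an
# s/6 fact -- s(synset_id, w_num, 'word', ss_type, sense_number, tag_count) --
# and words sharing a synset_id are treated as synonyms. A sketch with
# illustrative (not real) synset ids:
#   s(100000001,1,'car',n,1,0).
#   s(100000001,2,'automobile',n,1,0).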
class AmazonIndexer:

    def __init__(self, override=True):
        self.lucene = lucene.initVM()
        self.indexDir = "/tmp/bookindex.index-dir/"
        self.analyzer = SynonymAnalyzer()
        if override:
            # Wipe any existing index so we start from scratch.
            if os.path.isdir(self.indexDir):
                for fileName in os.listdir(self.indexDir):
                    os.remove(os.path.join(self.indexDir, fileName))
            else:
                os.mkdir(self.indexDir)
    def index(self, path):
        index = SimpleFSDirectory(File(self.indexDir))
        indexWriterConfig = IndexWriterConfig(
            Version.LUCENE_47,
            LimitTokenCountAnalyzer(self.analyzer, 512))
        writer = IndexWriter(index, indexWriterConfig)
        print >> sys.stderr, \
            "Currently there are %d documents in the index" % writer.numDocs()
        print >> sys.stderr, "Reading lines from %s..." % path
        # Each input line is a JSON object with 'author', 'title', 'link'
        # and 'price' keys; author and title are analyzed for search,
        # url and price are stored but not indexed.
        for line in open(path):
            doc_json = json.loads(line)
            doc = Document()
            doc.add(Field(
                'author',
                doc_json['author'],
                Field.Store.YES,
                Field.Index.ANALYZED))
            doc.add(Field(
                'title',
                doc_json['title'],
                Field.Store.YES,
                Field.Index.ANALYZED))
            doc.add(Field(
                'url',
                doc_json['link'],
                Field.Store.YES,
                Field.Index.NO))
            doc.add(Field(
                'price',
                doc_json['price'],
                Field.Store.YES,
                Field.Index.NO))
            writer.addDocument(doc)
        print >> sys.stderr, \
            "Indexed lines from %s (%d documents in index)" % \
            (path, writer.numDocs())
        print >> sys.stderr, \
            "About to optimize index of %d documents..." % writer.numDocs()
        writer.forceMerge(1)
        print >> sys.stderr, \
            "...done optimizing index of %d documents" % writer.numDocs()
        print >> sys.stderr, \
            "Closing index of %d documents..." % writer.numDocs()
        writer.close()
        index.close()
        print >> sys.stderr, "...done closing index"
    def query_index(self, query, max_hit=20):
        query_array = query.split(' ')
        index = SimpleFSDirectory(File(self.indexDir))
        # IndexReader.open() was removed in Lucene 4.0; use DirectoryReader.
        reader = DirectoryReader.open(index)
        searcher = IndexSearcher(reader)
        n_docs = reader.numDocs()
        print "Index contains %d documents." % n_docs
        # Match any single query term in the title or author field; for
        # multi-word queries also add one phrase query per field, so exact
        # phrase matches score higher.
        booleanQuery = BooleanQuery()
        for field in ['title', 'author']:
            pq = PhraseQuery()
            for value in query_array:
                term = Term(field, value)
                pq.add(term)
                booleanQuery.add(TermQuery(term), BooleanClause.Occur.SHOULD)
            if len(query_array) > 1:
                booleanQuery.add(pq, BooleanClause.Occur.SHOULD)
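        # For a hypothetical two-word query "harry potter", Lucene's
        # toString() rendering of the assembled query would look roughly
        # like (every clause is SHOULD, so any term match qualifies):
        #   title:harry title:potter title:"harry potter"
        #   author:harry author:potter author:"harry potter"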
        hits = searcher.search(booleanQuery, max_hit)
        print "Found %d document(s) that matched query '%s':" % \
            (hits.totalHits, booleanQuery)
        result = []
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            result.append({
                'author': doc.get("author").encode("utf-8"),
                'title': doc.get("title").encode("utf-8"),
                'url': doc.get("url").encode("utf-8"),
                'price': doc.get("price").encode("utf-8"),
            })
        reader.close()
        index.close()
        return result
if __name__ == "__main__":
    if len(sys.argv) > 2 and sys.argv[1] == 'index':
        indexer = AmazonIndexer()
        indexer.index(sys.argv[2])
    elif len(sys.argv) > 2 and sys.argv[1] == 'query':
        indexer = AmazonIndexer(override=False)
        query = ' '.join(str(arg) for arg in sys.argv[2:]).lower()
        search_result = indexer.query_index(query)
        result_table = PrettyTable(["Price", "Author", "Title"])
        result_table.align['Price'] = 'r'
        result_table.align['Author'] = 'r'
        result_table.align['Title'] = 'l'
        for sr in search_result:
            result_table.add_row(
                ['$' + sr['price'], sr['author'], sr['title']])
        print result_table
    else:
        print >> sys.stderr, \
            "usage: %s index <json-file> | query <term>..." % sys.argv[0]
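
# Example usage (file name is hypothetical; the input must be one JSON
# object per line with 'author', 'title', 'link' and 'price' keys):
#
#   $ python indexer.py index books.json
#   $ python indexer.py query harry potter
#
# where a line of books.json might look like:
#   {"author": "J. K. Rowling", "title": "Harry Potter and the Chamber
#    of Secrets", "link": "http://www.amazon.com/dp/...", "price": "12.99"}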