import lucene

from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.analysis import Analyzer
from org.apache.lucene.analysis.core import LowerCaseFilter
from org.apache.lucene.analysis.synonym import SynonymFilter, \
    WordnetSynonymParser
from org.apache.lucene.analysis.standard import StandardAnalyzer, \
    ClassicTokenizer, StandardFilter
from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer
from org.apache.lucene.document import Document, Field
from org.apache.lucene.search import BooleanClause, BooleanQuery, TermQuery, \
    IndexSearcher, PhraseQuery
from org.apache.lucene.index import IndexWriter, IndexWriterConfig, \
    DirectoryReader, Term
from org.apache.lucene.util import Version
# PythonAnalyzer is PyLucene's extension point for overriding
# Analyzer.createComponents() from Python code.
from org.apache.pylucene.analysis import PythonAnalyzer
from java.io import File, FileInputStream, InputStreamReader

import sys
import os
import json
from prettytable import PrettyTable
class SynonymAnalyzer(PythonAnalyzer):
    """Tokenizes with ClassicTokenizer, lowercases, and expands tokens
    with WordNet synonyms. Subclasses PythonAnalyzer because a plain
    Java Analyzer subclass cannot be overridden from Python."""

    def createComponents(self, fieldName, reader):
        source = ClassicTokenizer(Version.LUCENE_47, reader)
        token_filter = StandardFilter(Version.LUCENE_47, source)
        token_filter = LowerCaseFilter(Version.LUCENE_47, token_filter)
        synonym_filter = SynonymFilter(token_filter, self.buildSynonym(), False)
        return Analyzer.TokenStreamComponents(source, synonym_filter)

    def buildSynonym(self):
        # Expects the WordNet prolog synonym file (wn_s.pl) in the
        # current working directory.
        stream = FileInputStream(File("wn_s.pl"))
        rulesReader = InputStreamReader(stream)
        parser = WordnetSynonymParser(
            True, True, StandardAnalyzer(Version.LUCENE_47))
        # SynonymMap.Parser renamed add() to parse() as of Lucene 4.6.
        parser.parse(rulesReader)
        return parser.build()
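
# For reference, wn_s.pl is WordNet's prolog synset file. Each line is an
# s/6 fact -- s(synset_id, w_num, 'word', ss_type, sense_number, tag_count) --
# and words sharing a synset_id are treated as synonyms. A sketch with
# illustrative (not real) synset ids:
#   s(100000001,1,'car',n,1,0).
#   s(100000001,2,'automobile',n,1,0).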
class AmazonIndexer:

    def __init__(self, override=True):
        self.lucene = lucene.initVM()
        self.indexDir = "/tmp/bookindex.index-dir/"
        self.analyzer = SynonymAnalyzer()
        if override:
            # Wipe any existing index so we start from scratch.
            if os.path.isdir(self.indexDir):
                for fileName in os.listdir(self.indexDir):
                    os.remove(os.path.join(self.indexDir, fileName))
            else:
                os.mkdir(self.indexDir)
    def index(self, path):
        index = SimpleFSDirectory(File(self.indexDir))
        indexWriterConfig = IndexWriterConfig(
            Version.LUCENE_47,
            LimitTokenCountAnalyzer(self.analyzer, 512))
        writer = IndexWriter(index, indexWriterConfig)
        print >> sys.stderr, \
            "Currently there are %d documents in the index" % writer.numDocs()
        print >> sys.stderr, "Reading lines from %s..." % path
        # Each input line is a JSON object with 'author', 'title', 'link'
        # and 'price' keys; author and title are analyzed for search,
        # url and price are stored but not indexed.
        for line in open(path):
            doc_json = json.loads(line)
            doc = Document()
            doc.add(Field(
                'author',
                doc_json['author'],
                Field.Store.YES,
                Field.Index.ANALYZED))
            doc.add(Field(
                'title',
                doc_json['title'],
                Field.Store.YES,
                Field.Index.ANALYZED))
            doc.add(Field(
                'url',
                doc_json['link'],
                Field.Store.YES,
                Field.Index.NO))
            doc.add(Field(
                'price',
                doc_json['price'],
                Field.Store.YES,
                Field.Index.NO))
            writer.addDocument(doc)
        print >> sys.stderr, \
            "Indexed lines from %s (%d documents in index)" % \
            (path, writer.numDocs())
        print >> sys.stderr, \
            "About to optimize index of %d documents..." % writer.numDocs()
        writer.forceMerge(1)
        print >> sys.stderr, \
            "...done optimizing index of %d documents" % writer.numDocs()
        print >> sys.stderr, \
            "Closing index of %d documents..." % writer.numDocs()
        writer.close()
        index.close()
        print >> sys.stderr, "...done closing index"
    def query_index(self, query, max_hit=20):
        query_array = query.split(' ')
        index = SimpleFSDirectory(File(self.indexDir))
        # IndexReader.open() was removed in Lucene 4.0; use DirectoryReader.
        reader = DirectoryReader.open(index)
        searcher = IndexSearcher(reader)
        n_docs = reader.numDocs()
        print "Index contains %d documents." % n_docs
        # Match any single query term in the title or author field; for
        # multi-word queries also add one phrase query per field, so exact
        # phrase matches score higher.
        booleanQuery = BooleanQuery()
        for field in ['title', 'author']:
            pq = PhraseQuery()
            for value in query_array:
                term = Term(field, value)
                pq.add(term)
                booleanQuery.add(TermQuery(term), BooleanClause.Occur.SHOULD)
            if len(query_array) > 1:
                booleanQuery.add(pq, BooleanClause.Occur.SHOULD)
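        # For a hypothetical two-word query "harry potter", Lucene's
        # toString() rendering of the assembled query would look roughly
        # like (every clause is SHOULD, so any term match qualifies):
        #   title:harry title:potter title:"harry potter"
        #   author:harry author:potter author:"harry potter"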
        hits = searcher.search(booleanQuery, max_hit)
        print "Found %d document(s) that matched query '%s':" % \
            (hits.totalHits, booleanQuery)
        result = []
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            result.append({
                'author': doc.get("author").encode("utf-8"),
                'title': doc.get("title").encode("utf-8"),
                'url': doc.get("url").encode("utf-8"),
                'price': doc.get("price").encode("utf-8"),
            })
        reader.close()
        index.close()
        return result
if __name__ == "__main__":
    if len(sys.argv) > 2 and sys.argv[1] == 'index':
        indexer = AmazonIndexer()
        indexer.index(sys.argv[2])
    elif len(sys.argv) > 2 and sys.argv[1] == 'query':
        indexer = AmazonIndexer(override=False)
        query = ' '.join(str(arg) for arg in sys.argv[2:]).lower()
        search_result = indexer.query_index(query)
        result_table = PrettyTable(["Price", "Author", "Title"])
        result_table.align['Price'] = 'r'
        result_table.align['Author'] = 'r'
        result_table.align['Title'] = 'l'
        for sr in search_result:
            result_table.add_row(
                ['$' + sr['price'], sr['author'], sr['title']])
        print result_table
    else:
        print >> sys.stderr, \
            "usage: %s index <json-file> | query <term>..." % sys.argv[0]
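
# Example usage (file name is hypothetical; the input must be one JSON
# object per line with 'author', 'title', 'link' and 'price' keys):
#
#   $ python indexer.py index books.json
#   $ python indexer.py query harry potter
#
# where a line of books.json might look like:
#   {"author": "J. K. Rowling", "title": "Harry Potter and the Chamber
#    of Secrets", "link": "http://www.amazon.com/dp/...", "price": "12.99"}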