Created
October 10, 2015 14:04
-
-
Save ColadaFF/1d6557ebaa147753bc9f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Set;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.JSONValue;
public class LuceneIndexWriter { | |
static final String INDEX_PATH = "indexDir"; | |
static final String JSON_FILE_PATH = "docs.json"; | |
static final String STOPWORDS_FILE_PATH = "stopwords.txt"; | |
String indexPath; | |
String jsonFilePath; | |
IndexWriter indexWriter = null; | |
public LuceneIndexWriter(String indexPath, String jsonFilePath) { | |
this.indexPath = indexPath; | |
this.jsonFilePath = jsonFilePath; | |
} | |
public void createIndex() throws FileNotFoundException { | |
JSONArray jsonObjects = parseJSONFile(); | |
openIndex(); | |
addDocuments(jsonObjects); | |
finish(); | |
} | |
public JSONArray parseJSONFile() throws FileNotFoundException { | |
InputStream jsonFile = new FileInputStream(jsonFilePath); | |
Reader readerJson = new InputStreamReader(jsonFile); | |
//Parse the json file using simple-json library | |
Object fileObjects = JSONValue.parse(readerJson); | |
JSONArray arrayObjects = (JSONArray) fileObjects; | |
return arrayObjects; | |
} | |
public boolean openIndex() { | |
try { | |
InputStream stopWords = new FileInputStream(STOPWORDS_FILE_PATH); | |
Reader readerStopWords = new InputStreamReader(stopWords); | |
Directory dir = FSDirectory.open(new File(indexPath)); | |
StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_46, readerStopWords); | |
IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_46, analyzer); | |
iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE); | |
indexWriter = new IndexWriter(dir, iwc); | |
return true; | |
} catch (Exception e) { | |
System.err.println("Error opening the index. " + e.getMessage()); | |
} | |
return false; | |
} | |
/** | |
* Add documents to the index | |
*/ | |
public void addDocuments(JSONArray jsonObjects) { | |
for (JSONObject object : (List<JSONObject>) jsonObjects) { | |
Document doc = new Document(); | |
final FieldType bodyOptions = new FieldType(); | |
bodyOptions.setIndexed(true); | |
bodyOptions.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); | |
bodyOptions.setStored(true); | |
bodyOptions.setStoreTermVectors(true); | |
bodyOptions.setTokenized(true); | |
for (String field : (Set<String>) object.keySet()) { | |
doc.add(new Field(field, (String) object.get(field), bodyOptions)); | |
} | |
try { | |
System.out.println(doc); | |
indexWriter.addDocument(doc); | |
} catch (IOException ex) { | |
System.err.println("Error adding documents to the index. " + ex.getMessage()); | |
} | |
} | |
} | |
/** | |
* Write the document to the index and close it | |
*/ | |
public void finish() { | |
try { | |
indexWriter.commit(); | |
indexWriter.close(); | |
} catch (IOException ex) { | |
System.err.println("We had a problem closing the index: " + ex.getMessage()); | |
} | |
} | |
public static void main(String[] args) throws FileNotFoundException { | |
LuceneIndexWriter liw = new LuceneIndexWriter(INDEX_PATH, JSON_FILE_PATH); | |
liw.createIndex(); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment