Skip to content

Instantly share code, notes, and snippets.

@LiWenGu
Created February 20, 2019 09:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save LiWenGu/6f3948685f238951db8675824248c8df to your computer and use it in GitHub Desktop.
Save LiWenGu/6f3948685f238951db8675824248c8df to your computer and use it in GitHub Desktop.
[lucene]
/**
 * Parses file content into strings with Apache Tika and builds a Lucene index from it.
 *
 * <p>Every entry under {@code INDEX_PATH + "files"} becomes one Lucene document with a
 * {@code title} field (the file name) and a {@code content} field (the extracted text). The
 * index is written to {@code INDEX_PATH + "indexdir"}.
 *
 * @Author liwenguang
 * @Date 2019-02-20 15:40
 */
@Slf4j
public class CreateIndex {
    /** Base directory; holds the {@code files} input directory and the {@code indexdir} index. */
    public static final String INDEX_PATH = "hello_lucene/src/main/resources/";

    /**
     * Extracts the text of every entry in the {@code files} directory under {@link #INDEX_PATH}.
     *
     * @return one {@code FileModel} (file name, extracted text) per directory entry; the text is
     *     empty for entries that could not be parsed
     * @throws IOException if the {@code files} directory cannot be listed (it does not exist, is
     *     not a directory, or an I/O error occurred)
     */
    public static List<FileModel> extractFile() throws IOException {
        File fileDir = new File(INDEX_PATH + "files");
        File[] allFiles = fileDir.listFiles();
        // listFiles() returns null instead of throwing when the directory cannot be listed.
        if (allFiles == null) {
            throw new IOException("Cannot list files in '" + fileDir.getAbsolutePath()
                    + "': it does not exist, is not a directory, or is not readable");
        }
        List<FileModel> list = new ArrayList<>(allFiles.length);
        for (File f : allFiles) {
            list.add(new FileModel(f.getName(), ParserExtraction(f)));
        }
        return list;
    }

    /**
     * Extracts the plain-text body of {@code file} using Tika's auto-detecting parser.
     *
     * <p>Parsing is best-effort: any failure is logged and an empty string is returned so that
     * one bad file does not abort the whole run.
     *
     * @param file the file to parse
     * @return the extracted text, or an empty string if parsing failed
     */
    private static String ParserExtraction(File file) {
        // Receives the document content. -1 disables Tika's default 100,000-character write
        // limit; with the default, larger documents throw and would be indexed as empty.
        BodyContentHandler handler = new BodyContentHandler(-1);
        Parser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();
        // try-with-resources so the file handle is released even when parsing fails.
        try (FileInputStream inputStream = new FileInputStream(file)) {
            parser.parse(inputStream, handler, metadata, context);
            return handler.toString();
        } catch (Exception e) {
            log.error("异常:" + e.getMessage(), e);
            return "";
        }
    }

    /**
     * Rebuilds the index from scratch ({@code OpenMode.CREATE}) from the files returned by
     * {@link #extractFile()} and logs the elapsed time.
     *
     * @param args unused
     * @throws IOException if the input directory cannot be listed or the index cannot be written
     */
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new IKAnalyzer6x();
        IndexWriterConfig icw = new IndexWriterConfig(analyzer);
        icw.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        Path indexPath = Paths.get(INDEX_PATH + "indexdir");

        // Both fields are stored, tokenized, and indexed with full postings and term vectors.
        FieldType fieldType = new FieldType();
        fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
        fieldType.setStored(true);
        fieldType.setTokenized(true);
        fieldType.setStoreTermVectors(true);
        fieldType.setStoreTermVectorPositions(true);
        fieldType.setStoreTermVectorOffsets(true);

        long startMillis = System.currentTimeMillis();
        if (!Files.isReadable(indexPath)) {
            System.out.println("Document directory '" + indexPath.toAbsolutePath() + "' does not exist or is not readable, please check the path");
            System.exit(1);
        }

        // Extract before opening the writer so a failure while reading the input never touches
        // the existing index.
        List<FileModel> fileList = extractFile();

        // try-with-resources closes the writer first, then the directory, releasing the index
        // write lock even if indexing fails.
        try (Directory dir = FSDirectory.open(indexPath);
                IndexWriter indexWriter = new IndexWriter(dir, icw)) {
            try {
                for (FileModel f : fileList) {
                    Document doc = new Document();
                    doc.add(new Field("title", f.getTitle(), fieldType));
                    doc.add(new Field("content", f.getContent(), fieldType));
                    indexWriter.addDocument(doc);
                }
                indexWriter.commit();
            } catch (IOException | RuntimeException e) {
                // close() would commit whatever was added so far; roll back instead so a
                // half-built index never replaces the previous one.
                try {
                    indexWriter.rollback();
                } catch (IOException | RuntimeException rollbackFailure) {
                    e.addSuppressed(rollbackFailure);
                }
                throw e;
            }
        }

        long elapsedMillis = System.currentTimeMillis() - startMillis;
        log.info("索引文档完成,共耗时:" + elapsedMillis + "毫秒。");
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment