Created
February 20, 2019 09:35
-
-
Save LiWenGu/6f3948685f238951db8675824248c8df to your computer and use it in GitHub Desktop.
[lucene]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* @Author liwenguang | |
* @Date 2019-02-20 15:40 | |
* @Description 基于 tika 解析文件内容为字符串,使用 lucene 建立索引 | |
*/ | |
@Slf4j | |
public class CreateIndex { | |
public static final String INDEX_PATH = "hello_lucene/src/main/resources/"; | |
public static List<FileModel> extractFile() throws IOException { | |
ArrayList<FileModel> list = new ArrayList<>(); | |
File fileDir = new File(INDEX_PATH + "files"); | |
File[] allFiles = fileDir.listFiles(); | |
for (File f : allFiles) { | |
FileModel sf = new FileModel(f.getName(), ParserExtraction(f)); | |
list.add(sf); | |
} | |
return list; | |
} | |
private static String ParserExtraction(File file) { | |
// 接收文档内容 | |
String fileContent = ""; | |
BodyContentHandler handler = new BodyContentHandler(); | |
Parser parser = new AutoDetectParser(); | |
Metadata metadata = new Metadata(); | |
FileInputStream inputStream; | |
try { | |
inputStream = new FileInputStream(file); | |
ParseContext context = new ParseContext(); | |
parser.parse(inputStream, handler, metadata, context); | |
fileContent = handler.toString(); | |
} catch (Exception e) { | |
log.error("异常:" + e.getMessage(), e); | |
} | |
return fileContent; | |
} | |
public static void main(String[] args) throws IOException { | |
Analyzer analyzer = new IKAnalyzer6x(); | |
IndexWriterConfig icw = new IndexWriterConfig(analyzer); | |
icw.setOpenMode(IndexWriterConfig.OpenMode.CREATE); | |
Directory dir = null; | |
IndexWriter indexWriter = null; | |
Path indexPath = Paths.get(INDEX_PATH + "indexdir"); | |
FieldType fieldType = new FieldType(); | |
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); | |
fieldType.setStored(true); | |
fieldType.setTokenized(true); | |
fieldType.setStoreTermVectors(true); | |
fieldType.setStoreTermVectorPositions(true); | |
fieldType.setStoreTermVectorOffsets(true); | |
Date start = new Date(); | |
if (!Files.isReadable(indexPath)) { | |
System.out.println("Document directory '" + indexPath.toAbsolutePath() + "' does not exist or is not readable, please check the path"); | |
System.exit(1); | |
} | |
dir = FSDirectory.open(indexPath); | |
indexWriter = new IndexWriter(dir, icw); | |
ArrayList<FileModel> fileList = (ArrayList<FileModel>) extractFile(); | |
for (FileModel f : fileList) { | |
Document doc = new Document(); | |
doc.add(new Field("title", f.getTitle(), fieldType)); | |
doc.add(new Field("content", f.getContent(), fieldType)); | |
indexWriter.addDocument(doc); | |
} | |
indexWriter.commit(); | |
indexWriter.close(); | |
dir.close(); | |
Date end = new Date(); | |
log.info("索引文档完成,共耗时:" + (end.getTime() - start.getTime()) + "毫秒。"); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment