Last active
April 11, 2016 10:59
-
-
Save milindjagre/e6c51d16bf6a8d21e7f6810042cd27c9 to your computer and use it in GitHub Desktop.
This Java class parses the text of a Microsoft Word document paragraph by paragraph. It accepts an InputStream for the document and returns the extracted text with a newline appended after each paragraph.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* To change this license header, choose License Headers in Project Properties. | |
* To change this template file, choose Tools | Templates | |
* and open the template in the editor. | |
*/ | |
package com.milind.mr.worddoc; | |
/** | |
* | |
* @author milind | |
*/ | |
import java.io.IOException; | |
import java.io.InputStream; | |
import org.apache.commons.logging.Log; | |
import org.apache.commons.logging.LogFactory; | |
import org.apache.poi.hwpf.HWPFDocument; | |
import org.apache.poi.hwpf.extractor.WordExtractor; | |
public class WordParser { | |
private static final Log LOG = LogFactory.getLog(WordParser.class); | |
private StringBuilder currentString = null; | |
private long bytesRead = 0; | |
public String parseExcelData(InputStream is) { | |
try { | |
HWPFDocument doc = new HWPFDocument(is); | |
WordExtractor we = new WordExtractor(doc); | |
String[] paragraphs = we.getParagraphText(); | |
currentString = new StringBuilder(); | |
for (String para : paragraphs) { | |
currentString.append(para + "\n"); | |
} | |
is.close(); | |
} catch (IOException e) { | |
LOG.error("IO Exception : File not found " + e); | |
} | |
return currentString.toString(); | |
} | |
public long getBytesRead() { | |
return bytesRead; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment