Skip to content

Instantly share code, notes, and snippets.

@milindjagre
Last active April 11, 2016 10:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save milindjagre/e6c51d16bf6a8d21e7f6810042cd27c9 to your computer and use it in GitHub Desktop.
Save milindjagre/e6c51d16bf6a8d21e7f6810042cd27c9 to your computer and use it in GitHub Desktop.
This Java class extracts the text of a Microsoft Word document paragraph by paragraph. It accepts an InputStream for the document and returns the text as a single String, with a newline appended after each paragraph.
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package com.milind.mr.worddoc;
/**
*
* @author milind
*/
import java.io.IOException;
import java.io.InputStream;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
/**
 * Extracts the text of a Microsoft Word document through Apache POI's
 * {@link HWPFDocument} / {@link WordExtractor} API.
 *
 * <p>Parsing keeps no per-document state on the instance: every call to
 * {@link #parseExcelData(InputStream)} builds its result in a local buffer, so a
 * failed call can never return text left over from an earlier document.
 */
public class WordParser {

    private static final Log LOG = LogFactory.getLog(WordParser.class);

    // NOTE(review): nothing in this class ever updates this counter, so
    // getBytesRead() always reports 0. Confirm whether callers rely on it.
    private long bytesRead = 0;

    /**
     * Reads a Word document from the given stream and returns its paragraph text.
     *
     * <p>Despite its name (kept for compatibility with existing callers), this
     * method parses Word documents, not Excel workbooks.
     *
     * <p>The stream is always closed before this method returns, whether or not
     * parsing succeeded.
     *
     * @param is stream positioned at the start of the Word document; closed by this method
     * @return each paragraph string reported by the extractor followed by {@code "\n"},
     *         concatenated in order; an empty string if the document could not be
     *         read because of an {@link IOException} (the failure is logged)
     */
    public String parseExcelData(InputStream is) {
        StringBuilder text = new StringBuilder();
        try {
            HWPFDocument doc = new HWPFDocument(is);
            WordExtractor extractor = new WordExtractor(doc);
            for (String para : extractor.getParagraphText()) {
                text.append(para).append('\n');
            }
        } catch (IOException e) {
            // Pass the exception as the throwable argument so the stack trace is kept.
            LOG.error("Failed to read Word document from input stream", e);
            return "";
        } finally {
            closeQuietly(is);
        }
        return text.toString();
    }

    /**
     * Returns the value of the bytes-read counter.
     *
     * @return the counter value; currently always 0 because no code in this class updates it
     */
    public long getBytesRead() {
        return bytesRead;
    }

    /** Closes the stream if it is non-null, logging (not propagating) any close failure. */
    private static void closeQuietly(InputStream is) {
        if (is == null) {
            return;
        }
        try {
            is.close();
        } catch (IOException e) {
            // A close failure must not discard text that was already extracted.
            LOG.warn("Failed to close Word document input stream", e);
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment