Created
September 15, 2015 07:00
-
-
Save bholzer/71ef406d91c183a1bd49 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* Returns a text from html file content without user's tags and their bodies. | |
* | |
* @param is an input stream with html file content. | |
* @return The string only with text from file content. | |
*/ | |
public String getContentAsText(InputStream is) throws IOException, DocumentReadException | |
{ | |
if (is == null) | |
{ | |
throw new IllegalArgumentException("InputStream is null."); | |
} | |
String refined_text = new String(); | |
try | |
{ | |
byte[] buffer = new byte[2048]; | |
int len; | |
ByteArrayOutputStream bos = new ByteArrayOutputStream(); | |
while ((len = is.read(buffer)) > 0) | |
{ | |
bos.write(buffer, 0, len); | |
} | |
bos.close(); | |
String html = new String(bos.toByteArray()); | |
Parser parser = Parser.createParser(html, null); | |
StringBean sb = new StringBean(); | |
// read links or not | |
// sb.setLinks(true); | |
// extract text | |
parser.visitAllNodesWith(sb); | |
String text = sb.getStrings(); | |
refined_text = (text != null) ? text : ""; // delete(text); | |
} | |
catch (ParserException e) | |
{ | |
throw new DocumentReadException(e.getMessage(), e); | |
} | |
finally | |
{ | |
if (is != null) | |
{ | |
try | |
{ | |
is.close(); | |
} | |
catch (IOException e) | |
{ | |
if (LOG.isTraceEnabled()) | |
{ | |
LOG.trace("An exception occurred: " + e.getMessage()); | |
} | |
} | |
} | |
} | |
return refined_text; | |
} | |
public String getContentAsText(InputStream is, String encoding) throws IOException, DocumentReadException | |
{ | |
// Ignore encoding | |
return getContentAsText(is); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment