Skip to content

Instantly share code, notes, and snippets.

@bholzer
Created September 15, 2015 07:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bholzer/71ef406d91c183a1bd49 to your computer and use it in GitHub Desktop.
Save bholzer/71ef406d91c183a1bd49 to your computer and use it in GitHub Desktop.
/**
 * Extracts the plain text of an HTML document, decoding its bytes with the
 * platform default charset.
 *
 * @param is an input stream with the HTML content; it is closed before this method returns.
 * @return the text collected from the document, never {@code null}.
 * @throws IllegalArgumentException if {@code is} is {@code null}.
 * @throws IOException if reading the stream fails.
 * @throws DocumentReadException if the HTML parser fails; the parser exception is kept as cause.
 */
public String getContentAsText(InputStream is) throws IOException, DocumentReadException
{
    return extractText(is, null);
}

/**
 * Extracts the plain text of an HTML document, decoding its bytes with the given encoding.
 *
 * @param is an input stream with the HTML content; it is closed before this method returns.
 * @param encoding the charset name used to decode the bytes; when {@code null}, empty or
 *          unsupported, the platform default charset is used instead.
 * @return the text collected from the document, never {@code null}.
 * @throws IllegalArgumentException if {@code is} is {@code null}.
 * @throws IOException if reading the stream fails.
 * @throws DocumentReadException if the HTML parser fails; the parser exception is kept as cause.
 */
public String getContentAsText(InputStream is, String encoding) throws IOException, DocumentReadException
{
    return extractText(is, encoding);
}

/**
 * Reads the whole stream, decodes it and runs the HTML parser over it with a StringBean
 * visitor (left at its default settings) to collect the text. Always closes the stream.
 */
private String extractText(InputStream is, String encoding) throws IOException, DocumentReadException
{
    if (is == null)
    {
        throw new IllegalArgumentException("InputStream is null.");
    }
    try
    {
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        byte[] buffer = new byte[2048];
        int len;
        while ((len = is.read(buffer)) > 0)
        {
            bos.write(buffer, 0, len);
        }
        String html = decodeHtml(bos.toByteArray(), encoding);
        Parser parser = Parser.createParser(html, null);
        StringBean sb = new StringBean();
        parser.visitAllNodesWith(sb);
        String text = sb.getStrings();
        return (text != null) ? text : "";
    }
    catch (ParserException e)
    {
        throw new DocumentReadException(e.getMessage(), e);
    }
    finally
    {
        // Closing is best effort: a failure here must not mask the result or the
        // exception produced by the read/parse above.
        try
        {
            is.close();
        }
        catch (IOException e)
        {
            if (LOG.isTraceEnabled())
            {
                LOG.trace("An exception occurred: " + e.getMessage());
            }
        }
    }
}

/**
 * Decodes the raw bytes with the requested encoding, falling back to the platform
 * default charset when no usable encoding is supplied.
 */
private String decodeHtml(byte[] bytes, String encoding)
{
    if (encoding != null && encoding.length() > 0)
    {
        try
        {
            return new String(bytes, encoding);
        }
        catch (java.io.UnsupportedEncodingException e)
        {
            // Unknown charset name: keep going with the platform default rather than
            // failing a call that used to succeed when the encoding was ignored.
            if (LOG.isTraceEnabled())
            {
                LOG.trace("Unsupported encoding '" + encoding + "', using the platform default charset: "
                    + e.getMessage());
            }
        }
    }
    return new String(bytes);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment