bholzer/lol.java

## lol.java
 /**
    * Extracts the text of an HTML document read from the given stream.
    * <p>
    * The whole stream is buffered in memory, decoded with the JVM default
    * charset and handed to the HTML parser; every parsed node is visited with a
    * {@code StringBean} and the strings it collects form the result. The stream
    * is closed before this method returns, even when reading or parsing fails.
    *
    * @param is an input stream with html file content; must not be {@code null}.
    * @return the text collected from the file content, or an empty string when
    *         the collector yields {@code null}; never {@code null}.
    * @throws IllegalArgumentException if {@code is} is {@code null}.
    * @throws IOException if reading from the stream fails.
    * @throws DocumentReadException if the parser reports a {@code ParserException}.
    */
   public String getContentAsText(InputStream is) throws IOException, DocumentReadException
   {
      if (is == null)
      {
         throw new IllegalArgumentException("InputStream is null.");
      }

      try
      {
         // Drain the stream completely. Only -1 marks the end of the stream; a
         // read that delivers 0 bytes must not cut the document short.
         ByteArrayOutputStream bos = new ByteArrayOutputStream();
         byte[] buffer = new byte[2048];
         int len;
         while ((len = is.read(buffer)) != -1)
         {
            bos.write(buffer, 0, len);
         }

         // NOTE(review): the bytes are decoded with the platform default charset,
         // so the result may differ between machines for non-ASCII content.
         String html = new String(bos.toByteArray());

         // Parse the markup and let the StringBean visitor collect the text.
         Parser parser = Parser.createParser(html, null);
         StringBean sb = new StringBean();
         parser.visitAllNodesWith(sb);

         String text = sb.getStrings();
         return (text != null) ? text : "";
      }
      catch (ParserException e)
      {
         // Surface parser failures as the reader's own exception type, keeping the cause.
         throw new DocumentReadException(e.getMessage(), e);
      }
      finally
      {
         // Best-effort close: a failure here must not mask the result or the
         // primary exception, so it is only traced.
         try
         {
            is.close();
         }
         catch (IOException e)
         {
            if (LOG.isTraceEnabled())
            {
               LOG.trace("An exception occurred: " + e.getMessage());
            }
         }
      }
   }

   /**
    * Extracts the text of an HTML document read from the given stream.
    * <p>
    * The {@code encoding} argument is accepted but not used: the call is
    * forwarded unchanged to {@link #getContentAsText(InputStream)}.
    *
    * @param is an input stream with html file content.
    * @param encoding the encoding name supplied by the caller; ignored.
    * @return whatever {@link #getContentAsText(InputStream)} returns for {@code is}.
    * @throws IOException propagated from the single-argument overload.
    * @throws DocumentReadException propagated from the single-argument overload.
    */
   public String getContentAsText(InputStream is, String encoding) throws IOException, DocumentReadException
   {
      // The encoding hint is intentionally left unused.
      final String extracted = getContentAsText(is);
      return extracted;
   }
	/**
	 * Extracts the text of an HTML document read from the given stream.
	 * <p>
	 * The whole stream is buffered in memory, decoded with the JVM default
	 * charset and handed to the HTML parser; every parsed node is visited with a
	 * {@code StringBean} and the strings it collects form the result. The stream
	 * is closed before this method returns, even when reading or parsing fails.
	 *
	 * @param is an input stream with html file content; must not be {@code null}.
	 * @return the text collected from the file content, or an empty string when
	 *         the collector yields {@code null}; never {@code null}.
	 * @throws IllegalArgumentException if {@code is} is {@code null}.
	 * @throws IOException if reading from the stream fails.
	 * @throws DocumentReadException if the parser reports a {@code ParserException}.
	 */
	public String getContentAsText(InputStream is) throws IOException, DocumentReadException
	{
		if (is == null)
		{
			throw new IllegalArgumentException("InputStream is null.");
		}

		try
		{
			// Drain the stream completely. Only -1 marks the end of the stream; a
			// read that delivers 0 bytes must not cut the document short.
			ByteArrayOutputStream bos = new ByteArrayOutputStream();
			byte[] buffer = new byte[2048];
			int len;
			while ((len = is.read(buffer)) != -1)
			{
				bos.write(buffer, 0, len);
			}

			// NOTE(review): the bytes are decoded with the platform default charset,
			// so the result may differ between machines for non-ASCII content.
			String html = new String(bos.toByteArray());

			// Parse the markup and let the StringBean visitor collect the text.
			Parser parser = Parser.createParser(html, null);
			StringBean sb = new StringBean();
			parser.visitAllNodesWith(sb);

			String text = sb.getStrings();
			return (text != null) ? text : "";
		}
		catch (ParserException e)
		{
			// Surface parser failures as the reader's own exception type, keeping the cause.
			throw new DocumentReadException(e.getMessage(), e);
		}
		finally
		{
			// Best-effort close: a failure here must not mask the result or the
			// primary exception, so it is only traced.
			try
			{
				is.close();
			}
			catch (IOException e)
			{
				if (LOG.isTraceEnabled())
				{
					LOG.trace("An exception occurred: " + e.getMessage());
				}
			}
		}
	}

	/**
	 * Extracts the text of an HTML document read from the given stream.
	 * <p>
	 * The {@code encoding} argument is accepted but not used: the call is
	 * forwarded unchanged to {@link #getContentAsText(InputStream)}.
	 *
	 * @param is an input stream with html file content.
	 * @param encoding the encoding name supplied by the caller; ignored.
	 * @return whatever {@link #getContentAsText(InputStream)} returns for {@code is}.
	 * @throws IOException propagated from the single-argument overload.
	 * @throws DocumentReadException propagated from the single-argument overload.
	 */
	public String getContentAsText(InputStream is, String encoding) throws IOException, DocumentReadException
	{
		// The encoding hint is intentionally left unused.
		final String extracted = getContentAsText(is);
		return extracted;
	}