Skip to content

Instantly share code, notes, and snippets.

@ser1zw
Created December 7, 2012 19:14
Show Gist options
  • Save ser1zw/4235678 to your computer and use it in GitHub Desktop.
Save ser1zw/4235678 to your computer and use it in GitHub Desktop.
JavaでHTMLからテキストを取得するサンプル
package sample;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
/**
 * Parser callback that accumulates the text content of an HTML document.
 *
 * <p>Every chunk of text reported through {@link #handleText(char[], int)} is
 * appended to an internal buffer, and each {@code <br>} reported through
 * {@link #handleSimpleTag(Tag, MutableAttributeSet, int)} is recorded as a
 * single {@code '\n'}. Start and end tags are ignored. The accumulated result
 * is available from {@link #getFormattedText()}.
 *
 * <p>The internal buffer is an unsynchronized {@link StringBuilder}; an
 * instance is meant to be fed by one parse on one thread.
 */
public class HtmlParserCallback extends ParserCallback {
    /** Text collected so far, in the order the callbacks were invoked. */
    private final StringBuilder buffer = new StringBuilder();

    /** Creates a callback with an empty text buffer. */
    public HtmlParserCallback() {
    }

    /**
     * Returns everything collected so far.
     *
     * @return the accumulated text, with one {@code '\n'} per {@code <br>} seen;
     *         an empty string if nothing has been reported yet
     */
    public String getFormattedText() {
        return buffer.toString();
    }

    /**
     * Appends a chunk of document text to the buffer unchanged.
     *
     * @param data the characters of the text chunk
     * @param pos  position reported by the caller (unused)
     */
    @Override
    public void handleText(char[] data, int pos) {
        buffer.append(data);
    }

    /**
     * Ignores start tags; they contribute nothing to the collected text.
     *
     * @param t   the tag being opened (unused)
     * @param a   the tag's attributes (unused)
     * @param pos position reported by the caller (unused)
     */
    @Override
    public void handleStartTag(Tag t, MutableAttributeSet a, int pos) {
        // Intentionally empty.
    }

    /**
     * Ignores end tags; they contribute nothing to the collected text.
     *
     * @param t   the tag being closed (unused)
     * @param pos position reported by the caller (unused)
     */
    @Override
    public void handleEndTag(Tag t, int pos) {
        // Intentionally empty.
    }

    /**
     * Records a line break for {@code <br>}; every other simple tag is ignored.
     *
     * @param t   the simple tag; a {@code null} tag is ignored
     * @param a   the tag's attributes (unused)
     * @param pos position reported by the caller (unused)
     */
    @Override
    public void handleSimpleTag(Tag t, MutableAttributeSet a, int pos) {
        // Constant-first comparison keeps a null tag from raising NullPointerException.
        if (Tag.BR.equals(t)) {
            buffer.append('\n');
        }
    }
}
package sample;
import javax.swing.text.html.parser.ParserDelegator;
import java.io.StringReader;
/**
 * Sample showing how to get the text out of an HTML string in Java.
 *
 * <p>A fixed HTML snippet is fed through {@link ParserDelegator} with an
 * {@code HtmlParserCallback}, and the text the callback collected is printed
 * to standard output.
 *
 * @see <a href="http://docs.oracle.com/javase/jp/6/api/javax/swing/text/html/parser/ParserDelegator.html">ParserDelegator</a>
 * @see <a href="http://docs.oracle.com/javase/jp/6/api/javax/swing/text/html/HTMLEditorKit.ParserCallback.html">HTMLEditorKit.ParserCallback</a>
 */
public class HtmlParseSample {
    /** The HTML snippet parsed by {@link #main(String[])}. */
    private static final String SAMPLE_HTML =
            "<div>"
            + "hoge<br>"
            + " hoge"
            + "</div>"
            + "ほげほげ<br>"
            + " fugafuga<br>"
            + " <pre>"
            + " hoge<br>"
            + " fuga"
            + "</pre>"
            + "<script type=\"text/javascript\">"
            + " alert('ALERT!');"
            + "</script>"
            + "<br/>"
            + "あばばばば";

    /**
     * Parses the sample HTML and prints the collected text.
     *
     * <p>Any exception raised while parsing or printing is reported with a
     * stack trace on standard error rather than propagated.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        HtmlParserCallback textCollector = new HtmlParserCallback();
        ParserDelegator delegator = new ParserDelegator();
        StringReader source = new StringReader(SAMPLE_HTML);

        try {
            // Third argument is ParserDelegator's ignoreCharSet flag.
            delegator.parse(source, textCollector, true);
            String plainText = textCollector.getFormattedText();
            System.out.println(plainText);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment