Skip to content

Instantly share code, notes, and snippets.

@ansjsun
Created July 16, 2013 03:21
Show Gist options
  • Save ansjsun/6005498 to your computer and use it in GitHub Desktop.
Save ansjsun/6005498 to your computer and use it in GitHub Desktop.
html文章正文抽取类.计算正文
package com.kuyun.nlp;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
import com.kuyun.nlp.util.PageDown;
/**
 * Extracts the main text of an HTML page.
 *
 * <p>The page is first cleaned with a fixed series of regular-expression
 * substitutions, then split into {@link Node}s: each node holds a run of text
 * characters plus the accumulated length of the tags that follow it. Every
 * node's score starts as its text length; walking the nodes in order, a node
 * joins the chain that ends at its predecessor whenever that chain's score
 * outweighs the length of the tags in between. The chain ending at the
 * highest-scoring node is reported as the page's main text.
 *
 * <p>NOTE: the whole document is lower-cased and every whitespace character is
 * dropped during scanning, so the extracted text is lower-case and contains no
 * whitespace.
 */
public class HtmlExtraction {

    // The patterns below are compiled once instead of on every call.
    // They are applied in declaration order; see clean(String).

    /** Any run of whitespace; collapsed to a single space. */
    private static final Pattern WHITESPACE_RUN = Pattern.compile("\\s+");

    /** Any remaining named entity of 2 to 5 letters; replaced by a space. */
    private static final Pattern OTHER_ENTITY = Pattern.compile("&[a-zA-Z]{2,5};");

    /** Self-closing {@code <wbr/>} tags. */
    private static final Pattern WBR_TAG = Pattern.compile("(?is)<wbr\\s*/>");

    /** HTML comments. */
    private static final Pattern COMMENT = Pattern.compile("(?is)<!--.*?-->");

    /** {@code <script>} elements including their content. */
    private static final Pattern SCRIPT_BLOCK = Pattern.compile("(?is)<script.*?>.*?</script>");

    /** {@code <style>} elements including their content. */
    private static final Pattern STYLE_BLOCK = Pattern.compile("(?is)<style.*?>.*?</style>");

    /**
     * Two or more consecutive anchor elements, each optionally followed by up
     * to six non-tag characters and further tags (presumably navigation bars
     * and link lists).
     */
    private static final Pattern LINK_RUN = Pattern
            .compile("(?is)(<a.*?>.*?</a>\\s*[^<]{0,6}\\s*(<[^a].*?>[\\|/]?\\s*)*){2,}+");

    /**
     * Downloads a sample page and prints the time, in milliseconds, taken to
     * run {@link #paser(String)} on it 1000 times.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String html = PageDown
                .getHtml("http://sports.sina.com.cn/g/pl/2013-07-15/12276671936.shtml");
        long start = System.currentTimeMillis();
        for (int i = 0; i < 1000; i++) {
            paser(html);
        }
        System.out.println(System.currentTimeMillis() - start);
    }

    /**
     * Extracts the main text of {@code html} and prints it to standard output,
     * preceded by one {@code score<TAB>text} line per text node.
     *
     * @param html the raw HTML document; must not be {@code null}
     */
    public static void paser(String html) {
        findMaxPath(tokenize(html));
    }

    /**
     * Extracts the main text of {@code html} and returns it without printing
     * anything.
     *
     * @param html the raw HTML document; must not be {@code null}
     * @return the extracted text (lower-case, whitespace removed); possibly empty
     */
    public static String extract(String html) {
        return bestPathText(tokenize(html));
    }

    /** Applies the clean-up substitutions in their fixed order. */
    private static String clean(String html) {
        String text = WHITESPACE_RUN.matcher(html).replaceAll(" ");
        text = text.replace("&lt;", "<").replace("&gt;", ">").replace("&nbsp;", " ");
        text = OTHER_ENTITY.matcher(text).replaceAll(" ");
        text = WBR_TAG.matcher(text).replaceAll("");
        text = COMMENT.matcher(text).replaceAll("");
        text = SCRIPT_BLOCK.matcher(text).replaceAll("");
        text = STYLE_BLOCK.matcher(text).replaceAll("");
        return LINK_RUN.matcher(text).replaceAll("");
    }

    /**
     * Cleans the document and splits it into text nodes. The returned list
     * always contains at least one node, which may be empty.
     */
    private static List<Node> tokenize(String html) {
        // Lower-casing makes the "<bod" search below case-insensitive.
        String doc = clean(html).toLowerCase(Locale.ROOT);

        // Begin right after the opening body tag when there is a complete one.
        int start = 0;
        int body = doc.indexOf("<bod");
        if (body != -1) {
            int close = findRightindex(body, doc);
            if (close != -1) {
                // +1 so the closing '>' itself is not scanned as text.
                start = close + 1;
            }
        }

        List<Node> nodes = new ArrayList<Node>();
        Node node = new Node();
        boolean tagSeen = false;
        int length = doc.length();
        for (int i = start; i < length; i++) {
            char c = doc.charAt(i);
            if (Character.isWhitespace(c)) {
                continue;
            }
            if (c == '<') {
                int right = findRightindex(i, doc);
                if (right == -1) {
                    // Unterminated tag: nothing scannable remains. Stopping
                    // here avoids rescanning the document forever.
                    break;
                }
                node.updateLength(right - i);
                i = right;
                tagSeen = true;
            } else {
                if (tagSeen) {
                    // First text character after one or more tags: the
                    // current node is complete, start a new one.
                    nodes.add(node);
                    node = new Node();
                    tagSeen = false;
                }
                node.append(c);
            }
        }
        nodes.add(node);
        return nodes;
    }

    /**
     * Prints one {@code score<TAB>text} line per node, then the text of the
     * best-scoring chain of nodes.
     *
     * @param list the text nodes in document order
     */
    private static void findMaxPath(List<Node> list) {
        for (Node node : list) {
            System.out.println(node.score + "\t" + node.text);
        }
        System.out.println(bestPathText(list));
    }

    /**
     * Scores the nodes in document order and returns the concatenated text of
     * the chain that ends at the highest-scoring node.
     *
     * @param list the text nodes in document order
     * @return the chain's text, or an empty string for an empty list
     */
    private static String bestPathText(List<Node> list) {
        if (list.isEmpty()) {
            return "";
        }

        // The first node is a candidate too; ties keep the earliest node.
        Node previous = list.get(0);
        Node best = previous;
        int maxScore = previous.score;
        for (int i = 1; i < list.size(); i++) {
            Node current = list.get(i);
            int score = current.walk(previous);
            if (maxScore < score) {
                maxScore = score;
                best = current;
            }
            previous = current;
        }

        // Follow the back-links to the start of the chain, recording forward
        // links on the way. A chain start is a node whose maxFrom is itself.
        Node node = best;
        while (node.maxFrom != node) {
            node.maxFrom.to = node;
            node = node.maxFrom;
        }

        StringBuilder sb = new StringBuilder(node.text);
        while ((node = node.to) != null) {
            sb.append(node.text);
        }
        return sb.toString();
    }

    /**
     * Finds the first {@code '>'} at or after {@code start}.
     *
     * @param start index to begin searching from
     * @param html  the text to search
     * @return the index of the {@code '>'}, or -1 if there is none
     */
    private static int findRightindex(int start, String html) {
        return html.indexOf('>', start);
    }

    /**
     * A run of text characters together with the accumulated length of the
     * tags recorded for it.
     */
    static class Node {
        /** The text characters of this node. */
        private StringBuilder text = new StringBuilder();
        /** Sum of the tag lengths passed to {@link #updateLength(int)}. */
        private int tagLength;
        /** Text length, later raised to the chain score by {@link #walk(Node)}. */
        private int score;
        /** Predecessor in the best chain; the node itself marks a chain start. */
        private Node maxFrom = this;
        /** Successor in the selected chain; set while tracing the chain back. */
        private Node to;

        /**
         * Adds the length of one tag to this node.
         *
         * @param length the length to add
         */
        public void updateLength(int length) {
            tagLength += length;
        }

        /**
         * Appends one text character and raises the score by one.
         *
         * @param c the character to append
         */
        public void append(char c) {
            text.append(c);
            this.score++;
        }

        /**
         * Decides whether this node should extend the chain ending at
         * {@code from}. It does so when that chain's score, reduced by
         * {@code from}'s tag length, still adds to this node's own score;
         * otherwise this node starts a new chain.
         *
         * @param from the node immediately preceding this one
         * @return this node's score after the decision
         */
        public int walk(Node from) {
            int chained = from.score + this.score - from.tagLength;
            if (chained > this.score) {
                this.maxFrom = from;
                this.score = chained;
            } else {
                this.maxFrom = this;
            }
            return this.score;
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment