Created
May 29, 2018 15:43
-
-
Save enginebai/b11d9a79fe74e88bee23ca7d922c98bd to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
public static BlockProperties parseWebContent(Parser parser) throws ParserException { | |
NodeList visualBlockNodeList = getVisualBlock(parser); | |
// Output.printNodeList( visualBlockNodeList ); | |
NodeList linkNodeList = findLinkBlock(visualBlockNodeList); | |
NodeList invalidNodeList = findInvalidBlock(visualBlockNodeList); | |
NodeList actionNodeList = findActionBlock(visualBlockNodeList); // NOTE: 因為動作標籤可能沒有包含文字,所以要獨立出來找 | |
Map<String, NodeList> blockNodeMap = new HashMap<String, NodeList>(); | |
blockNodeMap.put(VISUAL_BLOCK, visualBlockNodeList); | |
blockNodeMap.put(LINK_BLOCK, linkNodeList); | |
blockNodeMap.put(INVALID_BLOCK, invalidNodeList); | |
blockNodeMap.put(ACTION_BLOCK, actionNodeList); | |
List<BlockProperties> blockPropertiesList = getBlockProperties(blockNodeMap); | |
Map<BlockProperties, Double> propMap = new TreeMap<BlockProperties, Double>(); | |
for (int i = 0; i < blockPropertiesList.size(); i++) { | |
BlockProperties blockProp = blockPropertiesList.get(i); | |
// /* | |
if (blockProp.getProperties().equals(NORMAL_BLOCK) && blockProp.getBlockText().length() <= 0) | |
continue; | |
// v1: 0.3 | |
else if ((blockProp.getProperties().equals(LINK_BLOCK) && blockProp.getBlockTextRatio() >= 0.4) || | |
blockProp.getSubLinkTextRatio() >= 0.45) | |
continue; | |
// v1: 0.4 | |
else if ((blockProp.getProperties().equals(INVALID_BLOCK) && blockProp.getBlockTextRatio() >= 0.65) || | |
blockProp.getSubInvalidTextRatio() >= 0.4) | |
continue; | |
else if (blockProp.getProperties().equals(ACTION_BLOCK) || blockProp.getSubActionBlock() > 0) | |
continue; | |
// */ | |
// blockProp.print(); | |
// 找出正文區塊的計算特徵值公式 | |
double linkInvalidTextLen = blockProp.getSubInvalidTextLength() + blockProp.getSubLinkTextLength(); | |
double normalTextLen = blockProp.getBlockText().length() * (1.0 - blockProp.getBlockTextRatio()) - linkInvalidTextLen; | |
if (linkInvalidTextLen <= 0) | |
linkInvalidTextLen = 1.0; // 為了除法,如果等於零要轉成1 | |
double linkInvalidSubBlockNum = blockProp.getSubLinkBlock() + blockProp.getSubInvalidBlock(); | |
double normalSubBlock = (double) (blockProp.getSubBlockNum() - linkInvalidSubBlockNum); | |
if (linkInvalidSubBlockNum <= 0) | |
linkInvalidSubBlockNum = 1.0; | |
double weight = Math.pow(normalTextLen, 5) / (double) blockProp.getBlockText().length(); | |
weight /= Math.pow(10.0, 5); | |
// 子區块都是連結或是無效區块 | |
if (normalSubBlock == 0 && blockProp.getSubBlockNum() != 0) | |
weight /= (10.0 * Math.pow(blockProp.getSubBlockNum(), 2)); | |
else if (normalSubBlock != 0 && blockProp.getSubBlockNum() != 0) | |
weight *= (normalSubBlock / Math.pow(blockProp.getSubBlockNum(), 2)); | |
if (blockProp.getProperties().equals(NORMAL_BLOCK)) | |
weight *= 3.0; | |
else if (blockProp.getProperties().equals(INVALID_BLOCK)) | |
weight *= 1.2; | |
else if (blockProp.getProperties().equals(LINK_BLOCK)) | |
weight *= 1.8; | |
// 由視覺區块的class或id來判斷,包含article和content的字眼可提高權重值 | |
CompositeTag blockTag = (CompositeTag) blockProp.getBlockNode(); | |
String className = blockTag.getAttribute("class"); | |
String idName = blockTag.getAttribute("id"); | |
String checkName = null; | |
if (idName != null) | |
checkName = idName.toLowerCase(); | |
else if (className != null) | |
checkName = className.toLowerCase(); | |
if (checkName != null) { | |
checkName = checkName.toLowerCase().trim(); | |
if (!(checkName.contains("footer") || | |
checkName.contains("header") || | |
checkName.contains("counter") || | |
checkName.contains("banner")) || | |
checkName.contains("widget")) { | |
// System.out.println( "*Weight=" + weight ); | |
if ((checkName.contains("body") && | |
checkName.contains("post")) || | |
(checkName.contains("entry") && | |
checkName.contains("content")) || | |
checkName.contains("innertext") || | |
(checkName.contains("content") && | |
checkName.contains("article"))) | |
weight *= 1000.0; | |
else if (checkName.contains("content")) | |
weight *= 50.0; | |
else if (checkName.contains("article")) | |
weight *= 10.0; | |
else if (checkName.contains("text")) | |
weight *= 5.0; | |
// 有id的再加分 | |
if (idName != null) | |
weight *= 100.0; | |
// System.out.println( "Weight'=" + weight ); | |
} | |
} | |
propMap.put(blockProp, weight); | |
// System.out.println( "\t*Weight=" + weight ); | |
propMap = MapUtils.sortByValue(propMap, true); | |
} | |
int count = 0; // 為了取得第一個BlockProperties用的 | |
BlockProperties contentProp = null; | |
int commentIndex = 0; // 用來儲存回應的區塊索引,所有在回應以下的區塊都不能成為正文區块 | |
for (BlockProperties prop : propMap.keySet()) { | |
if (prop.getBlockText().length() <= 0) | |
continue; | |
try { | |
// 用來去掉回應區塊 | |
CompositeTag propNode = (CompositeTag) prop.getBlockNode(); | |
String className = propNode.getAttribute("class"); | |
String idName = propNode.getAttribute("id"); | |
String checkName = null; | |
if (idName != null) | |
checkName = idName; | |
else if (className != null) | |
checkName = className; | |
if (checkName != null) { | |
if (checkName.contains("comment") || checkName.contains("reply")) { | |
commentIndex = blockPropertiesList.indexOf(prop); | |
continue; | |
} else if (containTrimClassID(propNode)) | |
continue; | |
} | |
} | |
catch (NullPointerException e) { | |
e.printStackTrace(); | |
} | |
if (commentIndex != 0 && blockPropertiesList.indexOf(prop) > commentIndex) | |
continue; | |
if (count == 0) { | |
// System.out.println( "\n\n--> Wegith= " + propMap.get( prop ) ); prop.print(); | |
contentProp = prop; | |
count++; | |
} else | |
break; | |
} | |
// 找到正文區塊還有其子區块,判斷是否包含連結區塊或是特定class, id名稱,然後過濾掉。 | |
Node currentNode = contentProp.getBlockNode(); | |
String contentHtml = | |
org.apache.commons.lang3.StringUtils.replacePattern(currentNode.toHtml(), "\\s+", " ").toLowerCase().trim(); | |
// currentNode.toHtml().replaceAll( "\\s+", " " ).toLowerCase().trim(); | |
String contentText = WebCrawler.filterSpecialSymbol( | |
org.apache.commons.lang3.StringUtils.replacePattern(currentNode.toPlainTextString(), "\\s+", " ").toLowerCase().trim()); | |
// currentNode.toPlainTextString().replaceAll( "\\s+", " " ).trim() ); | |
// System.out.println( contentText.length() ); | |
String checkContentHtml = contentHtml; // check開頭的變數是給迴圈判斷正文區塊的子區塊用的 | |
String checkContentText = contentText; // 因為在迴圈中contentHtml和contentText的字串會變動,所以無法拿來判斷子區塊 | |
Map<String, Integer> trimTextMap = new TreeMap<String, Integer>(); | |
for (int i = visualBlockNodeList.indexOf(contentProp.getBlockNode()) + 1; | |
i < visualBlockNodeList.size(); i++ | |
) { | |
CompositeTag nextNode = (CompositeTag) visualBlockNodeList.elementAt(i); | |
String nextHtml = | |
org.apache.commons.lang3.StringUtils.replacePattern(nextNode.toHtml(), "\\s+", " ").toLowerCase().trim(); | |
// nextNode.toHtml().replaceAll( "\\s+", " " ).toLowerCase().trim(); | |
String nextText = WebCrawler.filterSpecialSymbol( | |
org.apache.commons.lang3.StringUtils.replacePattern(nextNode.toPlainTextString(), "\\s+", " ").toLowerCase().trim()); | |
// nextNode.toPlainTextString().replaceAll( "\\s+", " " ).trim() ).trim(); | |
if (checkContentHtml.contains(nextHtml) && | |
checkContentText.contains(nextText)) { | |
// System.out.println( "*" + nextHtml ); | |
// System.out.println( "\t" + nextText ); | |
// System.out.println( nextText.length() ); | |
if (containTrimClassID(nextNode)) { | |
// 預防過濾掉整個正文字串 | |
if (nextText.length() < (double) contentText.length() * (2.0 / 3.0)) { | |
// System.out.printf("\t過濾掉:%s\n", nextText ); | |
trimTextMap.put(nextHtml, nextText.length()); | |
} | |
} | |
if (linkNodeList.contains(nextNode)) { | |
int linkTextLen = getLinkTextLength(nextNode); | |
double linkTextRatio = (double) linkTextLen / (double) nextText.length(); | |
if (linkTextRatio >= 0.65) { | |
if (nextText.length() < (double) contentText.length() * (2.0 / 3.0)) { | |
trimTextMap.put(nextHtml, nextText.length()); | |
// System.out.println( "\t連結區: " + linkTextRatio ); | |
} | |
} | |
} | |
} else | |
break; | |
} | |
/** | |
* 不在上面直接過濾而且是由字串長的過濾到短的是因為 | |
* 如果要過濾的字串是很短的話,過濾掉後會影響到長字串的過濾 | |
* ex: 「我今天去了淡水,還有去淡水老街。」 | |
* 先過濾「淡水」 ==> 「我今天去了,還有去老街。」 | |
* 再過濾「淡水老街」 ==> 無法過濾掉「淡水老街」,因為字串剩下「老街」 | |
* 過濾不完全!! | |
* 但如果先過濾掉「淡水老街」 ==> 「我今天去了淡水,還有去。」 | |
* 在過濾掉「淡水」 ==> 「我今天去了,還有去。」 | |
* 完全過濾!! | |
*/ | |
trimTextMap = MapUtils.sortByValue(trimTextMap, true); | |
for (String html : trimTextMap.keySet()) { | |
// System.out.println( "Filter=" + html ); | |
contentHtml = contentHtml.replace(html, ""); | |
} | |
contentProp.setBlockHtml( | |
org.apache.commons.lang3.StringUtils.replacePattern( | |
org.apache.commons.lang3.StringUtils.replacePattern(contentHtml, HtmlTag.BR_REPLACE_REGEX, " "), HtmlTag.STYLE_REPLACE_REGEX, "")); | |
return contentProp; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment