Last active
August 29, 2015 14:20
-
-
Save Bloodsucker/62c43c938b80b1b7c8ab to your computer and use it in GitHub Desktop.
Bloque de código de un filtro de Boilerpipe que busca el final de la sección relevante de una página web usando palabras en inglés
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
public boolean process(TextDocument doc) | |
throws BoilerpipeProcessingException { | |
boolean changes = false; | |
// long t = System.currentTimeMillis(); | |
for (TextBlock tb : doc.getTextBlocks()) { | |
final int numWords = tb.getNumWords(); | |
if (numWords < 15) { | |
final String text = tb.getText().trim(); | |
final int len = text.length(); | |
if (len >= 8) { | |
final String textLC = text.toLowerCase(); | |
if (textLC.startsWith("comments") | |
|| startsWithNumber(textLC, len, " comments", | |
" users responded in") | |
|| textLC.startsWith("© reuters") | |
|| textLC.startsWith("please rate this") | |
|| textLC.startsWith("post a comment") | |
|| textLC.contains("what you think...") | |
|| textLC.contains("add your comment") | |
|| textLC.contains("add comment") | |
|| textLC.contains("reader views") | |
|| textLC.contains("have your say") | |
|| textLC.contains("reader comments") | |
|| textLC.contains("rätta artikeln") | |
|| textLC | |
.equals("thanks for your comments - this feedback is now closed")) { | |
tb.addLabel(DefaultLabels.INDICATES_END_OF_TEXT); | |
changes = true; | |
} | |
} | |
} | |
} | |
// timeSpent += System.currentTimeMillis() - t; | |
return changes; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment