-
-
Save juanjux/5d7f8b736fde2bbc8635c5f4f689b573 to your computer and use it in GitHub Desktop.
crash.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
b'/* \n * ScreenSlicer (TM)\n * Copyright (C) 2013-2015 Machine Publishers, LLC\n * ops@machinepublishers.com | screenslicer.com | machinepublishers.com\n * Cincinnati, Ohio, USA\n *\n * You can redistribute this program and/or modify it under the terms of the GNU Affero General Public\n * License version 3 as published by the Free Software Foundation.\n *\n * ScreenSlicer is made available under the terms of the GNU Affero General Public License version 3\n * with the following clarification and special exception:\n *\n * Linking ScreenSlicer statically or dynamically with other modules is making a combined work\n * based on ScreenSlicer. Thus, the terms and conditions of the GNU Affero General Public License\n * version 3 cover the whole combination.\n *\n * As a special exception, Machine Publishers, LLC gives you permission to link unmodified versions\n * of ScreenSlicer with independent modules to produce an executable, regardless of the license\n * terms of these independent modules, and to copy, distribute, and make available the resulting\n * executable under terms of your choice, provided that you also meet, for each linked independent\n * module, the terms and conditions of the license of that module. An independent module is a module\n * which is not derived from or based on ScreenSlicer. If you modify ScreenSlicer, you may not\n * extend this exception to your modified version of ScreenSlicer.\n *\n * "ScreenSlicer", "jBrowserDriver", "Machine Publishers", and "automatic, zero-config web scraping"\n * are trademarks of Machine Publishers, LLC.\n * \n * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without\n * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU\n * Affero General Public License version 3 for more details.\n * \n * You should have received a copy of the GNU Affero General Public License version 3 along with this\n * program. If not, see <http://www.gnu.org/licenses/>.\n * \n * For general details about how to investigate and report license violations, please see:\n * <https://www.gnu.org/licenses/gpl-violation.html> and email the author: ops@machinepublishers.com\n */\npackage com.screenslicer.core.scrape;\n\nimport java.util.ArrayList;\nimport java.util.Arrays;\nimport java.util.Collection;\nimport java.util.HashMap;\nimport java.util.HashSet;\nimport java.util.LinkedHashMap;\nimport java.util.List;\nimport java.util.Map;\n\nimport org.jsoup.nodes.Element;\nimport org.jsoup.nodes.Node;\nimport org.jsoup.select.NodeVisitor;\n\nimport com.screenslicer.api.datatype.HtmlNode;\nimport com.screenslicer.core.scrape.neural.NeuralNetManager;\nimport com.screenslicer.core.scrape.type.ComparableNode;\nimport com.screenslicer.core.util.NodeUtil;\n\npublic class Extract {\n private static final int SCORE_PENALTY = 10000;\n private static final int TWICE = 2;\n private static HashMap<Element, ComparableNode[]> nodesCache = new HashMap<Element, ComparableNode[]>();\n\n private static class TrainingData {\n private final ComparableNode target;\n private int finalMisses = 0;\n private boolean winner = false;\n private int winnerDistance = 0;\n private ComparableNode best = null;\n\n public TrainingData(ComparableNode target) {\n this.target = target;\n }\n }\n\n private static ComparableNode best(ComparableNode[] nodes, Integer[][] comparisonCache, Collection<Node> ignore, TrainingData trainingData) {\n int ignoreSize = ignore == null ? 0 : ignore.size();\n if (nodes.length - ignoreSize == 1) {\n if (ignore == null || ignore.isEmpty()) {\n return nodes[0];\n }\n for (int i = 0; i < nodes.length; i++) {\n if (!ignore.contains(nodes[i])) {\n return nodes[i];\n }\n }\n }\n if (comparisonCache == null) {\n comparisonCache = new Integer[nodes.length][nodes.length];\n }\n int adjustedLen = nodes.length - ignore.size();\n for (int failMax = 0; failMax < adjustedLen; failMax++) {\n Map<ComparableNode, Integer> winners = new HashMap<ComparableNode, Integer>();\n for (int i = 0; i < nodes.length; i++) {\n if (ignore != null && ignore.contains(nodes[i].node())) {\n continue;\n }\n boolean found = true;\n int fail = 0;\n for (int j = 0; j < nodes.length; j++) {\n if (ignore != null && ignore.contains(nodes[j].node())) {\n continue;\n }\n if (nodes[j] != null) {\n if (comparisonCache[i][j] == null) {\n int result = nodes[i].compare(nodes[j]);\n if (result != -1) {\n ++fail;\n }\n comparisonCache[i][j] = new Integer(result);\n comparisonCache[j][i] = new Integer(result * (-1));\n } else if (comparisonCache[i][j].intValue() != -1) {\n ++fail;\n }\n if (fail > failMax) {\n found = false;\n break;\n }\n }\n }\n if (found) {\n if (failMax == 0) {\n if (trainingData != null) {\n trainingData.winner = trainingData.target.equals(nodes[i]);\n trainingData.finalMisses = trainingData.winner ? 0 : 1;\n trainingData.winnerDistance = 0;\n trainingData.best = nodes[i];\n }\n return nodes[i];\n }\n winners.put(nodes[i], i);\n }\n }\n if (winners.size() == 1) {\n ComparableNode ret = winners.keySet().toArray(new ComparableNode[1])[0];\n if (trainingData != null) {\n trainingData.winner = trainingData.target.equals(ret);\n trainingData.finalMisses = trainingData.winner ? 0 : 1;\n trainingData.winnerDistance = failMax;\n trainingData.best = ret;\n }\n return ret;\n }\n if (!winners.isEmpty()) {\n int targetIndex = -1;\n ComparableNode[] winnersArray = winners.keySet().toArray(new ComparableNode[0]);\n if (trainingData != null) {\n trainingData.winnerDistance = failMax;\n if (winners.containsKey(trainingData.target)) {\n for (int i = 0; i < winnersArray.length; i++) {\n if (trainingData.target.equals(winnersArray[i])) {\n targetIndex = i;\n break;\n }\n }\n }\n if (targetIndex == -1) {\n trainingData.finalMisses = winners.size();\n trainingData.winner = false;\n }\n }\n for (int i = 0; i < winnersArray.length; i++) {\n boolean found = true;\n for (int j = 0; j < winnersArray.length; j++) {\n if (i != j) {\n int iCache = winners.get(winnersArray[i]);\n int jCache = winners.get(winnersArray[j]);\n if (comparisonCache[iCache][jCache] == null) {\n int result = winnersArray[i].compare(winnersArray[j]);\n comparisonCache[iCache][jCache] = new Integer(result);\n comparisonCache[jCache][iCache] = new Integer(result * (-1));\n }\n if (comparisonCache[iCache][jCache].intValue() != -1) {\n found = false;\n if (i != targetIndex) {\n break;\n } else if (trainingData != null) {\n ++trainingData.finalMisses;\n }\n }\n }\n }\n if (found) {\n if (trainingData != null) {\n trainingData.best = winnersArray[i];\n }\n if (targetIndex == i && trainingData != null) {\n trainingData.finalMisses = 0;\n trainingData.winner = true;\n return trainingData.target;\n } else if (targetIndex == -1 || targetIndex < i) {\n if (trainingData != null) {\n trainingData.winner = false;\n }\n return winnersArray[i];\n }\n } else if (targetIndex == i\n && trainingData != null\n && trainingData.best != null) {\n trainingData.winner = false;\n return trainingData.best;\n }\n }\n return null;\n }\n }\n return null;\n }\n\n public static ComparableNode[] trainInit(Element body, int page, int thread) {\n ComparableNode[] nodesArray = performInternal(body, page, null, null, null, thread);\n nodesCache.put(body, nodesArray);\n return nodesArray;\n }\n\n public static int train(Element body, int page, ComparableNode target, int targetIndex, int thread) {\n ComparableNode[] nodesArray = null;\n nodesArray = nodesCache.get(body);\n int score = 0;\n if (NeuralNetManager.instance(thread).isMulti()) {\n int votes = 0;\n final int majority = (NeuralNetManager.instance(thread).multiSize() / TWICE) + 1;\n boolean won = false;\n ComparableNode fallback = null;\n int[] distances = new int[NeuralNetManager.instance(thread).multiSize()];\n int curDistance = 0;\n Map<ComparableNode, Integer> votesMap = new HashMap<ComparableNode, Integer>();\n if (targetIndex < 0) {\n for (int i = 0; i < nodesArray.length; i++) {\n if (nodesArray[i].equals(target)) {\n targetIndex = i;\n break;\n }\n }\n }\n while (NeuralNetManager.instance(thread).hasNext()) {\n Integer[][] comparisonCache = new Integer[nodesArray.length][nodesArray.length];\n int distance = 0;\n for (int i = 0; i < nodesArray.length; i++) {\n if (!target.equals(nodesArray[i])) {\n int result = target.compare(nodesArray[i]);\n if (result != -1) {\n ++distance;\n }\n comparisonCache[targetIndex][i] = new Integer(result);\n comparisonCache[i][targetIndex] = new Integer(result * (-1));\n }\n }\n TrainingData trainingData = new TrainingData(target);\n ComparableNode tmp = best(nodesArray, comparisonCache, null, trainingData);\n if (tmp != null) {\n fallback = tmp;\n }\n if (trainingData.best != null) {\n if (!votesMap.containsKey(trainingData.best)) {\n votesMap.put(trainingData.best, new Integer(1));\n } else {\n votesMap.put(trainingData.best,\n new Integer(votesMap.get(trainingData.best).intValue() + 1));\n }\n }\n distance = (distance - trainingData.winnerDistance) + trainingData.finalMisses;\n NeuralNetManager.instance(thread).next();\n if (trainingData.winner) {\n ++votes;\n }\n if (votes == majority) {\n won = true;\n break;\n }\n distances[curDistance++] = distance;\n }\n NeuralNetManager.instance(thread).resetNext();\n if (!won) {\n int maxVotes = 0;\n ComparableNode maxComparableNode = null;\n for (Map.Entry<ComparableNode, Integer> entry : votesMap.entrySet()) {\n if (entry.getValue().intValue() == maxVotes) {\n maxComparableNode = null;\n } else if (entry.getValue().intValue() > maxVotes) {\n maxVotes = entry.getValue().intValue();\n maxComparableNode = entry.getKey();\n }\n }\n if (maxComparableNode == null) {\n maxComparableNode = fallback;\n }\n if (!target.equals(maxComparableNode)) {\n int totalDistance = 0;\n Arrays.sort(distances);\n for (int i = 0; i < majority; i++) {\n totalDistance += distances[i];\n }\n score += totalDistance + SCORE_PENALTY;\n }\n }\n } else {\n int distance = 0;\n Integer[][] comparisonCache = new Integer[nodesArray.length][nodesArray.length];\n if (targetIndex < 0) {\n for (int i = 0; i < nodesArray.length; i++) {\n if (nodesArray[i].equals(target)) {\n targetIndex = i;\n break;\n }\n }\n }\n for (int i = 0; i < nodesArray.length; i++) {\n if (!target.equals(nodesArray[i])) {\n int result = target.compare(nodesArray[i]);\n if (result != -1) {\n ++distance;\n }\n comparisonCache[targetIndex][i] = new Integer(result);\n comparisonCache[i][targetIndex] = new Integer(result * (-1));\n }\n }\n TrainingData trainingData = new TrainingData(target);\n best(nodesArray, comparisonCache, null, trainingData);\n score += (distance - trainingData.winnerDistance) + trainingData.finalMisses;\n score += trainingData.winner ? 0 : SCORE_PENALTY;\n }\n return score;\n }\n\n private static ComparableNode[] performInternal(final Element body, final int page,\n final HtmlNode matchResult, final HtmlNode matchParent, final Collection<Node> ignore, int thread) {\n final Map<Node, ComparableNode> nodes = new HashMap<Node, ComparableNode>();\n if (body != null) {\n body.traverse(new NodeVisitor() {\n @Override\n public void head(Node node, int depth) {\n int nonEmptyChildren = 0;\n for (Node child : node.childNodes()) {\n if (!NodeUtil.isEmpty(child)) {\n nonEmptyChildren++;\n }\n }\n if (!NodeUtil.isEmpty(node)\n && NodeUtil.isContent(node, matchResult, matchParent) && nonEmptyChildren > 0) {\n nodes.put(node, new ComparableNode(node, matchResult, matchParent, thread));\n }\n }\n\n @Override\n public void tail(Node node, int depth) {}\n });\n }\n return nodes.values().toArray(new ComparableNode[0]);\n }\n\n public static class Cache {\n public ComparableNode[] nodesCache = null;\n public Integer[][][] comparisonCache = null;\n }\n\n public static List<Node> perform(Element body, int page, Collection<Node> ignore,\n HtmlNode matchResult, HtmlNode matchParent, Cache cache, int thread) {\n Map<ComparableNode, Integer> votes = new LinkedHashMap<ComparableNode, Integer>();\n if (cache == null) {\n cache = new Cache();\n }\n if (cache.nodesCache == null) {\n cache.nodesCache = performInternal(body, page, matchResult, matchParent, ignore, thread);\n cache.comparisonCache = new Integer[NeuralNetManager.instance(thread).multiSize()]\n [cache.nodesCache.length][cache.nodesCache.length];\n }\n final int majority = (NeuralNetManager.instance(thread).multiSize() / TWICE) + 1;\n Node best = null;\n int cur = 0;\n NeuralNetManager.instance(thread).resetNext();\n while (NeuralNetManager.instance(thread).hasNext()) {\n ComparableNode winner = best(cache.nodesCache, cache.comparisonCache[cur++],\n new HashSet<Node>(ignore), null);\n NeuralNetManager.instance(thread).next();\n if (winner != null) {\n if (!votes.containsKey(winner)) {\n votes.put(winner, new Integer(1));\n } else {\n votes.put(winner, new Integer(votes.get(winner).intValue() + 1));\n }\n if (votes.get(winner).intValue() == majority) {\n best = winner.node();\n break;\n }\n }\n }\n if (best == null) {\n int bestVotes = 0;\n List<Node> bestNodes = new ArrayList<Node>();\n for (Map.Entry<ComparableNode, Integer> entry : votes.entrySet()) {\n int val = entry.getValue().intValue();\n if (val >= bestVotes) {\n if (val > bestVotes) {\n bestVotes = val;\n bestNodes.clear();\n }\n bestNodes.add(entry.getKey().node());\n }\n }\n return bestNodes;\n }\n return Arrays.asList(new Node[] { best });\n }\n}\n' |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment