Skip to content

Instantly share code, notes, and snippets.

@juanjux

juanjux/crash.py Secret

Created March 7, 2018 17:12
Show Gist options
  • Save juanjux/5d7f8b736fde2bbc8635c5f4f689b573 to your computer and use it in GitHub Desktop.
Save juanjux/5d7f8b736fde2bbc8635c5f4f689b573 to your computer and use it in GitHub Desktop.
crash.py
b'/* \n * ScreenSlicer (TM)\n * Copyright (C) 2013-2015 Machine Publishers, LLC\n * ops@machinepublishers.com | screenslicer.com | machinepublishers.com\n * Cincinnati, Ohio, USA\n *\n * You can redistribute this program and/or modify it under the terms of the GNU Affero General Public\n * License version 3 as published by the Free Software Foundation.\n *\n * ScreenSlicer is made available under the terms of the GNU Affero General Public License version 3\n * with the following clarification and special exception:\n *\n * Linking ScreenSlicer statically or dynamically with other modules is making a combined work\n * based on ScreenSlicer. Thus, the terms and conditions of the GNU Affero General Public License\n * version 3 cover the whole combination.\n *\n * As a special exception, Machine Publishers, LLC gives you permission to link unmodified versions\n * of ScreenSlicer with independent modules to produce an executable, regardless of the license\n * terms of these independent modules, and to copy, distribute, and make available the resulting\n * executable under terms of your choice, provided that you also meet, for each linked independent\n * module, the terms and conditions of the license of that module. An independent module is a module\n * which is not derived from or based on ScreenSlicer. If you modify ScreenSlicer, you may not\n * extend this exception to your modified version of ScreenSlicer.\n *\n * "ScreenSlicer", "jBrowserDriver", "Machine Publishers", and "automatic, zero-config web scraping"\n * are trademarks of Machine Publishers, LLC.\n * \n * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without\n * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU\n * Affero General Public License version 3 for more details.\n * \n * You should have received a copy of the GNU Affero General Public License version 3 along with this\n * program. If not, see <http://www.gnu.org/licenses/>.\n * \n * For general details about how to investigate and report license violations, please see:\n * <https://www.gnu.org/licenses/gpl-violation.html> and email the author: ops@machinepublishers.com\n */\npackage com.screenslicer.core.scrape;\n\nimport java.util.ArrayList;\nimport java.util.Arrays;\nimport java.util.Collection;\nimport java.util.HashMap;\nimport java.util.HashSet;\nimport java.util.LinkedHashMap;\nimport java.util.List;\nimport java.util.Map;\n\nimport org.jsoup.nodes.Element;\nimport org.jsoup.nodes.Node;\nimport org.jsoup.select.NodeVisitor;\n\nimport com.screenslicer.api.datatype.HtmlNode;\nimport com.screenslicer.core.scrape.neural.NeuralNetManager;\nimport com.screenslicer.core.scrape.type.ComparableNode;\nimport com.screenslicer.core.util.NodeUtil;\n\npublic class Extract {\n private static final int SCORE_PENALTY = 10000;\n private static final int TWICE = 2;\n private static HashMap<Element, ComparableNode[]> nodesCache = new HashMap<Element, ComparableNode[]>();\n\n private static class TrainingData {\n private final ComparableNode target;\n private int finalMisses = 0;\n private boolean winner = false;\n private int winnerDistance = 0;\n private ComparableNode best = null;\n\n public TrainingData(ComparableNode target) {\n this.target = target;\n }\n }\n\n private static ComparableNode best(ComparableNode[] nodes, Integer[][] comparisonCache, Collection<Node> ignore, TrainingData trainingData) {\n int ignoreSize = ignore == null ? 0 : ignore.size();\n if (nodes.length - ignoreSize == 1) {\n if (ignore == null || ignore.isEmpty()) {\n return nodes[0];\n }\n for (int i = 0; i < nodes.length; i++) {\n if (!ignore.contains(nodes[i])) {\n return nodes[i];\n }\n }\n }\n if (comparisonCache == null) {\n comparisonCache = new Integer[nodes.length][nodes.length];\n }\n int adjustedLen = nodes.length - ignore.size();\n for (int failMax = 0; failMax < adjustedLen; failMax++) {\n Map<ComparableNode, Integer> winners = new HashMap<ComparableNode, Integer>();\n for (int i = 0; i < nodes.length; i++) {\n if (ignore != null && ignore.contains(nodes[i].node())) {\n continue;\n }\n boolean found = true;\n int fail = 0;\n for (int j = 0; j < nodes.length; j++) {\n if (ignore != null && ignore.contains(nodes[j].node())) {\n continue;\n }\n if (nodes[j] != null) {\n if (comparisonCache[i][j] == null) {\n int result = nodes[i].compare(nodes[j]);\n if (result != -1) {\n ++fail;\n }\n comparisonCache[i][j] = new Integer(result);\n comparisonCache[j][i] = new Integer(result * (-1));\n } else if (comparisonCache[i][j].intValue() != -1) {\n ++fail;\n }\n if (fail > failMax) {\n found = false;\n break;\n }\n }\n }\n if (found) {\n if (failMax == 0) {\n if (trainingData != null) {\n trainingData.winner = trainingData.target.equals(nodes[i]);\n trainingData.finalMisses = trainingData.winner ? 0 : 1;\n trainingData.winnerDistance = 0;\n trainingData.best = nodes[i];\n }\n return nodes[i];\n }\n winners.put(nodes[i], i);\n }\n }\n if (winners.size() == 1) {\n ComparableNode ret = winners.keySet().toArray(new ComparableNode[1])[0];\n if (trainingData != null) {\n trainingData.winner = trainingData.target.equals(ret);\n trainingData.finalMisses = trainingData.winner ? 0 : 1;\n trainingData.winnerDistance = failMax;\n trainingData.best = ret;\n }\n return ret;\n }\n if (!winners.isEmpty()) {\n int targetIndex = -1;\n ComparableNode[] winnersArray = winners.keySet().toArray(new ComparableNode[0]);\n if (trainingData != null) {\n trainingData.winnerDistance = failMax;\n if (winners.containsKey(trainingData.target)) {\n for (int i = 0; i < winnersArray.length; i++) {\n if (trainingData.target.equals(winnersArray[i])) {\n targetIndex = i;\n break;\n }\n }\n }\n if (targetIndex == -1) {\n trainingData.finalMisses = winners.size();\n trainingData.winner = false;\n }\n }\n for (int i = 0; i < winnersArray.length; i++) {\n boolean found = true;\n for (int j = 0; j < winnersArray.length; j++) {\n if (i != j) {\n int iCache = winners.get(winnersArray[i]);\n int jCache = winners.get(winnersArray[j]);\n if (comparisonCache[iCache][jCache] == null) {\n int result = winnersArray[i].compare(winnersArray[j]);\n comparisonCache[iCache][jCache] = new Integer(result);\n comparisonCache[jCache][iCache] = new Integer(result * (-1));\n }\n if (comparisonCache[iCache][jCache].intValue() != -1) {\n found = false;\n if (i != targetIndex) {\n break;\n } else if (trainingData != null) {\n ++trainingData.finalMisses;\n }\n }\n }\n }\n if (found) {\n if (trainingData != null) {\n trainingData.best = winnersArray[i];\n }\n if (targetIndex == i && trainingData != null) {\n trainingData.finalMisses = 0;\n trainingData.winner = true;\n return trainingData.target;\n } else if (targetIndex == -1 || targetIndex < i) {\n if (trainingData != null) {\n trainingData.winner = false;\n }\n return winnersArray[i];\n }\n } else if (targetIndex == i\n && trainingData != null\n && trainingData.best != null) {\n trainingData.winner = false;\n return trainingData.best;\n }\n }\n return null;\n }\n }\n return null;\n }\n\n public static ComparableNode[] trainInit(Element body, int page, int thread) {\n ComparableNode[] nodesArray = performInternal(body, page, null, null, null, thread);\n nodesCache.put(body, nodesArray);\n return nodesArray;\n }\n\n public static int train(Element body, int page, ComparableNode target, int targetIndex, int thread) {\n ComparableNode[] nodesArray = null;\n nodesArray = nodesCache.get(body);\n int score = 0;\n if (NeuralNetManager.instance(thread).isMulti()) {\n int votes = 0;\n final int majority = (NeuralNetManager.instance(thread).multiSize() / TWICE) + 1;\n boolean won = false;\n ComparableNode fallback = null;\n int[] distances = new int[NeuralNetManager.instance(thread).multiSize()];\n int curDistance = 0;\n Map<ComparableNode, Integer> votesMap = new HashMap<ComparableNode, Integer>();\n if (targetIndex < 0) {\n for (int i = 0; i < nodesArray.length; i++) {\n if (nodesArray[i].equals(target)) {\n targetIndex = i;\n break;\n }\n }\n }\n while (NeuralNetManager.instance(thread).hasNext()) {\n Integer[][] comparisonCache = new Integer[nodesArray.length][nodesArray.length];\n int distance = 0;\n for (int i = 0; i < nodesArray.length; i++) {\n if (!target.equals(nodesArray[i])) {\n int result = target.compare(nodesArray[i]);\n if (result != -1) {\n ++distance;\n }\n comparisonCache[targetIndex][i] = new Integer(result);\n comparisonCache[i][targetIndex] = new Integer(result * (-1));\n }\n }\n TrainingData trainingData = new TrainingData(target);\n ComparableNode tmp = best(nodesArray, comparisonCache, null, trainingData);\n if (tmp != null) {\n fallback = tmp;\n }\n if (trainingData.best != null) {\n if (!votesMap.containsKey(trainingData.best)) {\n votesMap.put(trainingData.best, new Integer(1));\n } else {\n votesMap.put(trainingData.best,\n new Integer(votesMap.get(trainingData.best).intValue() + 1));\n }\n }\n distance = (distance - trainingData.winnerDistance) + trainingData.finalMisses;\n NeuralNetManager.instance(thread).next();\n if (trainingData.winner) {\n ++votes;\n }\n if (votes == majority) {\n won = true;\n break;\n }\n distances[curDistance++] = distance;\n }\n NeuralNetManager.instance(thread).resetNext();\n if (!won) {\n int maxVotes = 0;\n ComparableNode maxComparableNode = null;\n for (Map.Entry<ComparableNode, Integer> entry : votesMap.entrySet()) {\n if (entry.getValue().intValue() == maxVotes) {\n maxComparableNode = null;\n } else if (entry.getValue().intValue() > maxVotes) {\n maxVotes = entry.getValue().intValue();\n maxComparableNode = entry.getKey();\n }\n }\n if (maxComparableNode == null) {\n maxComparableNode = fallback;\n }\n if (!target.equals(maxComparableNode)) {\n int totalDistance = 0;\n Arrays.sort(distances);\n for (int i = 0; i < majority; i++) {\n totalDistance += distances[i];\n }\n score += totalDistance + SCORE_PENALTY;\n }\n }\n } else {\n int distance = 0;\n Integer[][] comparisonCache = new Integer[nodesArray.length][nodesArray.length];\n if (targetIndex < 0) {\n for (int i = 0; i < nodesArray.length; i++) {\n if (nodesArray[i].equals(target)) {\n targetIndex = i;\n break;\n }\n }\n }\n for (int i = 0; i < nodesArray.length; i++) {\n if (!target.equals(nodesArray[i])) {\n int result = target.compare(nodesArray[i]);\n if (result != -1) {\n ++distance;\n }\n comparisonCache[targetIndex][i] = new Integer(result);\n comparisonCache[i][targetIndex] = new Integer(result * (-1));\n }\n }\n TrainingData trainingData = new TrainingData(target);\n best(nodesArray, comparisonCache, null, trainingData);\n score += (distance - trainingData.winnerDistance) + trainingData.finalMisses;\n score += trainingData.winner ? 0 : SCORE_PENALTY;\n }\n return score;\n }\n\n private static ComparableNode[] performInternal(final Element body, final int page,\n final HtmlNode matchResult, final HtmlNode matchParent, final Collection<Node> ignore, int thread) {\n final Map<Node, ComparableNode> nodes = new HashMap<Node, ComparableNode>();\n if (body != null) {\n body.traverse(new NodeVisitor() {\n @Override\n public void head(Node node, int depth) {\n int nonEmptyChildren = 0;\n for (Node child : node.childNodes()) {\n if (!NodeUtil.isEmpty(child)) {\n nonEmptyChildren++;\n }\n }\n if (!NodeUtil.isEmpty(node)\n && NodeUtil.isContent(node, matchResult, matchParent) && nonEmptyChildren > 0) {\n nodes.put(node, new ComparableNode(node, matchResult, matchParent, thread));\n }\n }\n\n @Override\n public void tail(Node node, int depth) {}\n });\n }\n return nodes.values().toArray(new ComparableNode[0]);\n }\n\n public static class Cache {\n public ComparableNode[] nodesCache = null;\n public Integer[][][] comparisonCache = null;\n }\n\n public static List<Node> perform(Element body, int page, Collection<Node> ignore,\n HtmlNode matchResult, HtmlNode matchParent, Cache cache, int thread) {\n Map<ComparableNode, Integer> votes = new LinkedHashMap<ComparableNode, Integer>();\n if (cache == null) {\n cache = new Cache();\n }\n if (cache.nodesCache == null) {\n cache.nodesCache = performInternal(body, page, matchResult, matchParent, ignore, thread);\n cache.comparisonCache = new Integer[NeuralNetManager.instance(thread).multiSize()]\n [cache.nodesCache.length][cache.nodesCache.length];\n }\n final int majority = (NeuralNetManager.instance(thread).multiSize() / TWICE) + 1;\n Node best = null;\n int cur = 0;\n NeuralNetManager.instance(thread).resetNext();\n while (NeuralNetManager.instance(thread).hasNext()) {\n ComparableNode winner = best(cache.nodesCache, cache.comparisonCache[cur++],\n new HashSet<Node>(ignore), null);\n NeuralNetManager.instance(thread).next();\n if (winner != null) {\n if (!votes.containsKey(winner)) {\n votes.put(winner, new Integer(1));\n } else {\n votes.put(winner, new Integer(votes.get(winner).intValue() + 1));\n }\n if (votes.get(winner).intValue() == majority) {\n best = winner.node();\n break;\n }\n }\n }\n if (best == null) {\n int bestVotes = 0;\n List<Node> bestNodes = new ArrayList<Node>();\n for (Map.Entry<ComparableNode, Integer> entry : votes.entrySet()) {\n int val = entry.getValue().intValue();\n if (val >= bestVotes) {\n if (val > bestVotes) {\n bestVotes = val;\n bestNodes.clear();\n }\n bestNodes.add(entry.getKey().node());\n }\n }\n return bestNodes;\n }\n return Arrays.asList(new Node[] { best });\n }\n}\n'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment