juanjux/crash.py Secret

## crash.py
b'/* \n * ScreenSlicer (TM)\n * Copyright (C) 2013-2015 Machine Publishers, LLC\n * ops@machinepublishers.com | screenslicer.com | machinepublishers.com\n * Cincinnati, Ohio, USA\n *\n * You can redistribute this program and/or modify it under the terms of the GNU Affero General Public\n * License version 3 as published by the Free Software Foundation.\n *\n * ScreenSlicer is made available under the terms of the GNU Affero General Public License version 3\n * with the following clarification and special exception:\n *\n *   Linking ScreenSlicer statically or dynamically with other modules is making a combined work\n *   based on ScreenSlicer. Thus, the terms and conditions of the GNU Affero General Public License\n *   version 3 cover the whole combination.\n *\n *   As a special exception, Machine Publishers, LLC gives you permission to link unmodified versions\n *   of ScreenSlicer with independent modules to produce an executable, regardless of the license\n *   terms of these independent modules, and to copy, distribute, and make available the resulting\n *   executable under terms of your choice, provided that you also meet, for each linked independent\n *   module, the terms and conditions of the license of that module. An independent module is a module\n *   which is not derived from or based on ScreenSlicer. If you modify ScreenSlicer, you may not\n *   extend this exception to your modified version of ScreenSlicer.\n *\n * "ScreenSlicer", "jBrowserDriver", "Machine Publishers", and "automatic, zero-config web scraping"\n * are trademarks of Machine Publishers, LLC.\n * \n * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without\n * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU\n * Affero General Public License version 3 for more details.\n * \n * You should have received a copy of the GNU Affero General Public License version 3 along with this\n * program. If not, see <http://www.gnu.org/licenses/>.\n * \n * For general details about how to investigate and report license violations, please see:\n * <https://www.gnu.org/licenses/gpl-violation.html> and email the author: ops@machinepublishers.com\n */\npackage com.screenslicer.core.scrape;\n\nimport java.util.ArrayList;\nimport java.util.Arrays;\nimport java.util.Collection;\nimport java.util.HashMap;\nimport java.util.HashSet;\nimport java.util.LinkedHashMap;\nimport java.util.List;\nimport java.util.Map;\n\nimport org.jsoup.nodes.Element;\nimport org.jsoup.nodes.Node;\nimport org.jsoup.select.NodeVisitor;\n\nimport com.screenslicer.api.datatype.HtmlNode;\nimport com.screenslicer.core.scrape.neural.NeuralNetManager;\nimport com.screenslicer.core.scrape.type.ComparableNode;\nimport com.screenslicer.core.util.NodeUtil;\n\npublic class Extract {\n  private static final int SCORE_PENALTY = 10000;\n  private static final int TWICE = 2;\n  private static HashMap<Element, ComparableNode[]> nodesCache = new HashMap<Element, ComparableNode[]>();\n\n  private static class TrainingData {\n    private final ComparableNode target;\n    private int finalMisses = 0;\n    private boolean winner = false;\n    private int winnerDistance = 0;\n    private ComparableNode best = null;\n\n    public TrainingData(ComparableNode target) {\n      this.target = target;\n    }\n  }\n\n  private static ComparableNode best(ComparableNode[] nodes, Integer[][] comparisonCache, Collection<Node> ignore, TrainingData trainingData) {\n    int ignoreSize = ignore == null ? 0 : ignore.size();\n    if (nodes.length - ignoreSize == 1) {\n      if (ignore == null || ignore.isEmpty()) {\n        return nodes[0];\n      }\n      for (int i = 0; i < nodes.length; i++) {\n        if (!ignore.contains(nodes[i])) {\n          return nodes[i];\n        }\n      }\n    }\n    if (comparisonCache == null) {\n      comparisonCache = new Integer[nodes.length][nodes.length];\n    }\n    int adjustedLen = nodes.length - ignore.size();\n    for (int failMax = 0; failMax < adjustedLen; failMax++) {\n      Map<ComparableNode, Integer> winners = new HashMap<ComparableNode, Integer>();\n      for (int i = 0; i < nodes.length; i++) {\n        if (ignore != null && ignore.contains(nodes[i].node())) {\n          continue;\n        }\n        boolean found = true;\n        int fail = 0;\n        for (int j = 0; j < nodes.length; j++) {\n          if (ignore != null && ignore.contains(nodes[j].node())) {\n            continue;\n          }\n          if (nodes[j] != null) {\n            if (comparisonCache[i][j] == null) {\n              int result = nodes[i].compare(nodes[j]);\n              if (result != -1) {\n                ++fail;\n              }\n              comparisonCache[i][j] = new Integer(result);\n              comparisonCache[j][i] = new Integer(result * (-1));\n            } else if (comparisonCache[i][j].intValue() != -1) {\n              ++fail;\n            }\n            if (fail > failMax) {\n              found = false;\n              break;\n            }\n          }\n        }\n        if (found) {\n          if (failMax == 0) {\n            if (trainingData != null) {\n              trainingData.winner = trainingData.target.equals(nodes[i]);\n              trainingData.finalMisses = trainingData.winner ? 0 : 1;\n              trainingData.winnerDistance = 0;\n              trainingData.best = nodes[i];\n            }\n            return nodes[i];\n          }\n          winners.put(nodes[i], i);\n        }\n      }\n      if (winners.size() == 1) {\n        ComparableNode ret = winners.keySet().toArray(new ComparableNode[1])[0];\n        if (trainingData != null) {\n          trainingData.winner = trainingData.target.equals(ret);\n          trainingData.finalMisses = trainingData.winner ? 0 : 1;\n          trainingData.winnerDistance = failMax;\n          trainingData.best = ret;\n        }\n        return ret;\n      }\n      if (!winners.isEmpty()) {\n        int targetIndex = -1;\n        ComparableNode[] winnersArray = winners.keySet().toArray(new ComparableNode[0]);\n        if (trainingData != null) {\n          trainingData.winnerDistance = failMax;\n          if (winners.containsKey(trainingData.target)) {\n            for (int i = 0; i < winnersArray.length; i++) {\n              if (trainingData.target.equals(winnersArray[i])) {\n                targetIndex = i;\n                break;\n              }\n            }\n          }\n          if (targetIndex == -1) {\n            trainingData.finalMisses = winners.size();\n            trainingData.winner = false;\n          }\n        }\n        for (int i = 0; i < winnersArray.length; i++) {\n          boolean found = true;\n          for (int j = 0; j < winnersArray.length; j++) {\n            if (i != j) {\n              int iCache = winners.get(winnersArray[i]);\n              int jCache = winners.get(winnersArray[j]);\n              if (comparisonCache[iCache][jCache] == null) {\n                int result = winnersArray[i].compare(winnersArray[j]);\n                comparisonCache[iCache][jCache] = new Integer(result);\n                comparisonCache[jCache][iCache] = new Integer(result * (-1));\n              }\n              if (comparisonCache[iCache][jCache].intValue() != -1) {\n                found = false;\n                if (i != targetIndex) {\n                  break;\n                } else if (trainingData != null) {\n                  ++trainingData.finalMisses;\n                }\n              }\n            }\n          }\n          if (found) {\n            if (trainingData != null) {\n              trainingData.best = winnersArray[i];\n            }\n            if (targetIndex == i && trainingData != null) {\n              trainingData.finalMisses = 0;\n              trainingData.winner = true;\n              return trainingData.target;\n            } else if (targetIndex == -1 || targetIndex < i) {\n              if (trainingData != null) {\n                trainingData.winner = false;\n              }\n              return winnersArray[i];\n            }\n          } else if (targetIndex == i\n              && trainingData != null\n              && trainingData.best != null) {\n            trainingData.winner = false;\n            return trainingData.best;\n          }\n        }\n        return null;\n      }\n    }\n    return null;\n  }\n\n  public static ComparableNode[] trainInit(Element body, int page, int thread) {\n    ComparableNode[] nodesArray = performInternal(body, page, null, null, null, thread);\n    nodesCache.put(body, nodesArray);\n    return nodesArray;\n  }\n\n  public static int train(Element body, int page, ComparableNode target, int targetIndex, int thread) {\n    ComparableNode[] nodesArray = null;\n    nodesArray = nodesCache.get(body);\n    int score = 0;\n    if (NeuralNetManager.instance(thread).isMulti()) {\n      int votes = 0;\n      final int majority = (NeuralNetManager.instance(thread).multiSize() / TWICE) + 1;\n      boolean won = false;\n      ComparableNode fallback = null;\n      int[] distances = new int[NeuralNetManager.instance(thread).multiSize()];\n      int curDistance = 0;\n      Map<ComparableNode, Integer> votesMap = new HashMap<ComparableNode, Integer>();\n      if (targetIndex < 0) {\n        for (int i = 0; i < nodesArray.length; i++) {\n          if (nodesArray[i].equals(target)) {\n            targetIndex = i;\n            break;\n          }\n        }\n      }\n      while (NeuralNetManager.instance(thread).hasNext()) {\n        Integer[][] comparisonCache = new Integer[nodesArray.length][nodesArray.length];\n        int distance = 0;\n        for (int i = 0; i < nodesArray.length; i++) {\n          if (!target.equals(nodesArray[i])) {\n            int result = target.compare(nodesArray[i]);\n            if (result != -1) {\n              ++distance;\n            }\n            comparisonCache[targetIndex][i] = new Integer(result);\n            comparisonCache[i][targetIndex] = new Integer(result * (-1));\n          }\n        }\n        TrainingData trainingData = new TrainingData(target);\n        ComparableNode tmp = best(nodesArray, comparisonCache, null, trainingData);\n        if (tmp != null) {\n          fallback = tmp;\n        }\n        if (trainingData.best != null) {\n          if (!votesMap.containsKey(trainingData.best)) {\n            votesMap.put(trainingData.best, new Integer(1));\n          } else {\n            votesMap.put(trainingData.best,\n                new Integer(votesMap.get(trainingData.best).intValue() + 1));\n          }\n        }\n        distance = (distance - trainingData.winnerDistance) + trainingData.finalMisses;\n        NeuralNetManager.instance(thread).next();\n        if (trainingData.winner) {\n          ++votes;\n        }\n        if (votes == majority) {\n          won = true;\n          break;\n        }\n        distances[curDistance++] = distance;\n      }\n      NeuralNetManager.instance(thread).resetNext();\n      if (!won) {\n        int maxVotes = 0;\n        ComparableNode maxComparableNode = null;\n        for (Map.Entry<ComparableNode, Integer> entry : votesMap.entrySet()) {\n          if (entry.getValue().intValue() == maxVotes) {\n            maxComparableNode = null;\n          } else if (entry.getValue().intValue() > maxVotes) {\n            maxVotes = entry.getValue().intValue();\n            maxComparableNode = entry.getKey();\n          }\n        }\n        if (maxComparableNode == null) {\n          maxComparableNode = fallback;\n        }\n        if (!target.equals(maxComparableNode)) {\n          int totalDistance = 0;\n          Arrays.sort(distances);\n          for (int i = 0; i < majority; i++) {\n            totalDistance += distances[i];\n          }\n          score += totalDistance + SCORE_PENALTY;\n        }\n      }\n    } else {\n      int distance = 0;\n      Integer[][] comparisonCache = new Integer[nodesArray.length][nodesArray.length];\n      if (targetIndex < 0) {\n        for (int i = 0; i < nodesArray.length; i++) {\n          if (nodesArray[i].equals(target)) {\n            targetIndex = i;\n            break;\n          }\n        }\n      }\n      for (int i = 0; i < nodesArray.length; i++) {\n        if (!target.equals(nodesArray[i])) {\n          int result = target.compare(nodesArray[i]);\n          if (result != -1) {\n            ++distance;\n          }\n          comparisonCache[targetIndex][i] = new Integer(result);\n          comparisonCache[i][targetIndex] = new Integer(result * (-1));\n        }\n      }\n      TrainingData trainingData = new TrainingData(target);\n      best(nodesArray, comparisonCache, null, trainingData);\n      score += (distance - trainingData.winnerDistance) + trainingData.finalMisses;\n      score += trainingData.winner ? 0 : SCORE_PENALTY;\n    }\n    return score;\n  }\n\n  private static ComparableNode[] performInternal(final Element body, final int page,\n      final HtmlNode matchResult, final HtmlNode matchParent, final Collection<Node> ignore, int thread) {\n    final Map<Node, ComparableNode> nodes = new HashMap<Node, ComparableNode>();\n    if (body != null) {\n      body.traverse(new NodeVisitor() {\n        @Override\n        public void head(Node node, int depth) {\n          int nonEmptyChildren = 0;\n          for (Node child : node.childNodes()) {\n            if (!NodeUtil.isEmpty(child)) {\n              nonEmptyChildren++;\n            }\n          }\n          if (!NodeUtil.isEmpty(node)\n              && NodeUtil.isContent(node, matchResult, matchParent) && nonEmptyChildren > 0) {\n            nodes.put(node, new ComparableNode(node, matchResult, matchParent, thread));\n          }\n        }\n\n        @Override\n        public void tail(Node node, int depth) {}\n      });\n    }\n    return nodes.values().toArray(new ComparableNode[0]);\n  }\n\n  public static class Cache {\n    public ComparableNode[] nodesCache = null;\n    public Integer[][][] comparisonCache = null;\n  }\n\n  public static List<Node> perform(Element body, int page, Collection<Node> ignore,\n      HtmlNode matchResult, HtmlNode matchParent, Cache cache, int thread) {\n    Map<ComparableNode, Integer> votes = new LinkedHashMap<ComparableNode, Integer>();\n    if (cache == null) {\n      cache = new Cache();\n    }\n    if (cache.nodesCache == null) {\n      cache.nodesCache = performInternal(body, page, matchResult, matchParent, ignore, thread);\n      cache.comparisonCache = new Integer[NeuralNetManager.instance(thread).multiSize()]\n          [cache.nodesCache.length][cache.nodesCache.length];\n    }\n    final int majority = (NeuralNetManager.instance(thread).multiSize() / TWICE) + 1;\n    Node best = null;\n    int cur = 0;\n    NeuralNetManager.instance(thread).resetNext();\n    while (NeuralNetManager.instance(thread).hasNext()) {\n      ComparableNode winner = best(cache.nodesCache, cache.comparisonCache[cur++],\n          new HashSet<Node>(ignore), null);\n      NeuralNetManager.instance(thread).next();\n      if (winner != null) {\n        if (!votes.containsKey(winner)) {\n          votes.put(winner, new Integer(1));\n        } else {\n          votes.put(winner, new Integer(votes.get(winner).intValue() + 1));\n        }\n        if (votes.get(winner).intValue() == majority) {\n          best = winner.node();\n          break;\n        }\n      }\n    }\n    if (best == null) {\n      int bestVotes = 0;\n      List<Node> bestNodes = new ArrayList<Node>();\n      for (Map.Entry<ComparableNode, Integer> entry : votes.entrySet()) {\n        int val = entry.getValue().intValue();\n        if (val >= bestVotes) {\n          if (val > bestVotes) {\n            bestVotes = val;\n            bestNodes.clear();\n          }\n          bestNodes.add(entry.getKey().node());\n        }\n      }\n      return bestNodes;\n    }\n    return Arrays.asList(new Node[] { best });\n  }\n}\n'