Created
November 18, 2010 03:13
-
-
Save AndyMoreland/704571 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns desktop.htmlparser.core | |
(:require [clojure.contrib.string :as st] | |
[infer.measures :only [sparse-dot-product] :as im] | |
[clojure.contrib.profile :as p]) | |
(:import [org.htmlcleaner HtmlCleaner DomSerializer] | |
[org.apache.commons.lang StringEscapeUtils] | |
[org.w3c.dom Document Node])) | |
; TODO: make sure there are spaces after punctuation | |
; TODO: Replace ' correctly | |
(defn do-children [node func] | |
"Analagous to running #map over a collection of children" | |
(if (not (and node (.hasChildNodes node))) | |
[] | |
(let [children (.getChildNodes node)] | |
(doall (for [i (range 0 (.getLength children))] | |
(p/prof :inside (func (.item children i)))))))) | |
(defn children-matching [node pred] | |
"Returns children matching a predicate" | |
(p/prof :children-matching (filter pred (do-children node identity)))) | |
(defn count-words [string] | |
(count (st/split #" " string))) | |
(def bad-words | |
; Classes/IDs matching these are "bad" | |
#"comment|comments|com|reviews|review|footer|meta|footnote|foot|navigation|side|sidebar|nav|container|module|answer") | |
(def good-words | |
; Classes/IDs matching these are "good" | |
#"article|post|hentry|entry|content|text|body|article|main|post-text|entry-text") | |
(def punctuation | |
#"[^A-Za-z0-9]") | |
(def div-weight-map { | |
:child-paragraphs 150 | |
:bad-id -150 | |
:bad-class -150 | |
:good-id 75 | |
:good-class 75 | |
:num-chars 0.1 | |
:commas 1 | |
:inner-divs -50 | |
:long-text? 100.0 } ) | |
(defn weight-div [node] | |
"Takes an HTML node and returns a map containing numeric 'facts' about the div that are important; will be cross-producted with the weight map above" | |
(let [class (.getAttribute node "class") | |
id (.getAttribute node "id") | |
text-content (.getTextContent node)] | |
{:child-paragraphs | |
(p/prof :child-paragraphs (count (children-matching node #(= (.getNodeName %) "p")))) | |
:bad-id | |
(p/prof :bad-id (count (re-seq bad-words id))) | |
:bad-class | |
(p/prof :bad-class (count (re-seq bad-words class))) | |
:good-id | |
(p/prof :good-id (count (re-seq good-words id))) | |
:good-class | |
(p/prof :good-class (count (re-seq good-words class))) | |
; :num-chars | |
; (p/prof :num-chars (count text-content)) | |
; :commas | |
; (p/prof :commas (count (filter #{\,} text-content))) | |
;:general-punctuation | |
;(p/prof :general-punctuation (count (re-seq punctuation (.getTextContent node)))) | |
:inner-divs | |
(p/prof :inner-divs (count (children-matching node #(= (.getNodeName %) "div")))) | |
:long-text? | |
(p/prof :long-text? (if (> (count text-content) 30) 1.0 0.0))})) | |
(defn initialize-cleaner-props [cleaner] | |
"Takes an HTMLCleaner object and makes it ignore comments, script tags, and style tags" | |
(doto (.getProperties cleaner) | |
(.setOmitComments true) | |
(.setPruneTags "script,style"))) | |
(defn elements [^Document dom ^String tag] | |
"Returns a collection of elements matching the given tag name in the dom" | |
(let [nodes (.getElementsByTagName dom tag)] | |
(map identity | |
(for [i (range 0 (.getLength nodes))] | |
(.item nodes i))))) | |
(defn remove-children-matching [^Node node pred] | |
"Removes children matching a predicate and returns the node" | |
(doseq [child (children-matching node pred)] (.removeChild node child)) | |
node) | |
(defn attribute [^Node node ^String name] | |
(let [attributes (.getAttributes node) | |
attribute (if attributes (.getNamedItem attributes name) nil)] | |
(if attribute (.getValue attribute) ""))) | |
(defn dom [url] | |
"Returns cleaned html source given url" | |
(let [ | |
cleaner (HtmlCleaner.) | |
props (initialize-cleaner-props cleaner) | |
node (.clean cleaner (slurp url)) | |
serializer (DomSerializer. props) | |
] | |
(.createDOM serializer node))) | |
;;;;;; These are the two functions that should really be called | |
(defn clean-content-divs [divs] | |
(doseq [div divs] | |
(remove-children-matching div | |
#(not (= 0 | |
(+ (count (re-seq bad-words (attribute % "class"))) | |
(count (re-seq bad-words (attribute % "id"))))))))) | |
(defn find-content-div [dom] | |
"Returns the node that most likely to contain the majority of the content of the page" | |
(let [divs (elements dom "div")] | |
(if (not (= 0 (count divs))) | |
(let [content-div (apply max-key | |
(fn [div] (infer.measures/sparse-dot-product (weight-div div) div-weight-map)) | |
divs)] | |
(if (> (count (.getTextContent content-div)) 50) content-div nil)) nil))) | |
(defn clean-content [^String content] | |
(p/prof :clean-content (st/lower-case (st/replace-re punctuation " " (st/replace-re #"'" "'" (st/replace-re #"(\n|\r)+" " " (StringEscapeUtils/unescapeHtml content))))))) | |
(defn find-content [^String url] | |
(let [content-div (find-content-div (dom url))] | |
(if content-div (clean-content (.getTextContent content-div)) nil))) | |
(defn words [^String string] | |
(st/split #" " string)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Name mean min max count sum | |
bad-class 9285 0 98000 3156 29305000 | |
bad-id 4298 0 65000 3156 13567000 | |
child-paragraphs 268490 53000 14012000 3156 847356000 | |
children-matching 177528 38000 8650000 6312 1120559000 | |
clean-content 2953000 2953000 2953000 1 2953000 | |
commas 16994878 7000 96418000 3156 53635837000 | |
good-class 4702 0 61000 3156 14841000 | |
good-id 1426 0 43000 3156 4502000 | |
inner-divs 290466 58000 13731000 3156 916712000 | |
inside 17265 7000 5642000 36694 633551000 | |
long-text? 2604 0 45000 3156 8221000 | |
"Elapsed time: 58654.433 msecs" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment