AndyMoreland/profiling

## gistfile1.clj
(ns desktop.htmlparser.core
  (:require [clojure.contrib.string :as st]
            [infer.measures :only [sparse-dot-product] :as im]
            [clojure.contrib.profile :as p])
  (:import [org.htmlcleaner HtmlCleaner DomSerializer]
           [org.apache.commons.lang StringEscapeUtils]
           [org.w3c.dom Document Node]))

                                        ; TODO: make sure there are spaces after punctuation
                                        ; TODO: Replace &apos; correctly


(defn do-children [node func]
  "Analagous to running #map over a collection of children"
  (if (not (and node (.hasChildNodes node)))
    []
    (let [children (.getChildNodes node)]
      (doall (for [i (range 0 (.getLength children))]
               (p/prof :inside (func (.item children i))))))))

(defn children-matching [node pred]
  "Returns children matching a predicate"
  (p/prof :children-matching (filter pred (do-children node identity))))

(defn count-words [string]
  (count (st/split #" " string)))


(def bad-words
     ; Classes/IDs matching these are "bad"
     #"comment|comments|com|reviews|review|footer|meta|footnote|foot|navigation|side|sidebar|nav|container|module|answer")

(def good-words
     ; Classes/IDs matching these are "good"
     #"article|post|hentry|entry|content|text|body|article|main|post-text|entry-text")
(def punctuation
     #"[^A-Za-z0-9]")

(def div-weight-map {
                     :child-paragraphs 150
                     :bad-id -150
                     :bad-class -150
                     :good-id 75
                     :good-class 75
                     :num-chars 0.1
                     :commas 1
                     :inner-divs -50
                     :long-text? 100.0 } )

(defn weight-div [node]
  "Takes an HTML node and returns a map containing numeric 'facts' about the div that are important; will be cross-producted with the weight map above"
  (let [class (.getAttribute node "class")
        id (.getAttribute node "id")
        text-content (.getTextContent node)]
    {:child-paragraphs
     (p/prof :child-paragraphs (count (children-matching node #(= (.getNodeName %) "p"))))

     :bad-id
     (p/prof :bad-id (count (re-seq bad-words id)))

     :bad-class
     (p/prof :bad-class (count (re-seq bad-words class)))

     :good-id
     (p/prof :good-id (count (re-seq good-words id)))

     :good-class
     (p/prof :good-class (count (re-seq good-words class)))

                                        ;     :num-chars
                                        ;    (p/prof :num-chars (count text-content))

 ;    :commas
;     (p/prof :commas (count (filter #{\,} text-content)))

                                        ;:general-punctuation
                                        ;(p/prof :general-punctuation (count (re-seq punctuation (.getTextContent node))))

     :inner-divs
     (p/prof :inner-divs (count (children-matching node #(= (.getNodeName %) "div"))))

     :long-text?
     (p/prof :long-text? (if (> (count text-content) 30) 1.0 0.0))}))

(defn initialize-cleaner-props [cleaner]
  "Takes an HTMLCleaner object and makes it ignore comments, script tags, and style tags"
  (doto (.getProperties cleaner)
    (.setOmitComments true)
    (.setPruneTags "script,style")))

(defn elements [^Document dom ^String tag]
  "Returns a collection of elements matching the given tag name in the dom"
  (let [nodes (.getElementsByTagName dom tag)]
    (map identity
         (for [i (range 0 (.getLength nodes))]
           (.item nodes i)))))


(defn remove-children-matching [^Node node pred]
  "Removes children matching a predicate and returns the node"
  (doseq [child (children-matching node pred)] (.removeChild node child))
  node)

(defn attribute [^Node node ^String name]
  (let [attributes (.getAttributes node)
        attribute (if attributes (.getNamedItem attributes name) nil)]
    (if attribute (.getValue attribute) "")))

(defn dom [url]
  "Returns cleaned html source given url"
  (let [
        cleaner (HtmlCleaner.)
        props (initialize-cleaner-props cleaner)
        node (.clean cleaner (slurp url))
        serializer (DomSerializer. props)
        ]
    (.createDOM serializer node)))

;;;;;; These are the two functions that should really be called


(defn clean-content-divs [divs]
  (doseq [div divs]
    (remove-children-matching div
                              #(not (= 0
                                       (+ (count (re-seq bad-words (attribute % "class")))
                                          (count (re-seq bad-words (attribute % "id")))))))))

(defn find-content-div [dom]
  "Returns the node that most likely to contain the majority of the content of the page"
  (let [divs (elements dom "div")]
    (if (not (= 0 (count divs)))
      (let [content-div (apply max-key
                               (fn [div] (infer.measures/sparse-dot-product (weight-div div) div-weight-map))
                               divs)]
        (if (> (count (.getTextContent content-div)) 50) content-div nil)) nil)))


(defn clean-content [^String content]
  (p/prof :clean-content (st/lower-case (st/replace-re punctuation " " (st/replace-re #"&apos;" "'" (st/replace-re #"(\n|\r)+" " " (StringEscapeUtils/unescapeHtml content)))))))

(defn find-content [^String url]
  (let [content-div (find-content-div (dom url))]
    (if content-div (clean-content (.getTextContent content-div)) nil)))

(defn words [^String string]
  (st/split #" " string))

## profiling
             Name      mean       min       max     count       sum
        bad-class      9285         0     98000      3156  29305000
           bad-id      4298         0     65000      3156  13567000
 child-paragraphs    268490     53000  14012000      3156  847356000
children-matching    177528     38000   8650000      6312  1120559000
    clean-content   2953000   2953000   2953000         1   2953000
           commas  16994878      7000  96418000      3156  53635837000
       good-class      4702         0     61000      3156  14841000
          good-id      1426         0     43000      3156   4502000
       inner-divs    290466     58000  13731000      3156  916712000
           inside     17265      7000   5642000     36694  633551000
       long-text?      2604         0     45000      3156   8221000
"Elapsed time: 58654.433 msecs"
	(ns desktop.htmlparser.core
	(:require [clojure.contrib.string :as st]
	[infer.measures :only [sparse-dot-product] :as im]
	[clojure.contrib.profile :as p])
	(:import [org.htmlcleaner HtmlCleaner DomSerializer]
	[org.apache.commons.lang StringEscapeUtils]
	[org.w3c.dom Document Node]))

	; TODO: make sure there are spaces after punctuation
	; TODO: Replace ' correctly


	(defn do-children [node func]
	"Analagous to running #map over a collection of children"
	(if (not (and node (.hasChildNodes node)))
	[]
	(let [children (.getChildNodes node)]
	(doall (for [i (range 0 (.getLength children))]
	(p/prof :inside (func (.item children i))))))))

	(defn children-matching [node pred]
	"Returns children matching a predicate"
	(p/prof :children-matching (filter pred (do-children node identity))))

	(defn count-words [string]
	(count (st/split #" " string)))


	(def bad-words
	; Classes/IDs matching these are "bad"
	#"comment\|comments\|com\|reviews\|review\|footer\|meta\|footnote\|foot\|navigation\|side\|sidebar\|nav\|container\|module\|answer")

	(def good-words
	; Classes/IDs matching these are "good"
	#"article\|post\|hentry\|entry\|content\|text\|body\|article\|main\|post-text\|entry-text")
	(def punctuation
	#"[^A-Za-z0-9]")

	(def div-weight-map {
	:child-paragraphs 150
	:bad-id -150
	:bad-class -150
	:good-id 75
	:good-class 75
	:num-chars 0.1
	:commas 1
	:inner-divs -50
	:long-text? 100.0 } )

	(defn weight-div [node]
	"Takes an HTML node and returns a map containing numeric 'facts' about the div that are important; will be cross-producted with the weight map above"
	(let [class (.getAttribute node "class")
	id (.getAttribute node "id")
	text-content (.getTextContent node)]
	{:child-paragraphs
	(p/prof :child-paragraphs (count (children-matching node #(= (.getNodeName %) "p"))))

	:bad-id
	(p/prof :bad-id (count (re-seq bad-words id)))

	:bad-class
	(p/prof :bad-class (count (re-seq bad-words class)))

	:good-id
	(p/prof :good-id (count (re-seq good-words id)))

	:good-class
	(p/prof :good-class (count (re-seq good-words class)))

	; :num-chars
	; (p/prof :num-chars (count text-content))

	; :commas
	; (p/prof :commas (count (filter #{\,} text-content)))

	;:general-punctuation
	;(p/prof :general-punctuation (count (re-seq punctuation (.getTextContent node))))

	:inner-divs
	(p/prof :inner-divs (count (children-matching node #(= (.getNodeName %) "div"))))

	:long-text?
	(p/prof :long-text? (if (> (count text-content) 30) 1.0 0.0))}))

	(defn initialize-cleaner-props [cleaner]
	"Takes an HTMLCleaner object and makes it ignore comments, script tags, and style tags"
	(doto (.getProperties cleaner)
	(.setOmitComments true)
	(.setPruneTags "script,style")))

	(defn elements [^Document dom ^String tag]
	"Returns a collection of elements matching the given tag name in the dom"
	(let [nodes (.getElementsByTagName dom tag)]
	(map identity
	(for [i (range 0 (.getLength nodes))]
	(.item nodes i)))))


	(defn remove-children-matching [^Node node pred]
	"Removes children matching a predicate and returns the node"
	(doseq [child (children-matching node pred)] (.removeChild node child))
	node)

	(defn attribute [^Node node ^String name]
	(let [attributes (.getAttributes node)
	attribute (if attributes (.getNamedItem attributes name) nil)]
	(if attribute (.getValue attribute) "")))

	(defn dom [url]
	"Returns cleaned html source given url"
	(let [
	cleaner (HtmlCleaner.)
	props (initialize-cleaner-props cleaner)
	node (.clean cleaner (slurp url))
	serializer (DomSerializer. props)
	]
	(.createDOM serializer node)))

	;;;;;; These are the two functions that should really be called


	(defn clean-content-divs [divs]
	(doseq [div divs]
	(remove-children-matching div
	#(not (= 0
	(+ (count (re-seq bad-words (attribute % "class")))
	(count (re-seq bad-words (attribute % "id")))))))))

	(defn find-content-div [dom]
	"Returns the node that most likely to contain the majority of the content of the page"
	(let [divs (elements dom "div")]
	(if (not (= 0 (count divs)))
	(let [content-div (apply max-key
	(fn [div] (infer.measures/sparse-dot-product (weight-div div) div-weight-map))
	divs)]
	(if (> (count (.getTextContent content-div)) 50) content-div nil)) nil)))



	(defn clean-content [^String content]
	(p/prof :clean-content (st/lower-case (st/replace-re punctuation " " (st/replace-re #"'" "'" (st/replace-re #"(\n\|\r)+" " " (StringEscapeUtils/unescapeHtml content)))))))

	(defn find-content [^String url]
	(let [content-div (find-content-div (dom url))]
	(if content-div (clean-content (.getTextContent content-div)) nil)))

	(defn words [^String string]
	(st/split #" " string))
	Name mean min max count sum
	bad-class 9285 0 98000 3156 29305000
	bad-id 4298 0 65000 3156 13567000
	child-paragraphs 268490 53000 14012000 3156 847356000
	children-matching 177528 38000 8650000 6312 1120559000
	clean-content 2953000 2953000 2953000 1 2953000
	commas 16994878 7000 96418000 3156 53635837000
	good-class 4702 0 61000 3156 14841000
	good-id 1426 0 43000 3156 4502000
	inner-divs 290466 58000 13731000 3156 916712000
	inside 17265 7000 5642000 36694 633551000
	long-text? 2604 0 45000 3156 8221000
	"Elapsed time: 58654.433 msecs"