Skip to content

Instantly share code, notes, and snippets.

@jdorrance
Created October 28, 2013 21:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jdorrance/7205540 to your computer and use it in GitHub Desktop.
Save jdorrance/7205540 to your computer and use it in GitHub Desktop.
(ns com.pennwell.wcm.services.importers.APNewsBulkImporter
(:import [com.pennwell.wcm.services.taxonomy.bulk BulkTaggingService]
[com.pennwell.wcm.services.importers APNewsBulkImporter]
[java.io File]
[java.net URL]
[javax.imageio ImageIO])
(:use [clojure.data.json :only [read-json json-str]]
[clojure.pprint :only [pprint]]
[clojure.tools.logging :only (info error debug warn)]
[com.pennwell.wcm.services.importers.core :only [now
format-date
add-minutes
add-hours
add-days
add-months
add-years
year
month
day
year-path
month-path
day-path
month-name
beacon
exists?
activate
post-to-sling
create-if-needed
sanitize-headline
sanitize-title
delete-from-sling
author-json]])
(:require [clj-http [client :as client]]
[clojure [string :as str] [data :as data]]
[clojure.data [xml :as xml]])
(:gen-class
:methods
[["writeAPNewsPages" [
^{org.apache.camel.Header "path"} java.lang.String
^{org.apache.camel.Header "feedUrl"} java.lang.String
^{org.apache.camel.Header "rampSite"} java.lang.String
^{org.apache.camel.Header "branding"} java.lang.String
]
void]
["writeAPNewsPagesMap" [ java.util.Map ] void]]
:init init
:state state
:constructors {[Object Object Object] []}))
;; REPL scratch area. Everything inside `comment` is never evaluated when the
;; namespace loads. The forms look up the "apWireNewsImporter" bean, pull the
;; tagging service out of its state, run one sample `.tag` call, and post a
;; local image file to a headerimage node on a localhost author instance.
(comment
(def me (.getBean com.pennwell.spring/*context* "apWireNewsImporter"))
(def tigger (:tagging-service (.state me)))
(.tag tigger "<p>this is a camera infrared test of turnkey vision systems</p>" "vsd" 1)
(client/post
"http://localhost:4502/content/pennwellqa/en/infinite-madness/2013/08/02/eu-raids-telecoms-firms-in-internet-probe/jcr:content/leftcolumn/article/headerimage"
{:multipart [{:name "./file" :content (clojure.java.io/file "/Volumes/UserData/Users/wwood/Desktop/illusion.png")}
{:name "_charset_" :content "utf-8"}] :basic-auth ["admin" "admin"]})
)
(defn -init
  "gen-class :init hook. Packs the three constructor arguments into the
  instance state map and passes no arguments on to the superclass
  constructor (the first element of the returned pair is empty).

  The second argument is read by its string keys \"username\" and
  \"password\"; the two values are stored together under :auth."
  [author-host credentials tagging-service]
  (let [{:strs [username password]} credentials
        state {:author-host author-host
               :auth [username password]
               :tagging-service tagging-service}]
    [[] state]))
(defn remove-seq-and-since
  "Trims `url` and strips every `&ConsumerReady=…`, `&sequenceNumber=…` and
  `&minDateTime=…` query parameter from it. Only parameters introduced by
  `&` are matched, so one that directly follows `?` is left in place."
  [url]
  (-> url
      str/trim
      (str/replace #"&(?:ConsumerReady|sequenceNumber|minDateTime)=[^&]*" "")))
(declare photo-caption get-text byline get-title)
(defn content-hash
  "Returns the SHA-1 hash, as a lower-case hex string, of the article's photo
  caption, text, byline and title (the printed form of the vector of those
  four values). The hash is stored with the article on import and compared
  later to decide whether the feed content has changed.

  The text is encoded as UTF-8 before hashing so the result does not depend
  on the JVM's default charset. For pure-ASCII content the hash is the same
  as before; on a JVM whose default charset is not UTF-8, articles with
  non-ASCII text will hash differently once and be re-imported."
  [item]
  (let [s (str ((juxt photo-caption get-text byline get-title) item))
        digest (.digest (java.security.MessageDigest/getInstance "sha1")
                        (.getBytes ^String s "UTF-8"))]
    ;; Mask each signed byte to 0..255 and print it as two hex digits.
    (apply str (map #(format "%02x" (bit-and % 0xff)) digest))))
(defn up-to-date?
  "True when the :pw:hash stored on the page at `path` (read from
  `<path>/jcr:content.json` via `author-json`) equals the content hash of
  `item`.

  Any exception yields false. Exceptions whose message contains \"404\" are
  treated as the expected 'page does not exist yet' case and are not logged;
  all others are logged at error level.

  The hash of `item` is computed once and reused for both the log line and
  the comparison. A missing or null stored hash is treated as the empty
  string instead of raising a NullPointerException."
  [item path config]
  (try
    (let [stored (str/trim (str (get (author-json (str path "/jcr:content.json") config)
                                     :pw:hash
                                     "")))
          current (content-hash item)]
      (info "comparing hashes" current stored)
      (= current stored))
    (catch Exception e
      (when-not (and (.getMessage e) (.contains (.getMessage e) "404"))
        (error e))
      false)))
(defn management-id
  "Returns the :ManagementId found under the entry's :NewsManagement key,
  or nil when either level is missing."
  [entry]
  (get-in entry [:NewsManagement :ManagementId]))
(defn management-seq
  "Returns the entry's :ManagementSequenceNumber (under :NewsManagement)
  converted with Integer/valueOf."
  [entry]
  (let [raw (get-in entry [:NewsManagement :ManagementSequenceNumber])]
    (Integer/valueOf raw)))
(defn pub-status
  "Returns the entry's :PublishingStatus from its :NewsManagement block:
  Usable, Embargoed, Withheld or Canceled."
  [entry]
  (-> entry :NewsManagement :PublishingStatus))
(defn pubdate
  "Parses the entry's :PublishingReleaseDateTime (under :NewsManagement) into
  a java.util.Date using the pattern yyyy-MM-dd'T'HH:mm:ssZ. Every \"Z\" in
  the value is first rewritten to \"-0000\" so that a trailing UTC designator
  is accepted by the numeric-offset pattern.

  If the value is missing or cannot be parsed, the failure is logged together
  with the pattern that was actually used, and `(now)` is returned instead."
  [entry]
  (let [s (-> entry :NewsManagement :PublishingReleaseDateTime)
        pattern "yyyy-MM-dd'T'HH:mm:ssZ"]
    (try
      (.parse (java.text.SimpleDateFormat. pattern)
              (str/replace s "Z" "-0000"))
      (catch Exception e
        (info s "does not match format" pattern)
        (now)))))
(defn distinct-keys?
  "Truthy when `maps` is a non-empty collection consisting only of maps and
  no key occurs in more than one of them. A collection of maps that have no
  keys at all counts as distinct (previously this case threw an arity
  exception from `distinct?`)."
  [maps]
  (and
    (seq maps)
    (every? map? maps)
    (let [all-keys (mapcat keys maps)]
      (or (empty? all-keys)
          (apply distinct? all-keys)))))
(defn indistinct-keys?
  "Truthy when `maps` is a non-empty collection consisting only of maps and
  at least one key occurs in more than one of them. A collection of maps
  that have no keys at all has no repeated key and yields a falsey result
  (previously this case threw an arity exception from `distinct?`)."
  [maps]
  (and
    (seq maps)
    (every? map? maps)
    (let [all-keys (mapcat keys maps)]
      (and (seq all-keys)
           (not (apply distinct? all-keys))))))
(defn maybe
  "Wraps the one-argument function `f` so that it is only called for truthy
  input; for nil or false the wrapper returns nil without calling `f`."
  [f]
  (fn [x]
    (if x
      (f x)
      nil)))
(defn comp?
  "Composes `fs` like `comp`, after wrapping each function with `maybe`, so
  a nil or false intermediate result makes every remaining wrapped function
  return nil instead of being called."
  [& fs]
  (->> fs
       (map maybe)
       (apply comp)))
;; Protocol for converting JDOM nodes into plain Clojure data: `from-jdom`
;; takes a single node and returns its converted form.
(defprotocol JDom (from-jdom [e]))
;; Implementations of `from-jdom` for the JDOM node types.
(extend-protocol JDom
;; Element -> single-entry map {<element name as keyword> <value>}.
org.jdom.Element (from-jdom [e]
(let [k (keyword (.getName e))
;; Converted children, with nil results (blank text nodes) dropped.
content (filter identity (seq (map from-jdom (.getContent e))))
;; Attribute name -> attribute value (both strings).
attributes (into {} (map from-jdom (.getAttributes e)))
;; Each test below inspects the ORIGINAL `content` / `attributes`; the value
;; being threaded is the result of the steps that already fired.
v (cond->> content
;; Attributes present: the attribute map is consed onto the front.
(seq attributes) (cons attributes)
;; No attributes and only strings (or no content at all): one joined string.
(and (not (seq attributes)) (every? string? content)) str/join
;; All children are maps with no repeated key: merge into a single map
;; (this also merges in the attribute map if it was consed on above).
(distinct-keys? content) (into {})
;; All children are maps and some key repeats: turn every map into its seq
;; of entries and concatenate them into one flat seq of [key value] entries.
(indistinct-keys? content) ((comp (partial apply concat) (partial map seq))))]
{k v}))
;; Attribute -> [name value] pair.
org.jdom.Attribute (from-jdom [a] [(.getName a) (.getValue a)])
;; Text / CDATA -> the text, or nil when it is blank.
org.jdom.Text (from-jdom [t] (let [result (.getText t)] (when (not (str/blank? result)) result)))
org.jdom.CDATA (from-jdom [t] (let [result (.getText t)] (when (not (str/blank? result)) result)))
;; Any other node type -> its string form.
java.lang.Object (from-jdom [t] (str t))
;; Document -> conversion of its root element.
org.jdom.Document (from-jdom [e] (from-jdom (.getRootElement e))))
(defn fetch-feed
  "Performs an HTTP GET on `url` with basic authentication and returns the
  response body.

  `basic-auth` is a [username password] pair. The one-argument form keeps the
  previous behaviour and uses the built-in feed account."
  ([url]
   ;; NOTE(security): these credentials are embedded in source control.
   ;; Prefer passing them in via the two-argument form from configuration.
   (fetch-feed url ["penwell_webfeeds" "@Pwf$61614"]))
  ([url basic-auth]
   (:body (client/get url {:basic-auth basic-auth}))))
(defn check-for
  "Returns a seq of the matches of `.*<goal>.*` in `field`, or nil when
  nothing matches. `goal` is spliced into the pattern unescaped, so it is
  interpreted as a regular expression."
  [goal field]
  (let [pattern (re-pattern (str ".*" goal ".*"))]
    (re-seq pattern field)))
(defn print-matches
  "Prints, one per line, every match that `check-for` finds for `goal` in
  `field`. Returns nil."
  [goal field]
  (dorun (map println (check-for goal field))))
(defn parse-feed
  "Fetches the feed at `url`, parses the body with a JDOM SAXBuilder, converts
  the document with `from-jdom` and returns the value under :feed."
  [url]
  (with-open [reader (java.io.StringReader. (fetch-feed url))]
    (let [builder (org.jdom.input.SAXBuilder.)
          document (.build builder reader)]
      (:feed (from-jdom document)))))
(defn select-or-get
  "Looks up `k` in converted feed data that may be either a map or a seq of
  [key value] entries.

  [k coll]       Returns `(k coll)` when that is truthy; otherwise the
                 concatenation of the values of all entries in `coll` whose
                 key is `k`.
  [k attrs coll] Returns `(k coll)` when that is truthy; otherwise, for every
                 map entry in `coll` whose key is `k` and whose value starts
                 with the attribute map `attrs`, the last element of that
                 value. Returns nil if an exception is thrown while the
                 result is being built.

  The three-argument form previously compared entry keys against a
  hard-coded :hl2 and ignored `k`; it now honours `k`, which gives the same
  result for callers that pass :hl2."
  ([k coll]
   (or
     (k coll)
     (mapcat val (filter (fn [[t & _]] (= t k)) coll))))
  ([k attrs coll]
   (try
     (or
       (k coll)
       ;; `for` is lazy: the catch below only covers exceptions raised while
       ;; the seq is constructed, not ones raised when a caller realises it.
       (for [x coll
             :when (and (instance? clojure.lang.MapEntry x)
                        (= (first x) k)
                        (= (first (last x)) attrs))]
         (last (last x))))
     (catch Exception e nil))))
(defn select-or-first
  "Returns `(k coll)` when that is truthy; otherwise the value of the first
  entry in `coll` whose key is `k` (nil when there is none)."
  [k coll]
  (let [matches-key? (fn [[t & _]] (= t k))]
    (or (k coll)
        (->> coll
             (filter matches-key?)
             (map val)
             first))))
(defn select
  "Returns a lazy seq of the values of every entry in `coll` whose key is
  `k`, in their original order."
  [k coll]
  (let [matches-key? (fn [[t & _]] (= t k))]
    (->> coll
         (filter matches-key?)
         (map val))))
(defn photo-id
  "Finds, among the :media-metadata items of the entry's first :media block,
  the first one whose \"name\" attribute is \"Role\" and whose \"value\"
  attribute equals `role`, splits its \"id\" attribute on \":\" and returns
  the second piece. Returns nil when no such item exists or the id contains
  no colon."
  [role entry]
  (let [metadata (->> entry :content :nitf :body :body.content
                      (select-or-first :media)
                      (select :media-metadata)
                      seq)
        id-parts (for [[{attr-name "name" attr-value "value" raw-id "id"}] metadata
                       :when (= [attr-name attr-value] ["Role" role])]
                   (str/split raw-id #":"))]
    (second (first id-parts))))
(defn photo-info
  "Returns the attribute map of the first :media-reference (in the entry's
  first :media block) whose \"id\" attribute contains the photo id that
  `photo-id` reports for `role`, or nil when there is none.

  When `photo-id` finds no id for `role`, nil is returned; previously that
  case threw a NullPointerException from `.contains` as soon as any reference
  carried an id. The photo id is looked up at most once, and only if some
  reference actually has an id."
  [role entry]
  (let [wanted-id (delay (photo-id role entry))
        references (->> entry :content :nitf :body :body.content
                        (select-or-first :media)
                        (select :media-reference)
                        seq)]
    (first
      (for [[{:strs [id] :as info}] references
            :when (and id
                       @wanted-id
                       (.contains id @wanted-id))]
        info))))
(defn photo-url
  "Returns the \"source\" attribute of the media reference that `photo-info`
  finds for `role`, or nil."
  [role entry]
  (let [details (photo-info role entry)]
    (get details "source")))
(defn photo-type
  "Returns the \"coding\" attribute of the media reference that `photo-info`
  finds for `role`, or nil."
  [role entry]
  (let [details (photo-info role entry)]
    (get details "coding")))
(defn photo-caption
  "Returns the trimmed string form of the :p value of the first
  :media-caption in the entry's first :media block. If anything throws, the
  exception is logged at error level and \"\" is returned."
  [entry]
  (try
    (let [media (->> entry :content :nitf :body :body.content
                     (select-or-first :media))
          caption (first (select :media-caption media))]
      (.trim (str (:p caption))))
    (catch Exception e
      (error e)
      "")))
(defn save-image
  "Downloads the image at `url` and re-encodes it with ImageIO in the format
  named by `ext` (a string or keyword) into a new temp file called
  apnews-*.<ext>. Returns that file; the caller is responsible for deleting
  it.

  Returns nil, after deleting the temp file, when ImageIO has no writer for
  the requested format (previously an empty file was returned in that case).
  If reading or writing throws, the temp file is deleted and the exception
  is rethrown (previously the temp file was left behind)."
  [url ext]
  (let [format-name (name ext)
        temp-file (File/createTempFile "apnews-" (str "." format-name))]
    (try
      ;; ImageIO/write returns false when no suitable writer exists.
      (if (ImageIO/write (ImageIO/read (URL. url)) format-name temp-file)
        temp-file
        (do (.delete temp-file)
            nil))
      (catch Throwable t
        (.delete temp-file)
        (throw t)))))
(defn render-a
  "Renders a converted anchor element as HTML. `m` is a map whose :a value is
  a sequence of an attribute map followed by the link text; the \"href\"
  attribute and the text are inserted into the markup without escaping."
  [m]
  (let [{anchor :a} m
        [{href "href"} link-text] anchor]
    (str "<a href='" href "'>" link-text "</a>")))
(defn get-text
  "Builds the article body as an HTML string: every :p found in the :block of
  the entry's body content is wrapped in <p>…</p> and the results are joined.

  A paragraph that is a string is used as is. A sequential paragraph is
  joined from its pieces, keeping strings, rendering pieces that have an :a
  key with `render-a`, and dropping everything else. Any other paragraph
  value is logged and rendered as an empty paragraph."
  [entry]
  (letfn [(render-piece [x]
            (cond
              (string? x) x
              (:a x) (render-a x)))
          (render-paragraph [p]
            (let [inner (cond
                          (string? p) p
                          (sequential? p) (str/join (map render-piece p))
                          :else (do (info "paragraph?" (pr-str p)) ""))]
              (str "<p>" inner "</p>")))]
    (->> entry :content :nitf :body :body.content
         (select-or-get :block)
         (select :p)
         (map render-paragraph)
         str/join)))
(defn byline
  "Joins, without a separator, every string found directly inside the :ByLine
  values of the entry's :ContentMetadata."
  [entry]
  (let [by-lines (select :ByLine (:ContentMetadata entry))
        strings-only (map #(filter string? %) by-lines)]
    (str/join (flatten strings-only))))
(defn get-author-name
  "Returns the entry's byline with a leading \"By \" / \"by \" removed.

  The prefix is only stripped at the start of the byline (after optional
  whitespace). The previous unanchored pattern removed the first \"by \"
  anywhere in the text, so a name such as \"Libby Smith\" was turned into
  \"LibSmith\"."
  [entry]
  (str/replace-first (byline entry) #"^\s*[Bb]y\s+" ""))
(defn get-title
  "Returns the headline text of the entry: the strings of the :hl2 element
  (looked up directly, or selected by the attribute map
  {\"id\" \"originalHeadline\"}) inside the :hedline of the entry's
  :body.head, joined without a separator. Returns \"\" when nothing is found."
  [entry]
  (let [head (-> entry :content :nitf :body :body.head)
        hedline (or (:hedline head) (select :hedline head))
        parts (select-or-get :hl2 {"id" "originalHeadline"} hedline)]
    (str/join (filter string? parts))))
(defn call-ramp
  "Sends the article text of `item` to the tagging service (`.tag` with the
  configured :ramp-site and the literal 1) and returns the result copied into
  a Clojure map.

  Tagging is best effort: if the call throws, an empty map is returned. The
  failure is logged at warn level together with the exception, so its cause
  is no longer lost."
  [item {:keys [tagging-service ramp-site] :as config}]
  (try
    (let [result (into {} (.tag tagging-service (get-text item) ramp-site 1))]
      (info "ramp tagging succeeded" ramp-site (get-title item) result)
      result)
    (catch Exception e
      (warn e "ramp tagging failed" ramp-site)
      {})))
(defn get-entries
  "Returns a lazy seq with one map per :entry of the converted `feed`; each
  entry's content is poured into an empty map."
  [feed]
  (let [raw-entries (select :entry feed)]
    (map #(into {} %) raw-entries)))
;; Example AP feed URL, kept for manual experiments; nothing in the code
;; shown in this file refers to it.
(def sample-url
"http://syndication.ap.org/AP.Distro.Feed/GetFeed.aspx?idList=31995,32008,32005,32003&idListType=products&maxItems=25&showInlineLinks=true&fullcontent=nitf")
;; not sure what to use for synopsis... just using first paragraph of text for now.
;; Searching the AP Webfeeds manual for "synopsis" reveals that they just use synopsis as synonym for headline.
(defn synopsis
  "Returns the first paragraph of the article body wrapped in <p>…</p>, or
  \"\" when the body has no paragraphs. A falsey first paragraph is rendered
  as an empty <p></p>."
  [entry]
  (let [paragraphs (->> entry :content :nitf :body :body.content
                        (select-or-get :block)
                        (select :p))
        wrap (fn [p] (str "<p>" (or p "") "</p>"))]
    (str/join (map wrap (take 1 paragraphs)))))
(defn taggable-text
  "Builds the taggable text of an article: the byline in its own paragraph,
  a single space, the article body HTML and finally `branding`."
  [branding record]
  (let [byline-paragraph (str "<p>" (byline record) "</p>")
        body (get-text record)]
    (str byline-paragraph " " body branding)))
(defn ap-article-xml-to-cq-json
  "Builds the nested map describing a cq:Page for the AP article `record`.

  `ramp-results` supplies the \"cq:tags\" and \"topiccenterdefault\" values,
  `branding` is appended to the taggable text, and `beacon` is stored as
  :pw:beacon. The page's :offTime is the publication date plus 90 days and
  12 hours, shifted by a random offset between -30 and +29 minutes. When the
  record has a non-blank author name, :authorname and :showAuthor are added
  to the article node."
  [record ramp-results branding beacon]
  (let [publication-date (pubdate record)
        off-time (-> publication-date
                     (add-days 90)
                     (add-hours 12)
                     (add-minutes (- (rand-int 60) 30)))
        header-image {:enlarge "false"
                      :alt (photo-caption record)
                      :imageRotate "0"
                      :jcr:primaryType "nt:unstructured"
                      :jcr:description (photo-caption record)
                      :sling:resourceType "pwwcm/components/content/common/images/clicktoenlargeimage"}
        thumbnail-image {:imageRotate "0"
                         :jcr:primaryType "nt:unstructured"
                         :sling:resourceType "foundation/components/image"}
        ratings {:sling:resourceType "collab/commons/components/ratings"
                 :jcr:primaryType "nt:unstructured"}
        article {:synopsis (synopsis record)
                 :taggabletext {:text (taggable-text branding record)
                                :jcr:primaryType "nt:unstructured"}
                 :jcr:primaryType "nt:unstructured"
                 :headerimage header-image
                 :thumbnailimage thumbnail-image
                 :ratings ratings}
        comments {:sling:resourceType "collab/commons/components/comments"
                  :jcr:primaryType "nt:unstructured"}
        left-column {:sling:resourceType "foundation/components/parsys"
                     :jcr:primaryType "nt:unstructured"
                     :article article
                     :comments comments}
        page-content {:cq:tags (into [] (get ramp-results "cq:tags" ""))
                      :pw:domain "article"
                      :offTime (format-date off-time)
                      :categoryThreshold "1"
                      :sling:resourceType "pwwcm/components/page/article"
                      :jcr:primaryType "cq:PageContent"
                      :cq:template "/apps/pwwcm/templates/article"
                      :pw:hash (content-hash record)
                      :pw:beacon beacon
                      :pw:apid (management-id record)
                      :leftcolumn left-column
                      :pw:publicationDate (format-date publication-date)
                      :jcr:mixinTypes ["mix:versionable"]
                      :jcr:title (sanitize-title (get-title record))
                      :topiccenterdefault (into [] (get ramp-results "topiccenterdefault" ""))}
        page {:jcr:primaryType "cq:Page"
              :jcr:content page-content}
        author-name (get-author-name record)]
    (if (str/blank? author-name)
      page
      (update-in page
                 [:jcr:content :leftcolumn :article]
                 assoc
                 :authorname author-name
                 :showAuthor true))))
(defn update-seq-and-since
  "Stores the feed position under `<path>/jcr:content/last-import` by posting
  an nt:unstructured node holding :since and :sequence-number."
  [path sequence-number since config]
  (let [parent (str path "/jcr:content")
        node {:since since
              :sequence-number sequence-number
              :jcr:primaryType "nt:unstructured"}]
    (post-to-sling parent "last-import" node identity config)))
(defn sequencing
  "Reads a value from the feed's \"FeedSequencing\" property group.

  Takes the feed's first :Property, picks the first nested :Property whose
  leading item equals [\"Name\" \"FeedSequencing\"], and among that group's
  own :Property children finds the first one whose \"Name\" is `key-name`.
  Returns what that child holds under `value-name`, or nil when no child
  matches."
  [feed key-name value-name]
  (let [feed-sequencing? (fn [[y]] (= y ["Name" "FeedSequencing"]))
        candidates (->> feed
                        (select :Property)
                        first
                        (select :Property)
                        (filter feed-sequencing?)
                        first
                        (select :Property))
        matches (for [[x] candidates
                      :when (= (get x "Name") key-name)]
                  x)]
    (get (first matches) value-name)))
;; REPL scratch, never evaluated: posts a previously downloaded temp image
;; file to a headerimage node on a localhost author instance.
(comment (client/post
"http://localhost:4502/content/pennwellqa/en/infinite-madness/2013/08/02/eu-raids-telecoms-firms-in-internet-probe/jcr:content/leftcolumn/article/headerimage"
{:multipart [{:name "./file" :content (clojure.java.io/file "/Volumes/UserData/Users/wwood/Downloads/apache-tomcat-7.0.37/temp/apnews-7813838245316703544.jpg")}
{:name "_charset_" :content "utf-8"}] :basic-auth ["admin" "admin"]}))
(defn post-article-image
  "Uploads one image of the AP `entry` to the article page at `path`.

  `image-type` is :header (AP role \"Preview\", stored on the headerimage
  node) or :thumbnail (AP role \"Thumbnail\", stored on the thumbnailimage
  node). Nothing is posted when the entry has no image for that role, or when
  the photo id already stored on the node (:pw:apPhotoId) equals the entry's
  photo id. Otherwise the image file is posted first and the photo id second.
  The downloaded temp file is always deleted afterwards.

  Image handling is best effort: every exception is logged at error level
  and swallowed. This now includes failures while locating or downloading
  the image, which previously escaped to the caller and stopped it before
  the page was activated."
  [image-type path entry config]
  (let [ap-name (image-type {:header "Preview" :thumbnail "Thumbnail"})
        cq-name (image-type {:header "/jcr:content/leftcolumn/article/headerimage"
                             :thumbnail "/jcr:content/leftcolumn/article/thumbnailimage"})]
    (try
      (when-let [source-url (photo-url ap-name entry)]
        (when-let [ext (photo-type ap-name entry)]
          (when-let [temp-file (save-image source-url ext)]
            (try
              (let [url (str (:author-host config) path cq-name)
                    ap-id (str/trim (str (photo-id ap-name entry)))
                    existing-ap-id (str/trim (str (:pw:apPhotoId (author-json (str path cq-name ".0.json") config))))]
                (if (= ap-id existing-ap-id)
                  (info "photo unchanged, not reposting")
                  (do
                    (info "posting" (name image-type) "image" temp-file "to" url)
                    (client/post url
                                 {:multipart [{:name "./file" :content temp-file}
                                              {:name "_charset_" :content "utf-8"}]
                                  :basic-auth (:auth config)})
                    (info "posting photo id" ap-id)
                    (client/post url
                                 {:form-params {:pw:apPhotoId ap-id}
                                  :basic-auth (:auth config)}))))
              (finally (.delete temp-file))))))
      (catch Exception e (error e)))))
(defn import-entry
  "Imports a new AP `entry` as a page below `path`.

  The entry is skipped when its title is blank, when the sanitised headline
  is blank, or when the article text is shorter than 120 characters.
  Otherwise the page is posted to Sling, its header and thumbnail images are
  uploaded, the page is activated and `import-count` (an atom) is
  incremented. Any exception is logged at error level and swallowed.

  The tagging service is only called for entries that pass validation, as
  `import-existing-entry` already does, and the sanitised headline and the
  page path are computed once."
  [time-of-import path entry branding import-count {:keys [base-path] :as config}]
  (try
    (let [title (get-title entry)
          ;; Only sanitise a usable title, as the original short-circuit did.
          headline (when-not (str/blank? title) (sanitize-headline title))]
      (when-not (or (str/blank? headline)
                    (< (count (get-text entry)) 120))
        (info "starting import of" title)
        (let [page-path (str path "/" headline)
              ramp-results (call-ramp entry config)]
          (post-to-sling
            path
            headline
            (ap-article-xml-to-cq-json entry ramp-results branding (beacon base-path time-of-import))
            identity
            config)
          (doseq [image-type [:header :thumbnail]]
            (post-article-image image-type page-path entry config))
          (activate page-path config)
          (swap! import-count inc))))
    (catch Exception e (error e))))
(defn import-existing-entry
  "Updates an already imported article from a newer version of the AP
  `entry`. `existing` is a map whose :path is the page's jcr:content node.

  Nothing is done when the stored content hash still matches, when the title
  or its sanitised headline is blank, or when the article text is shorter
  than 120 characters. Otherwise four form posts update, in order, the page
  properties (title, tags, topic center, hash), the taggable text, the
  header image caption and the synopsis; then both images are re-posted, the
  page is activated and `import-count` (an atom) is incremented. Any
  exception is logged at error level and swallowed."
  [time-of-import existing entry branding import-count {:keys [author-host auth] :as config}]
  (letfn [(post-form [node-path fields]
            ;; Every update is a form post that also carries _charset_.
            (client/post (str author-host node-path)
                         {:basic-auth auth
                          :form-params (assoc fields "_charset_" "utf-8")}))]
    (try
      (let [content-node-path (:path existing)
            path (str/replace-first content-node-path "/jcr:content" "")
            article-node-path (str content-node-path "/leftcolumn/article")
            taggable-text-node-path (str article-node-path "/taggabletext")
            header-image-node-path (str article-node-path "/headerimage")]
        (info "checking hash")
        (if (up-to-date? entry path config)
          (info path "is already up to date based on content-hash, not importing")
          (let [title (get-title entry)]
            (when (and title
                       (not (str/blank? title))
                       (not (str/blank? (sanitize-headline title)))
                       (>= (count (get-text entry)) 120))
              (info "starting import of existing entry" title)
              (let [ramp-results (call-ramp entry config)]
                (post-form content-node-path
                           {:jcr:title (sanitize-title title)
                            :cq:tags (into [] (get ramp-results "cq:tags" ""))
                            :topiccenterdefault (into [] (get ramp-results "topiccenterdefault" ""))
                            :pw:hash (content-hash entry)}))
              (post-form taggable-text-node-path
                         {:text (taggable-text branding entry)})
              (post-form header-image-node-path
                         {:jcr:description (photo-caption entry)
                          :alt (photo-caption entry)})
              (post-form article-node-path
                         {:synopsis (synopsis entry)})
              (doseq [image-type [:header :thumbnail]]
                (post-article-image image-type path entry config))
              (activate path config)
              (swap! import-count inc)))))
      (catch Exception e (error e)))))
(defn find-existing-news
  "Queries the author instance for nodes under /content that carry the
  pw:beacon of this month, last month and the month before last, and returns
  a map of pw:apid -> {:hash <pw:hash> :path <jcr:path>}. When the same id
  appears more than once, the result from the older month wins, as before.

  Changes from the previous version: each query URL is logged before the
  request is made, so a failing query can be identified; the current time is
  sampled once, so the three queries cannot straddle a month boundary; and
  results without a pw:apid are logged and skipped instead of being indexed
  under a nil key."
  [{:keys [author-host auth base-path]}]
  (let [format-string "%s/etc.query.json?statement=/jcr:root/content//*[@pw:beacon='%s']&property=pw:hash&property=pw:apid&rows=10000"
        today (now)
        months [["this month" today]
                ["last month" (add-months today -1)]
                ["month before last" (add-months today -2)]]
        fetch-month (fn [[label date]]
                      (let [url (format format-string author-host (beacon base-path date))]
                        (info (str "checking for existing articles from " label " at ") url)
                        (read-json (:body (client/get url {:basic-auth auth})))))
        query-results (doall (mapcat fetch-month months))]
    (reduce
      (fn [found {:keys [^String pw:hash ^String pw:apid ^String jcr:path]}]
        (cond
          (not (.startsWith jcr:path "/content")) found
          (nil? pw:apid) (do (info "existing article without id " jcr:path)
                             found)
          :else (assoc found pw:apid {:hash pw:hash :path jcr:path})))
      {}
      query-results)))
(defn parent-path
  "Returns everything in `s` before its last \"/\". Returns \"\" when `s`
  contains no slash; previously that case threw
  StringIndexOutOfBoundsException."
  [s]
  (let [slash (.lastIndexOf ^String s (int \/))]
    (if (neg? slash)
      ""
      (subs s 0 slash))))
(defn label
  "Returns everything in `s` after its last \"/\", or the whole string when
  it contains no slash."
  [s]
  (let [slash (.lastIndexOf s (int \/))
        start (inc slash)]
    (subs s start)))
(defn write-ap-news-pages
  "Runs one import of the AP feed at `url` into the content tree rooted at
  (:base-path config).

  Steps: parse the feed; look up articles that already exist; make sure the
  year, month and day folders for `time-of-import` exist; then, for each AP
  management id in the feed, take the version with the highest sequence
  number and either update the existing page or import a new one into the
  day folder. If at least one article was imported or updated, the feed's
  sequenceNumber / minDateTime are stored for the next run.

  If the existing-article lookup fails, the import still continues with every
  entry treated as new, as before, but the failure is now logged at error
  level instead of being swallowed silently. The misspelled \"minDateTme\"
  log message is also corrected."
  [url branding time-of-import config]
  (let [path (:base-path config)
        feed (parse-feed url)
        entries (get-entries feed)
        ;; Only referenced by the disabled cleanup calls further down.
        ninety-days-earlier (add-days time-of-import -90)
        import-count (atom 0)
        existing-news-map (try (find-existing-news config)
                               (catch Exception e
                                 (error e "could not look up existing articles; treating every entry as new")
                                 {}))
        ap-ids (map management-id entries)
        new-ids (remove existing-news-map ap-ids)
        grouped-entries (group-by management-id entries)]
    (info "url=" url)
    (info "existing count = " (count existing-news-map))
    (info "new count = " (count new-ids))
    (create-if-needed path
                      (year time-of-import)
                      (year time-of-import)
                      true
                      config)
    (create-if-needed (year-path path time-of-import)
                      (month time-of-import)
                      (month-name time-of-import)
                      false
                      config)
    (create-if-needed (month-path path time-of-import)
                      (day time-of-import)
                      (day time-of-import)
                      false
                      config)
    ;(delete-from-sling (day-path path ninety-days-earlier) config)
    ;(delete-from-sling (month-path path (add-months ninety-days-earlier -1)) config)
    ;(delete-from-sling (year-path path (add-years ninety-days-earlier -1)) config)
    (doseq [[ap-id grouped-entry] grouped-entries
            :let [sequence-number (apply max (map management-seq grouped-entry))
                  ;; Latest version of this article within the feed.
                  entry (first (filter #(= sequence-number (management-seq %)) grouped-entry))]]
      (info "ap-id=" ap-id)
      (info "versions=" (count grouped-entry))
      (info "entry=" (management-id entry) (management-seq entry))
      (if-let [existing (existing-news-map ap-id)]
        (do (import-existing-entry time-of-import existing entry branding import-count config)
            (info "updated existing entry at " (:path existing) (:hash existing) (content-hash entry)))
        (import-entry time-of-import (day-path path time-of-import) entry branding import-count config)))
    (info "updating sequenceNumber and minDateTime")
    (if (> @import-count 0)
      (update-seq-and-since path
                            (sequencing feed "sequenceNumber" "Id")
                            (sequencing feed "minDateTime" "Value")
                            config)
      (info "no news is good news"))
    (info "ap news import complete")))
(defn write-ap-news-pages-since-last
  "Runs an import that continues from where the previous one stopped.

  Builds the config from the importer's state plus :base-path, :ramp-site
  and :auto-activate?, then reads the stored position from
  `<path>/jcr:content/last-import.json`. When a sequence number is stored,
  the feed URL is cleaned with `remove-seq-and-since` and given
  sequenceNumber / minDateTime parameters; if reading the position fails,
  the URL is used as supplied. `&ConsumerReady=TRUE` is always appended
  before `write-ap-news-pages` is called."
  [this path url ramp-site branding time-of-import]
  (doseq [[prefix value] [["the apNewsWriter=" this]
                          ["path=" path]
                          ["url=" url]
                          ["ramp-site=" ramp-site]
                          ["branding=" branding]]]
    (info prefix value))
  (let [config (assoc (.state this)
                      :base-path path
                      :ramp-site ramp-site
                      :auto-activate? true)
        last-import (try
                      (author-json (str path "/jcr:content/last-import.json") config)
                      (catch Exception e
                        {:sequence-number nil :since nil}))
        {:keys [sequence-number since]} last-import
        base-url (if sequence-number
                   (str (remove-seq-and-since url)
                        "&sequenceNumber=" sequence-number
                        "&minDateTime=" since)
                   url)
        feed-url (str base-url "&ConsumerReady=TRUE")]
    (write-ap-news-pages feed-url branding time-of-import config)))
(defn -writeAPNewsPagesMap
  "Implementation of the generated writeAPNewsPagesMap(java.util.Map) method.
  Logs the map, reads \"path\", \"feedUrl\", \"rampSite\" and \"branding\"
  from it and starts an import timestamped with `(now)`."
  [this m]
  (info "m=" m)
  (let [header (fn [k] (get m k))]
    (write-ap-news-pages-since-last this
                                    (header "path")
                                    (header "feedUrl")
                                    (header "rampSite")
                                    (header "branding")
                                    (now))))
(defn -writeAPNewsPages
  "Implementation of the generated writeAPNewsPages method: starts an import
  for the given path, feed URL, ramp site and branding, timestamped with
  `(now)`."
  [this path url ramp-site branding]
  (let [time-of-import (now)]
    (write-ap-news-pages-since-last this path url ramp-site branding time-of-import)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment