Created
October 28, 2013 21:57
-
-
Save jdorrance/7205540 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns com.pennwell.wcm.services.importers.APNewsBulkImporter | |
(:import [com.pennwell.wcm.services.taxonomy.bulk BulkTaggingService] | |
[com.pennwell.wcm.services.importers APNewsBulkImporter] | |
[java.io File] | |
[java.net URL] | |
[javax.imageio ImageIO]) | |
(:use [clojure.data.json :only [read-json json-str]] | |
[clojure.pprint :only [pprint]] | |
[clojure.tools.logging :only (info error debug warn)] | |
[com.pennwell.wcm.services.importers.core :only [now | |
format-date | |
add-minutes | |
add-hours | |
add-days | |
add-months | |
add-years | |
year | |
month | |
day | |
year-path | |
month-path | |
day-path | |
month-name | |
beacon | |
exists? | |
activate | |
post-to-sling | |
create-if-needed | |
sanitize-headline | |
sanitize-title | |
delete-from-sling | |
author-json]]) | |
(:require [clj-http [client :as client]] | |
[clojure [string :as str] [data :as data]] | |
[clojure.data [xml :as xml]]) | |
(:gen-class | |
:methods | |
[["writeAPNewsPages" [ | |
^{org.apache.camel.Header "path"} java.lang.String | |
^{org.apache.camel.Header "feedUrl"} java.lang.String | |
^{org.apache.camel.Header "rampSite"} java.lang.String | |
^{org.apache.camel.Header "branding"} java.lang.String | |
] | |
void] | |
["writeAPNewsPagesMap" [ java.util.Map ] void]] | |
:init init | |
:state state | |
:constructors {[Object Object Object] []})) | |
;; REPL scratch pad, never evaluated because it is wrapped in `comment`.
;; It looks up the "apWireNewsImporter" bean, pulls the tagging service out of
;; its state, tags a sample paragraph, and posts a local PNG to a headerimage
;; node on a localhost:4502 instance.
;; NOTE(review): contains developer-specific file paths and admin/admin
;; credentials for a local instance.
(comment | |
(def me (.getBean com.pennwell.spring/*context* "apWireNewsImporter")) | |
(def tigger (:tagging-service (.state me))) | |
(.tag tigger "<p>this is a camera infrared test of turnkey vision systems</p>" "vsd" 1) | |
(client/post | |
"http://localhost:4502/content/pennwellqa/en/infinite-madness/2013/08/02/eu-raids-telecoms-firms-in-internet-probe/jcr:content/leftcolumn/article/headerimage" | |
{:multipart [{:name "./file" :content (clojure.java.io/file "/Volumes/UserData/Users/wwood/Desktop/illusion.png")} | |
{:name "_charset_" :content "utf-8"}] :basic-auth ["admin" "admin"]}) | |
) | |
(defn -init
  "gen-class :init hook. Takes the three constructor arguments (author host,
  a map holding \"username\" and \"password\" string keys, and the tagging
  service) and returns [superclass-ctor-args state], where the state map keeps
  :author-host, :auth ([username password]) and :tagging-service."
  [author-host {:strs [username password]} tagging-service]
  (let [state {:author-host author-host
               :auth [username password]
               :tagging-service tagging-service}]
    [[] state]))
(defn remove-seq-and-since
  "Trims `url` and strips every `&ConsumerReady=...`, `&sequenceNumber=...`
  and `&minDateTime=...` query parameter from it. Only parameters introduced
  by `&` are matched; one that directly follows `?` is left alone."
  [url]
  (-> url
      str/trim
      (str/replace #"&(?:ConsumerReady|sequenceNumber|minDateTime)=[^&]*" "")))
(declare photo-caption get-text byline get-title) | |
(defn content-hash
  "Returns the SHA-1 hash, as a lowercase hex string, of the printed vector
  [photo-caption text byline title] of `item`. The hash is stored with the
  article on import (as pw:hash) and compared later to decide whether the
  feed content has changed since the article was imported.

  The string is encoded as UTF-8 explicitly so the hash does not depend on
  the JVM's default charset."
  [item]
  (let [s (str ((juxt photo-caption get-text byline get-title) item))
        digest (.digest (java.security.MessageDigest/getInstance "sha1")
                        (.getBytes ^String s "UTF-8"))]
    ;; Mask each signed byte to 0..255 and print it as two hex digits.
    (apply str (map #(format "%02x" (bit-and % 0xff)) digest))))
(defn up-to-date?
  "Returns true when the :pw:hash read from `<path>/jcr:content.json` (via
  `author-json`) equals the freshly computed `content-hash` of `item`.

  Returns false on any exception. Exceptions whose message contains \"404\"
  are not logged (presumably the page does not exist yet -- confirm against
  `author-json`); all others are logged as errors."
  [item path {:keys [author-host auth] :as config}]
  (try
    (let [current (content-hash item)
          ;; `str` guards against a nil/non-string stored value, which would
          ;; otherwise surface as a logged NullPointerException.
          stored (str/trim (str (get (author-json (str path "/jcr:content.json") config)
                                     :pw:hash
                                     "")))]
      (info "comparing hashes" current stored)
      (= current stored))
    (catch Exception e
      (when-not (and (.getMessage e) (.contains (.getMessage e) "404"))
        (error e))
      false)))
(defn management-id
  "Returns the value under :NewsManagement -> :ManagementId of `entry`,
  or nil when either level is missing."
  [entry]
  (get-in entry [:NewsManagement :ManagementId]))
(defn management-seq
  "Returns :NewsManagement -> :ManagementSequenceNumber of `entry`, converted
  with Integer/valueOf."
  [entry]
  (Integer/valueOf (get-in entry [:NewsManagement :ManagementSequenceNumber])))
(defn pub-status
  "Returns :NewsManagement -> :PublishingStatus of `entry`:
  Usable, Embargoed, Withheld or Canceled."
  [entry]
  (-> entry :NewsManagement :PublishingStatus))
(defn pubdate
  "Parses :NewsManagement -> :PublishingReleaseDateTime of `entry` into a
  java.util.Date using the pattern yyyy-MM-dd'T'HH:mm:ssZ, after replacing
  every \"Z\" in the raw value with \"-0000\". If anything goes wrong the
  raw value is logged and `(now)` is returned instead."
  [entry]
  (let [raw (get-in entry [:NewsManagement :PublishingReleaseDateTime])]
    (try
      (let [parser (java.text.SimpleDateFormat. "yyyy-MM-dd'T'HH:mm:ssZ")
            normalized (str/replace raw "Z" "-0000")]
        (.parse parser normalized))
      (catch Exception e
        (info raw "does not match format" "yyyy-MM-dd'T'HH:mm:ss")
        (now)))))
(defn distinct-keys?
  "Truthy when `maps` is a non-empty collection of maps in which no key
  occurs more than once across all of them. Returns nil for an empty
  collection and false when any element is not a map.

  A collection whose maps are all empty has no keys at all and is treated as
  distinct (previously this case threw an ArityException from
  `(apply distinct?)` with no arguments)."
  [maps]
  (and
    (seq maps)
    (every? map? maps)
    (let [ks (mapcat keys maps)]
      (or (empty? ks)
          (apply distinct? ks)))))
(defn indistinct-keys?
  "Truthy when `maps` is a non-empty collection of maps in which at least one
  key occurs more than once across all of them. Returns a falsey value for an
  empty collection, for non-map elements, and when all keys are distinct.

  A collection whose maps are all empty has no repeated key and yields a
  falsey result (previously this case threw an ArityException from calling
  `distinct?` with no arguments)."
  [maps]
  (and
    (seq maps)
    (every? map? maps)
    (let [ks (mapcat keys maps)]
      (and (seq ks)
           (not (apply distinct? ks))))))
(defn maybe
  "Wraps `f` in a one-argument function that returns nil for a nil or false
  argument and otherwise returns (f x)."
  [f]
  (fn [x]
    (if x
      (f x)
      nil)))
(defn comp?
  "Like `comp`, but every function in `fs` is first wrapped with `maybe`, so
  a nil/false intermediate value is passed along as nil instead of being
  handed to the next function."
  [& fs]
  (->> fs
       (map maybe)
       (apply comp)))
;; Converts JDOM nodes into plain Clojure data.
(defprotocol JDom
  (from-jdom [e]))

(extend-protocol JDom
  ;; An element becomes a single-entry map {tag value}.
  org.jdom.Element
  (from-jdom [element]
    (let [tag (keyword (.getName element))
          ;; Converted children, with nil results (blank text) dropped.
          children (filter identity (seq (map from-jdom (.getContent element))))
          attrs (into {} (map from-jdom (.getAttributes element)))
          ;; Every test below looks at the original `children`; the steps are
          ;; applied cumulatively to the threaded value:
          ;;  - attributes present          -> attribute map is put in front
          ;;  - no attributes, all strings  -> children joined into one string
          ;;  - child maps, distinct keys   -> merged into one map
          ;;  - child maps, repeated keys   -> flattened to a seq of map entries
          value (cond->> children
                  (seq attrs) (cons attrs)
                  (and (not (seq attrs)) (every? string? children)) str/join
                  (distinct-keys? children) (into {})
                  (indistinct-keys? children) (mapcat seq))]
      {tag value}))

  ;; An attribute becomes a [name value] pair.
  org.jdom.Attribute
  (from-jdom [attribute]
    [(.getName attribute) (.getValue attribute)])

  ;; Text becomes its string, or nil when blank.
  org.jdom.Text
  (from-jdom [text]
    (let [s (.getText text)]
      (when-not (str/blank? s) s)))

  ;; CDATA is handled the same way as Text.
  org.jdom.CDATA
  (from-jdom [cdata]
    (let [s (.getText cdata)]
      (when-not (str/blank? s) s)))

  ;; Anything else is stringified.
  java.lang.Object
  (from-jdom [other]
    (str other))

  ;; A document converts as its root element.
  org.jdom.Document
  (from-jdom [document]
    (from-jdom (.getRootElement document))))
(defn fetch-feed
  "GETs `url` with HTTP basic auth and returns the response body.

  `basic-auth` is a [username password] vector. The one-argument arity keeps
  the original behaviour and uses the built-in feed account.

  NOTE(review): the default credentials are hard-coded in source. Callers
  that can obtain them from configuration should pass them explicitly, and
  the literal below should eventually be moved out of the code base."
  ([url]
   (fetch-feed url ["penwell_webfeeds" "@Pwf$61614"]))
  ([url basic-auth]
   (:body (client/get url {:basic-auth basic-auth}))))
(defn check-for
  "Returns a seq of every match of the pattern `.*<goal>.*` in `field`.
  `goal` is spliced into the pattern unescaped, so it is interpreted as a
  regular expression fragment."
  [goal field]
  (let [pattern (re-pattern (str ".*" goal ".*"))]
    (re-seq pattern field)))
(defn print-matches
  "Prints each result of `(check-for goal field)` on its own line.
  Returns nil."
  [goal field]
  (dorun (map println (check-for goal field))))
(defn parse-feed
  "Fetches `url` with `fetch-feed`, parses the body with a JDOM SAXBuilder,
  converts the document with `from-jdom`, and returns the value stored under
  :feed."
  [url]
  (with-open [reader (java.io.StringReader. (fetch-feed url))]
    (let [builder (org.jdom.input.SAXBuilder.)
          document (.build builder reader)]
      (:feed (from-jdom document)))))
(defn select-or-get
  "Looks `k` up in `coll`.

  [k coll]        Returns `(k coll)` when that is truthy. Otherwise treats
                  `coll` as a sequence of key/value entries and returns the
                  concatenation of the values of all entries whose key is `k`.

  [k attrs coll]  Returns `(k coll)` when that is truthy. Otherwise returns,
                  for every map entry in `coll` whose key is `k` and whose
                  value starts with the attribute map `attrs`, the last item
                  of that value.

  The three-argument arity used to compare entry keys against a hard-coded
  :hl2 and ignored `k`; it now honours `k`."
  ([k coll]
   (or
     (k coll)
     (mapcat val (filter (fn [[t & _]] (= t k)) coll))))
  ([k attrs coll]
   (try
     (or
       (k coll)
       (for [x coll
             :when (and (instance? clojure.lang.MapEntry x)
                        (= (first x) k)
                        (= (first (last x)) attrs))]
         (last (last x))))
     ;; Only failures raised while building the result are swallowed here;
     ;; the `for` is lazy, so errors raised when the caller realizes the
     ;; sequence still propagate to the caller.
     (catch Exception e
       nil))))
(defn select-or-first
  "Returns `(k coll)` when that is truthy; otherwise treats `coll` as a
  sequence of key/value entries and returns the value of the first entry
  whose key is `k` (nil when there is none)."
  [k coll]
  (if-let [direct (k coll)]
    direct
    (->> coll
         (filter (fn [[tag & _]] (= tag k)))
         (map val)
         first)))
(defn select
  "Treats `coll` as a sequence of key/value entries and returns a lazy seq of
  the values of every entry whose key equals `k`."
  [k coll]
  (for [[tag & _ :as entry] coll
        :when (= tag k)]
    (val entry)))
(defn photo-id
  "Scans the :media-metadata items under the entry's
  :content/:nitf/:body/:body.content/:media for one whose attributes have
  name \"Role\" and value `role`, splits that item's \"id\" attribute on \":\"
  and returns the second part. Returns nil when nothing matches."
  [role entry]
  (let [metadata (->> entry :content :nitf :body :body.content
                      (select-or-first :media)
                      (select :media-metadata)
                      seq)
        id-parts (for [[{attr-name "name" attr-value "value" raw-id "id"}] metadata
                       :when (= [attr-name attr-value] ["Role" role])]
                   (str/split raw-id #":"))]
    (second (first id-parts))))
(defn photo-info
  "Returns the attribute map of the first :media-reference under the entry's
  :content/:nitf/:body/:body.content/:media whose \"id\" attribute contains
  the photo id that `photo-id` finds for `role`.

  Returns nil when no photo id exists for `role` (previously this threw a
  NullPointerException from `.contains` as soon as any reference carried an
  id) or when no reference matches."
  [role entry]
  ;; Resolve the wanted id once instead of once per reference.
  (when-let [wanted (photo-id role entry)]
    (first
      (for [[{:strs [id] :as reference}]
            (->> entry :content :nitf :body :body.content
                 (select-or-first :media)
                 (select :media-reference)
                 seq)
            :when (and id (.contains id wanted))]
        reference))))
(defn photo-url
  "Returns the \"source\" attribute of the media reference that `photo-info`
  finds for `role`, or nil."
  [role entry]
  (-> (photo-info role entry)
      (get "source")))
(defn photo-type
  "Returns the \"coding\" attribute of the media reference that `photo-info`
  finds for `role`, or nil."
  [role entry]
  (-> (photo-info role entry)
      (get "coding")))
(defn photo-caption
  "Returns the trimmed string form of the :p value of the first
  :media-caption under the entry's :content/:nitf/:body/:body.content/:media.
  Missing data yields \"\"; on an exception the error is logged and \"\" is
  returned."
  [entry]
  (try
    (let [media (->> entry :content :nitf :body :body.content
                     (select-or-first :media))
          caption (first (select :media-caption media))]
      (.trim (str (:p caption))))
    (catch Exception e
      (error e)
      "")))
(defn save-image
  "Downloads the image at `url`, re-encodes it in the format named by `ext`
  (a string or keyword such as \"jpg\") into a fresh temp file named
  apnews-*.<ext>, and returns that File.

  Returns nil, after deleting the temp file, when ImageIO cannot decode the
  source (`ImageIO/read` returned null) or has no writer for the requested
  format (`ImageIO/write` returned false). Exceptions still propagate, but
  the temp file is removed first so it is not leaked."
  [url ext]
  (let [format-name (name ext)
        temp-file (File/createTempFile "apnews-" (str "." format-name))]
    (try
      (let [image (ImageIO/read (URL. url))]
        (if (and image (ImageIO/write image format-name temp-file))
          temp-file
          (do
            (.delete temp-file)
            nil)))
      (catch Exception e
        (.delete temp-file)
        (throw e)))))
(defn render-a
  "Renders the :a value of `m` -- a sequence of [attribute-map text] -- as an
  HTML anchor using the attribute map's \"href\". Values are inserted as-is,
  without escaping."
  [m]
  (let [[{href "href"} link-text] (:a m)]
    (str "<a href='" href "'>" link-text "</a>")))
(defn get-text
  "Returns the article body as one HTML string: every :p found under the
  entry's :content/:nitf/:body/:body.content/:block is wrapped in <p>...</p>
  and the results are concatenated.

  A string paragraph is used as-is. A sequential paragraph is rendered piece
  by piece (strings kept, items with an :a key rendered via `render-a`,
  anything else dropped). Any other paragraph shape is logged and rendered as
  an empty paragraph."
  [entry]
  (letfn [(render-piece [piece]
            (cond
              (string? piece) piece
              (:a piece) (render-a piece)))
          (render-paragraph [p]
            (str "<p>"
                 (cond
                   (string? p) p
                   (sequential? p) (str/join (map render-piece p))
                   :else (do (info "paragraph?" (pr-str p)) ""))
                 "</p>"))]
    (->> entry :content :nitf :body :body.content
         (select-or-get :block)
         (select :p)
         (map render-paragraph)
         str/join)))
(defn byline
  "Concatenates every string found in the :ByLine values of the entry's
  :ContentMetadata into a single string."
  [entry]
  (let [bylines (select :ByLine (:ContentMetadata entry))
        strings (flatten (map #(filter string? %) bylines))]
    (str/join strings)))
(defn get-author-name
  "Returns the entry's byline with a leading \"By \" / \"by \" prefix (and any
  whitespace before it) removed.

  The pattern is anchored to the start of the byline. Unanchored, it removed
  the first \"by \" found anywhere, so a byline without the prefix such as
  \"Bobby Jones\" was turned into \"BobJones\"."
  [entry]
  (str/replace-first (byline entry) #"^\s*[Bb]y " ""))
(defn get-title
  "Returns the headline text of `entry` as a string. Looks at the :hedline
  under :content/:nitf/:body/:body.head, selects :hl2 from it (see the
  three-argument `select-or-get`, called with the attribute map
  {\"id\" \"originalHeadline\"}), keeps only the strings and joins them."
  [entry]
  (let [head (-> entry :content :nitf :body :body.head)
        hedline (or (:hedline head) (select :hedline head))]
    (->> hedline
         (select-or-get :hl2 {"id" "originalHeadline"})
         (filter string?)
         str/join)))
(defn call-ramp
  "Sends the article text of `item` to the tagging service for `ramp-site`
  and returns the result copied into a Clojure map.

  Tagging is best-effort: on any exception an empty map is returned. The
  exception is now passed to the logger so the cause of a failure is visible
  (previously only the site name was logged)."
  [item {:keys [tagging-service ramp-site] :as config}]
  (try
    (let [result (into {} (.tag tagging-service (get-text item) ramp-site 1))]
      (info "ramp tagging succeeded" ramp-site (get-title item) result)
      result)
    (catch Exception e
      (info e "ramp tagging failed" ramp-site)
      {})))
(defn get-entries
  "Returns every :entry of `feed`, each poured into a map."
  [feed]
  (for [entry (select :entry feed)]
    (into {} entry)))
;; Example AP feed URL. It is not referenced anywhere else in this file.
(def sample-url | |
"http://syndication.ap.org/AP.Distro.Feed/GetFeed.aspx?idList=31995,32008,32005,32003&idListType=products&maxItems=25&showInlineLinks=true&fullcontent=nitf") | |
;; not sure what to use for synopsis... just using first paragraph of text for now. | |
;; Searching the AP Webfeeds manual for "synopsis" reveals that they just use synopsis as synonym for headline. | |
(defn synopsis
  "Returns the first :p of the entry's body wrapped in <p>...</p>, or \"\"
  when the body has no paragraphs. A nil/false paragraph is rendered as an
  empty <p></p>."
  [entry]
  (let [paragraphs (->> entry :content :nitf :body :body.content
                        (select-or-get :block)
                        (select :p))]
    (str/join
      (for [p (take 1 paragraphs)]
        (str "<p>" (or p "") "</p>")))))
(defn taggable-text
  "Builds the stored article text: the byline in its own paragraph, a single
  space, the body HTML from `get-text`, then `branding`."
  [branding record]
  (let [byline-paragraph (str "<p>" (byline record) "</p>")
        body (get-text record)]
    (str byline-paragraph " " body branding)))
(defn ap-article-xml-to-cq-json
  "Builds the nested map describing the cq:Page for one AP `record`.

  `ramp-results` supplies the \"cq:tags\" and \"topiccenterdefault\" values,
  `branding` is appended to the stored text, and `beacon` is stored as
  :pw:beacon. :offTime is the publication date plus 90 days, 12 hours and a
  random offset between -30 and +29 minutes. When the record has a non-blank
  author name, :authorname and :showAuthor are added to the article node."
  [record ramp-results branding beacon]
  (let [unstructured "nt:unstructured"
        publication-date (pubdate record)
        tags (into [] (get ramp-results "cq:tags" ""))
        off-time (format-date (-> publication-date
                                  (add-days 90)
                                  (add-hours 12)
                                  (add-minutes (- (rand-int 60) 30))))
        digest (content-hash record)
        ap-id (management-id record)
        article {:synopsis (synopsis record)
                 :taggabletext {:text (taggable-text branding record)
                                :jcr:primaryType unstructured}
                 :jcr:primaryType unstructured
                 :headerimage {:enlarge "false"
                               :alt (photo-caption record)
                               :imageRotate "0"
                               :jcr:primaryType unstructured
                               :jcr:description (photo-caption record)
                               :sling:resourceType "pwwcm/components/content/common/images/clicktoenlargeimage"}
                 :thumbnailimage {:imageRotate "0"
                                  :jcr:primaryType unstructured
                                  :sling:resourceType "foundation/components/image"}
                 :ratings {:sling:resourceType "collab/commons/components/ratings"
                           :jcr:primaryType unstructured}}
        left-column {:sling:resourceType "foundation/components/parsys"
                     :jcr:primaryType unstructured
                     :article article
                     :comments {:sling:resourceType "collab/commons/components/comments"
                                :jcr:primaryType unstructured}}
        page-content {:cq:tags tags
                      :pw:domain "article"
                      :offTime off-time
                      :categoryThreshold "1"
                      :sling:resourceType "pwwcm/components/page/article"
                      :jcr:primaryType "cq:PageContent"
                      :cq:template "/apps/pwwcm/templates/article"
                      :pw:hash digest
                      :pw:beacon beacon
                      :pw:apid ap-id
                      :leftcolumn left-column
                      :pw:publicationDate (format-date publication-date)
                      :jcr:mixinTypes ["mix:versionable"]
                      :jcr:title (sanitize-title (get-title record))
                      :topiccenterdefault (into [] (get ramp-results "topiccenterdefault" ""))}
        page {:jcr:primaryType "cq:Page"
              :jcr:content page-content}
        author-name (get-author-name record)]
    (cond-> page
      (not (str/blank? author-name))
      (update-in [:jcr:content :leftcolumn :article]
                 assoc :authorname author-name :showAuthor true))))
(defn update-seq-and-since
  "Posts a \"last-import\" node under `<path>/jcr:content` holding `since`
  and `sequence-number`, via `post-to-sling`."
  [path sequence-number since config]
  (let [parent (str path "/jcr:content")
        properties {:since since
                    :sequence-number sequence-number
                    :jcr:primaryType "nt:unstructured"}]
    (post-to-sling parent "last-import" properties identity config)))
(defn sequencing
  "Digs through the nested :Property items of `feed`: takes the first
  top-level :Property, finds among its :Property children the first whose
  leading item equals [\"Name\" \"FeedSequencing\"], and among that one's
  :Property children returns `value-name` from the first whose leading item
  has \"Name\" equal to `key-name`. Returns nil when nothing matches."
  [feed key-name value-name]
  (let [feed-sequencing? (fn [[y]] (= y ["Name" "FeedSequencing"]))
        candidates (->> feed
                        (select :Property)
                        first
                        (select :Property)
                        (filter feed-sequencing?)
                        first
                        (select :Property))
        matching (for [[x] candidates
                       :when (= (get x "Name") key-name)]
                   x)]
    (get (first matching) value-name)))
;; REPL scratch, never evaluated because it is wrapped in `comment`: posts a
;; previously downloaded temp JPEG to a headerimage node on a localhost:4502
;; instance.
;; NOTE(review): contains a developer-specific file path and admin/admin
;; credentials for a local instance.
(comment (client/post | |
"http://localhost:4502/content/pennwellqa/en/infinite-madness/2013/08/02/eu-raids-telecoms-firms-in-internet-probe/jcr:content/leftcolumn/article/headerimage" | |
{:multipart [{:name "./file" :content (clojure.java.io/file "/Volumes/UserData/Users/wwood/Downloads/apache-tomcat-7.0.37/temp/apnews-7813838245316703544.jpg")} | |
{:name "_charset_" :content "utf-8"}] :basic-auth ["admin" "admin"]})) | |
(defn post-article-image
  "Uploads the :header or :thumbnail image of `entry` to the page at `path`.

  `image-type` selects both the AP role (\"Preview\" / \"Thumbnail\") and the
  target node under the page. Nothing happens unless the entry provides a
  photo URL and type for that role and the image can be saved locally. The
  upload is skipped when the node's stored :pw:apPhotoId already equals the
  entry's photo id; otherwise the file is posted, followed by the new photo
  id. Errors while posting are logged, and the temp file is always deleted."
  [image-type path entry config]
  (let [ap-role (image-type {:header "Preview" :thumbnail "Thumbnail"})
        node-suffix (image-type {:header "/jcr:content/leftcolumn/article/headerimage"
                                 :thumbnail "/jcr:content/leftcolumn/article/thumbnailimage"})]
    (when-let [source-url (photo-url ap-role entry)]
      (when-let [image-format (photo-type ap-role entry)]
        (when-let [downloaded (save-image source-url image-format)]
          (try
            (let [target (str (:author-host config) path node-suffix)
                  new-id (str/trim (str (photo-id ap-role entry)))
                  stored-id (str/trim
                              (str (:pw:apPhotoId
                                     (author-json (str path node-suffix ".0.json") config))))]
              (if (not= new-id stored-id)
                (do
                  (info "posting" (name image-type) "image" downloaded "to" target)
                  (client/post target
                               {:multipart [{:name "./file" :content downloaded}
                                            {:name "_charset_" :content "utf-8"}]
                                :basic-auth (:auth config)})
                  (info "posting photo id" new-id)
                  (client/post target
                               {:form-params {:pw:apPhotoId new-id}
                                :basic-auth (:auth config)}))
                (info "photo unchanged, not reposting")))
            (catch Exception e
              (error e))
            (finally
              (.delete downloaded))))))))
(defn import-entry
  "Imports `entry` as a new page under `path`.

  The entry is skipped when its title or sanitized headline is blank, or when
  its body text is shorter than 120 characters. Otherwise the page is posted
  to Sling under the sanitized headline, the header and thumbnail images are
  uploaded, the page is activated, and the `import-count` atom is
  incremented. Any exception is logged and swallowed.

  The tagging service is only called for entries that pass validation
  (matching `import-existing-entry`), and the sanitized headline is computed
  once."
  [time-of-import path entry branding import-count {:keys [base-path] :as config}]
  (try
    (let [title (get-title entry)
          slug (when-not (str/blank? title) (sanitize-headline title))]
      (when-not (or (str/blank? slug)
                    (< (count (get-text entry)) 120))
        (info "starting import of" title)
        (let [ramp-results (call-ramp entry config)
              page-path (str path "/" slug)]
          (post-to-sling
            path
            slug
            (ap-article-xml-to-cq-json entry ramp-results branding (beacon base-path time-of-import))
            identity
            config)
          (doseq [image-type [:header :thumbnail]]
            (post-article-image image-type page-path entry config))
          (activate page-path config)
          (swap! import-count inc))))
    (catch Exception e (error e))))
(defn import-existing-entry
  "Refreshes the already-imported page described by `existing` (a map with
  the :path of its jcr:content node) from `entry`.

  Nothing is written when the stored hash shows the page is up to date, or
  when the entry's title / sanitized headline is blank or its body text is
  shorter than 120 characters. Otherwise the page properties, taggable text,
  header image caption and synopsis are re-posted, both images are
  re-uploaded, the page is activated and the `import-count` atom is
  incremented. Any exception is logged and swallowed."
  [time-of-import existing entry branding import-count {:keys [author-host auth] :as config}]
  (try
    (let [content-node-path (:path existing)
          path (str/replace-first content-node-path "/jcr:content" "")
          article-node-path (str content-node-path "/leftcolumn/article")
          taggable-text-node-path (str content-node-path "/leftcolumn/article/taggabletext")
          header-image-node-path (str content-node-path "/leftcolumn/article/headerimage")
          ;; Posts one set of form fields to a node on the author host.
          post-form (fn [node-path params]
                      (client/post (str author-host node-path)
                                   {:basic-auth auth
                                    :form-params params}))]
      (info "checking hash")
      (if-not (up-to-date? entry path config)
        (when-let [title (get-title entry)]
          (when-not (or (str/blank? title)
                        (str/blank? (sanitize-headline title))
                        (< (count (get-text entry)) 120))
            (info "starting import of existing entry" title)
            (let [ramp-results (call-ramp entry config)]
              (post-form content-node-path
                         {:jcr:title (sanitize-title (get-title entry))
                          :cq:tags (into [] (get ramp-results "cq:tags" ""))
                          :topiccenterdefault (into [] (get ramp-results "topiccenterdefault" ""))
                          :pw:hash (content-hash entry)
                          "_charset_" "utf-8"}))
            (post-form taggable-text-node-path
                       {:text (taggable-text branding entry)
                        "_charset_" "utf-8"})
            (post-form header-image-node-path
                       {:jcr:description (photo-caption entry)
                        :alt (photo-caption entry)
                        "_charset_" "utf-8"})
            (post-form article-node-path
                       {:synopsis (synopsis entry)
                        "_charset_" "utf-8"})
            (doseq [image-type [:header :thumbnail]]
              (post-article-image image-type path entry config))
            (activate path config)
            (swap! import-count inc)))
        (info path "is already up to date based on content-hash, not importing")))
    (catch Exception e
      (error e))))
(defn find-existing-news
  "Queries the author host for pages carrying the beacon of the current
  month, the previous month and the month before that, and returns a map of
  pw:apid -> {:hash pw:hash :path jcr:path} for every hit whose path starts
  with \"/content\". Hits without a pw:apid are logged and end up under the
  nil key."
  [{:keys [author-host auth base-path]}]
  (let [format-string "%s/etc.query.json?statement=/jcr:root/content//*[@pw:beacon='%s']&property=pw:hash&property=pw:apid&rows=10000"
        query-url (fn [date]
                    (format format-string author-host (beacon base-path date)))
        fetch-rows (fn [url]
                     (read-json (:body (client/get url {:basic-auth auth}))))
        this-month-url (query-url (now))
        last-month-url (query-url (add-months (now) -1))
        month-before-last-url (query-url (add-months (now) -2))
        ;; mapv keeps the three requests eager and in order.
        query-results (apply concat
                             (mapv fetch-rows
                                   [this-month-url last-month-url month-before-last-url]))]
    (info "checking for existing articles from this month at " this-month-url)
    (info "checking for existing articles from last month at " last-month-url)
    (info "checking for existing articles from month before last at " month-before-last-url)
    (into {}
          (for [{apid :pw:apid stored-hash :pw:hash ^String node-path :jcr:path} query-results
                :when (.startsWith node-path "/content")]
            (do
              (when-not apid
                (info "existing article without id " node-path))
              [apid {:hash stored-hash :path node-path}])))))
(defn parent-path
  "Returns everything in `s` before its last \"/\". Throws when `s` contains
  no slash."
  [s]
  (let [cut (.lastIndexOf ^String s "/")]
    (subs s 0 cut)))
(defn label
  "Returns everything in `s` after its last \"/\"; the whole string when it
  contains no slash."
  [s]
  (let [start (inc (.lastIndexOf ^String s "/"))]
    (subs s start)))
(defn write-ap-news-pages
  "Runs one import pass over the feed at `url`.

  Parses the feed, looks up already-imported articles, makes sure the
  year/month/day pages for `time-of-import` exist under the configured
  :base-path, and then, for each AP management id, imports the version with
  the highest sequence number -- as an update when the id is already known,
  otherwise as a new page under the day path. When at least one article was
  written, the feed's sequence number and minDateTime are stored for the next
  run.

  If the lookup of existing articles fails, the import continues with an
  empty map (every entry is then treated as new); the failure is now logged
  instead of being silently discarded."
  [url branding time-of-import config]
  (let [path (:base-path config)
        feed (parse-feed url)
        entries (get-entries feed)
        ;; Only referenced by the disabled cleanup calls further down.
        ninety-days-earlier (add-days time-of-import -90)
        import-count (atom 0)
        existing-news-map (try
                            (find-existing-news config)
                            (catch Exception e
                              (error e "lookup of existing articles failed; treating all entries as new")
                              {}))
        ap-ids (map management-id entries)
        new-ids (remove existing-news-map ap-ids)
        grouped-entries (group-by management-id entries)]
    (info "url=" url)
    (info "existing count = " (count existing-news-map))
    (info "new count = " (count new-ids))
    (create-if-needed path
                      (year time-of-import)
                      (year time-of-import)
                      true
                      config)
    (create-if-needed (year-path path time-of-import)
                      (month time-of-import)
                      (month-name time-of-import)
                      false
                      config)
    (create-if-needed (month-path path time-of-import)
                      (day time-of-import)
                      (day time-of-import)
                      false
                      config)
    ;(delete-from-sling (day-path path ninety-days-earlier) config)
    ;(delete-from-sling (month-path path (add-months ninety-days-earlier -1)) config)
    ;(delete-from-sling (year-path path (add-years ninety-days-earlier -1)) config)
    (doseq [[ap-id grouped-entry] grouped-entries
            :let [sequence-number (apply max (map management-seq grouped-entry))
                  ;; Newest version of this article in the feed.
                  entry (first (filter #(= sequence-number (management-seq %)) grouped-entry))]]
      (info "ap-id=" ap-id)
      (info "versions=" (count grouped-entry))
      (info "entry=" (management-id entry) (management-seq entry))
      (if-let [existing (existing-news-map ap-id)]
        (do (import-existing-entry time-of-import existing entry branding import-count config)
            (info "updated existing entry at " (:path existing) (:hash existing) (content-hash entry)))
        (import-entry time-of-import (day-path path time-of-import) entry branding import-count config)))
    (info "updating sequenceNumber and minDateTme")
    (if (> @import-count 0)
      (update-seq-and-since path
                            (sequencing feed "sequenceNumber" "Id")
                            (sequencing feed "minDateTime" "Value")
                            config)
      (info "no news is good news"))
    (info "ap news import complete")))
(defn write-ap-news-pages-since-last
  "Logs the arguments, builds the run config from the instance state plus
  :base-path, :ramp-site and :auto-activate?, reads the previous run's
  sequence number and `since` value from `<path>/jcr:content/last-import.json`
  (treated as absent on any error), and runs `write-ap-news-pages`.

  When a sequence number is stored, `url` is first cleaned with
  `remove-seq-and-since` and extended with sequenceNumber and minDateTime
  parameters. `&ConsumerReady=TRUE` is appended in every case."
  [this path url ramp-site branding time-of-import]
  (info "the apNewsWriter=" this)
  (info "path=" path)
  (info "url=" url)
  (info "ramp-site=" ramp-site)
  (info "branding=" branding)
  (let [config (assoc (.state this)
                      :base-path path
                      :ramp-site ramp-site
                      :auto-activate? true)
        {:keys [sequence-number since]}
        (try
          (author-json (str path "/jcr:content/last-import.json") config)
          (catch Exception e
            {:sequence-number nil :since nil}))
        resumed-url (if sequence-number
                      (str (remove-seq-and-since url)
                           "&sequenceNumber=" sequence-number
                           "&minDateTime=" since)
                      url)
        feed-url (str resumed-url "&ConsumerReady=TRUE")]
    (write-ap-news-pages feed-url branding time-of-import config)))
(defn -writeAPNewsPagesMap
  "Implementation of the generated writeAPNewsPagesMap method: logs `m`, then
  reads \"path\", \"feedUrl\", \"rampSite\" and \"branding\" from it and runs
  `write-ap-news-pages-since-last` with the current time."
  [this m]
  (info "m=" m)
  (let [[path feed-url ramp-site branding]
        (map #(get m %) ["path" "feedUrl" "rampSite" "branding"])]
    (write-ap-news-pages-since-last this path feed-url ramp-site branding (now))))
(defn -writeAPNewsPages
  "Implementation of the generated writeAPNewsPages method: runs
  `write-ap-news-pages-since-last` with the given arguments and the current
  time."
  [this path url ramp-site branding]
  (let [time-of-import (now)]
    (write-ap-news-pages-since-last this path url ramp-site branding time-of-import)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment