-
-
Save ricardojmendez/ea7ebbe273cf6f9bcddb to your computer and use it in GitHub Desktop.
Work-in-progress sketch for importing a batched set of pages.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(defn import-page-set!
  "Import a set of pages. page-set is expected to be a collection of
  hashmaps containing a :url and potentially a :provenance.

  Duplicate entries in page-set are dropped before loading. Every page is
  loaded, split into successfully loaded pages and pages with errors, and
  the resulting node creations, node updates and link relationships are
  sent to the database through conn as a single batch.

  Returns the set of all codes involved in the import: the imported pages,
  their redirects, the pages that failed to load, and the pages linked
  from the imported ones."
  [conn page-set]
  (let [with-provenance  (->> page-set set (map add-default-provenance))
        all-loaded       (map load-resource-from-map with-provenance)
        ; group-by yields a map keyed by the value of :hasError (true / false)
        all-grouped      (group-by :hasError all-loaded)
        no-errors        (all-grouped false)
        no-errors-meta   (map #(hash-map :origin %
                                         :node (node-data-from-meta %))
                              no-errors)
        ; Deal with errors
        ; A page having an error invalidates all existing data, and we will replace it
        ; with the meta we can get from the url.
        with-errors      (all-grouped true)
        errors-meta      (map #(hash-map :node (node-data-from-url (:provenance %) %)
                                         :code (b/code-from-url (:provenance %)))
                              with-errors)
        ; Figure out which URLs are redirects: the url we ended up with on the
        ; node differs from the provenance we asked for.
        redirects        (filter #(not= (get-in % [:origin :provenance])
                                        (get-in % [:node :url]))
                                 no-errors-meta)
        redirect-urls    (map #(get-in % [:origin :provenance]) redirects)
        redirect-meta    (map #(hash-map :node (node-data-from-url % {:isRedirect true})
                                         :code (b/code-from-url %))
                              redirect-urls)
        ; Attach the links to the parsed pages
        with-links       (map #(assoc %
                                 :links (get-wiki-links (get-in % [:origin :res])
                                                        (get-in % [:node :host]))
                                 :code (get-in % [:node :code]))
                              no-errors-meta)
        link-url-set     (-> (map :links with-links) flatten distinct)
        all-link-meta    (map #(hash-map :code (b/code-from-url %)
                                         :node (node-data-from-url %))
                              link-url-set)
        import-code-set  (concat
                           (map #(get-in % [:node :code]) no-errors-meta)
                           (map #(get-in % [:node :code]) redirect-meta)
                           (map #(get-in % [:node :code]) errors-meta))
        ; Remove the codes we already know from the import set
        used-link-meta   (remove #(b/in-seq? import-code-set (:code %)) all-link-meta)
        link-code-set    (map :code used-link-meta)
        all-codes        (-> (concat import-code-set link-code-set) set)
        existing         (db/query-for-codes conn all-codes)
        ids-by-code      (db/map-by-code existing)
        existing-codes   (keys ids-by-code)
        ; difference is only specified for set arguments and keys returns a
        ; seq (or nil), so coerce before subtracting.
        to-create        (difference all-codes (set existing-codes))
        ; Get the batch operation data
        ; First, let's get the nodes we need to create. We filter against to-create,
        ; which will initially be the larger set but will diminish in size the more
        ; pages we import.
        to-create-link   (into [] (r/filter #(b/in-seq? to-create (:code %)) used-link-meta))
        ; import-groups: true -> needs to be created, false -> already exists
        import-groups    (group-by #(b/in-seq? to-create (:code %))
                                   (concat with-links redirect-meta errors-meta))
        to-create-import (set (import-groups true))
        ; A keyword lookup is too cheap to benefit from pmap, so plain map is used.
        all-create-nodes (->> (concat to-create-link to-create-import) (map :node))
        ; Get the relationships to create
        all-rels         (-> (pmap map-link-pairs with-links) flatten)
        ; Build the batch operations.
        ; NOTE(review): an earlier comment here mentioned a doall on the batch to
        ; keep materialization out of the profiled create cost, but no doall is
        ; applied to these values - confirm whether db/batch-*-ops already return
        ; realized collections.
        create-batch     (db/batch-create-ops all-create-nodes)
        rel-batch        (db/batch-rel-ops all-rels)
        ; Get the nodes to update. We get the database ids on the query above
        update-nodes     (map :node (import-groups false))
        update-batch     (db/batch-update-ops
                           (map #(hash-map :id (get-in ids-by-code [(:code %) :id])
                                           :node %)
                                update-nodes))
        ; ... aaand create
        created          (db/execute-batch conn (concat create-batch update-batch rel-batch))]
    ; Force the result of the batch execution before returning, in case it is lazy.
    (doall created)
    all-codes))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment