Last active
December 17, 2015 10:48
-
-
Save marktriggs/5597058 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns marcgrep.destinations.counts | |
(:use marcgrep.protocols | |
clojure.java.io | |
clojure.contrib.map-utils | |
clojure.xml) | |
(:refer-clojure :exclude [next flush]) | |
(:import [org.marc4j.marc Record VariableField DataField ControlField Subfield] | |
[java.io BufferedWriter FileOutputStream])) | |
;;; Turn a Marc4J record into a nested structure containing counts | |
;; Like: | |
;; | |
;; {:datafields {"907" {:count 1, :indicator {1 {" " 1}, 2 {" " 1}}, :subfields {"a" 1, "b" 1}}, | |
;; "260" {:count 1, :indicator {1 {" " 1}, 2 {" " 1}}, :subfields {"6" 1, "a" 1, "b" 1, "c" 1}}, | |
;; "040" {:count 1, :indicator {1 {" " 1}, 2 {" " 1}}, :subfields {"a" 1, "c" 1, "d" 1}}, | |
;; "880" {:count 1, :indicator {1 {" " 1}, 2 {" " 1}}, :subfields {"6" 1, "a" 1, "b" 1, "c" 1}}, | |
;; "066" {:count 1, :indicator {1 {" " 1}, 2 {" " 1}}, :subfields {"c" 1}}, | |
;; "035" {:count 1, :indicator {1 {" " 1}, 2 {" " 1}}, :subfields {"a" 1}}, | |
;; "300" {:count 1, :indicator {1 {" " 1}, 2 {" " 1}}, :subfields {"a" 1, "c" 1}},
;; "245" {:count 1, :indicator {1 {"0" 1}, 2 {"0" 1}}, :subfields {"6" 1, "a" 1, "b" 1, "c" 1}}, | |
;; "246" {:count 1, :indicator {1 {"3" 1}, 2 {"0" 1}}, :subfields {"a" 1}}, | |
;; "984" {:count 1, :indicator {1 {" " 1}, 2 {" " 1}}, :subfields {"a" 1, "i" 1, "c" 1}}, | |
;; "700" {:count 1, :indicator {1 {"1" 1}, 2 {" " 1}}, :subfields {"6" 1, "a" 1}}, | |
;; "800" {:count 1, :indicator {1 {"1" 1}, 2 {" " 1}}, :subfields {"6" 1, "a" 1}}, | |
;; "998" {:count 1, :indicator {1 {" " 1}, 2 {" " 1}}, :subfields {"b" 1, "c" 1, "d" 1, "e" 1, "g" 1}}, | |
;; "019" {:count 1, :indicator {1 {"1" 1}, 2 {" " 1}}, :subfields {"a" 1}}, | |
;; "902" {:count 1, :indicator {1 {" " 1}, 2 {" " 1}}, :subfields {"a" 1}}}, | |
;; :controlfields {"001" 1, "005" 1, "008" 1}, | |
;; :leader {6 {"a" 1}, 7 {"m" 1}},
;; :record-count 1} | |
(defn record-to-counts
  "Summarise the single Marc4J record `rec` as a nested map of counts.

  The returned map has four keys:
    :record-count  - always 1
    :leader        - {position {character 1}} for leader positions 6 and 7
    :controlfields - {tag occurrences} over the record's ControlFields
    :datafields    - {tag {:count n
                           :indicator {1 {value n}, 2 {value n}}
                           :subfields {code n}}} over the record's DataFields

  A data field tag that occurs several times in the record has all of its
  occurrences summed into one entry, in the same way that control fields
  are counted per occurrence."
  [rec]
  (let [leader (-> rec .getLeader .toString)
        fields (.getVariableFields rec)
        ;; Counts for one data field occurrence, keyed by its tag so that
        ;; occurrences sharing a tag can be merged together below.
        datafield-counts (fn [df]
                           {(.getTag df)
                            {:count 1
                             :indicator {1 {(str (.getIndicator1 df)) 1}
                                         2 {(str (.getIndicator2 df)) 1}}
                             :subfields (frequencies (map #(str (.getCode %))
                                                          (.getSubfields df)))}})]
    {:record-count 1
     :leader (into {} (map (fn [position]
                             [position {(str (nth leader position)) 1}])
                           [6 7]))
     :controlfields (frequencies (map #(str (.getTag %))
                                      (filter #(instance? org.marc4j.marc.ControlField %)
                                              fields)))
     ;; Building this with (into {} ...) would keep only the last occurrence
     ;; of a repeated tag; summing the per-field maps keeps all of them.
     ;; The leading {} keeps the result a map when there are no data fields.
     :datafields (apply deep-merge-with + {}
                        (map datafield-counts
                             (filter #(instance? org.marc4j.marc.DataField %)
                                     fields)))}))
;; Fold the counts for one more record into an accumulated count structure
(defn add-counts-to-map
  "Return `results` with the counts derived from the Marc4J record `rec`
  added in. Numbers found at the same path in both structures are summed."
  [results rec]
  (let [record-counts (record-to-counts rec)]
    (deep-merge-with + results record-counts)))
;; Escape the characters that are significant in XML text and in the
;; single-quoted attribute values written by clojure.xml/emit, which does
;; no escaping of its own.
(defn- xml-escape [value]
  (let [^String text (str value)]
    (-> text
        (.replace "&" "&amp;")
        (.replace "<" "&lt;")
        (.replace ">" "&gt;")
        (.replace "'" "&apos;")
        (.replace "\"" "&quot;"))))

;; Turn a count structure into XML
(defn result-to-xml
  "Print the count structure `result` to *out* as a <marcstats> XML document
  using clojure.xml/emit.

  `result` has the shape produced by record-to-counts / add-counts-to-map.
  An empty `result` (no records counted) is rendered as a document with a
  record count of 0 and empty sections rather than failing. Values taken
  from the records (leader characters, tags, indicator values and subfield
  codes) are XML-escaped before being emitted."
  [result]
  (let [;; <value count='n'>v</value> elements for a {value n} map
        value-elements (fn [value-counts]
                         (map (fn [[value n]]
                                {:tag :value :attrs {:count n}
                                 :content [(xml-escape value)]})
                              value-counts))]
    (clojure.xml/emit
     {:tag :marcstats :attrs {:record_count (or (:record-count result) 0)}
      :content
      [;; Leader stats
       {:tag :leader :attrs nil
        :content (map (fn [position]
                        {:tag :position :attrs {:index position}
                         ;; `get` tolerates a missing :leader map
                         :content (value-elements (get (:leader result) position))})
                      [6 7])}

       ;; Control field stats
       {:tag :controlfields :attrs nil
        :content (map (fn [[tag n]]
                        {:tag :field :attrs {:tag (xml-escape tag) :count n}})
                      (:controlfields result))}

       ;; Data field stats
       {:tag :datafields :attrs nil
        :content (map (fn [[tag df]]
                        {:tag :field
                         :attrs {:tag (xml-escape tag) :count (:count df)}
                         :content [{:tag :indicators
                                    :content (map (fn [indicator]
                                                    {:tag :indicator :attrs {:code indicator}
                                                     :content (value-elements
                                                               (get (:indicator df) indicator))})
                                                  [1 2])}
                                   {:tag :subfields
                                    :content (map (fn [[code n]]
                                                    {:tag :subfield
                                                     :attrs {:code (xml-escape code) :count n}})
                                                  (:subfields df))}]})
                      (:datafields result))}]})))
;;; The definition for our MARC destination
;;
;; `writer` is where the XML report is printed when the destination is
;; closed; `counts` is a reference (an atom in get-destination-for) holding
;; the count structure accumulated so far.
(deftype CountingDestination [^BufferedWriter writer counts]
  MarcDestination
  ;; Nothing to set up
  (init [this])
  ;; Add the counts for `record` to the accumulated totals
  (write [this record]
    (swap! counts add-counts-to-map record))
  ;; Nothing is written until close, so there is nothing to flush
  (flush [this])
  ;; Print the accumulated counts as XML to `writer`, then close it.
  ;; The writer is closed even if producing the XML throws.
  (close [this]
    (try
      (binding [*out* writer]
        (result-to-xml (deref counts)))
      (finally
        (.close writer)))))
(defn output-file
  "Return the java.io.File named `<job id>.txt` inside the directory given
  by :output-dir in the dereferenced `config`. Both `config` and `job` are
  dereferenced, so they must be reference types such as atoms."
  [config job]
  (let [directory (:output-dir @config)
        filename (str (:id @job) ".txt")]
    (file directory filename)))
(defn get-output-for
  "Return the output file for `job` if it exists on disk, otherwise nil."
  [config job]
  (let [candidate (output-file config job)]
    (if (.exists candidate)
      candidate
      nil)))
(defn get-destination-for
  "Build a CountingDestination for `job`: open a writer on the job's output
  file and pair it with a fresh, empty count structure held in an atom."
  [config job]
  (CountingDestination. (writer (output-file config job))
                        (atom {})))
(defn delete-job
  "Delete the output file for `job` if it exists. Returns the result of
  File.delete when the file was present, otherwise nil."
  [config job]
  (let [target (output-file config job)]
    (if (.exists target)
      (.delete target))))
;; Hand this namespace's destination descriptor to marcgrep.core when the
;; namespace is loaded.
(let [destination {:description "Counts"
                   :get-destination-for get-destination-for
                   :get-output-for get-output-for
                   :delete-job delete-job}]
  (marcgrep.core/register-destination destination))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version='1.0' encoding='utf-8'?> | |
<marcstats record_count='9285'> | |
<leader> | |
<position index='6'> | |
<value count='7971'>a</value> | |
<value count='111'>c</value> | |
<value count='1'>d</value> | |
<value count='1059'>e</value> | |
<value count='2'>f</value> | |
<value count='3'>g</value> | |
<value count='20'>i</value> | |
<value count='10'>j</value> | |
<value count='77'>k</value> | |
<value count='20'>m</value> | |
<value count='3'>o</value> | |
<value count='8'>t</value> | |
</position> | |
<position index='7'> | |
<value count='1'>i</value> | |
<value count='40'>a</value> | |
<value count='461'>c</value> | |
<value count='49'>d</value> | |
<value count='354'>s</value> | |
<value count='8380'>m</value> | |
</position> | |
</leader> | |
<controlfields> | |
<field tag='006' count='35' /> | |
<field tag='007' count='6416' /> | |
<field tag='001' count='9285' /> | |
<field tag='005' count='9285' /> | |
<field tag='008' count='9285' /> | |
</controlfields> | |
<datafields> | |
<field tag='245' count='9285'> | |
<indicators> | |
<indicator code='1'> | |
<value count='94'></value> | |
<value count='2094'>0</value> | |
<value count='7097'>1</value> | |
</indicator> | |
<indicator code='2'> | |
<value count='1'>5</value> | |
<value count='1'>1</value> | |
<value count='367'>3</value> | |
<value count='585'>2</value> | |
<value count='1281'>4</value> | |
<value count='7050'>0</value> | |
</indicator> | |
</indicators> | |
<subfields> | |
<subfield code='n' count='63' /> | |
<subfield code='f' count='5' /> | |
<subfield code='p' count='69' /> | |
<subfield code='h' count='6659' /> | |
<subfield code='6' count='141' /> | |
<subfield code='b' count='3738' /> | |
<subfield code='a' count='9285' /> | |
<subfield code='c' count='5200' /> | |
</subfields> | |
</field> | |
</datafields> | |
</marcstats> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment