Skip to content

Instantly share code, notes, and snippets.

@marktriggs
Last active December 17, 2015 10:48
Show Gist options
  • Save marktriggs/5597058 to your computer and use it in GitHub Desktop.
Save marktriggs/5597058 to your computer and use it in GitHub Desktop.
(ns marcgrep.destinations.counts
(:use marcgrep.protocols
clojure.java.io
clojure.contrib.map-utils
clojure.xml)
(:refer-clojure :exclude [next flush])
(:import [org.marc4j.marc Record VariableField DataField ControlField Subfield]
[java.io BufferedWriter FileOutputStream]))
;;; Turn a Marc4J record into a nested structure containing counts
;; Like:
;;
;; {:datafields {"907" {:count 1, :indicator {1 {" " 1}, 2 {" " 1}}, :subfields {"a" 1, "b" 1}},
;; "260" {:count 1, :indicator {1 {" " 1}, 2 {" " 1}}, :subfields {"6" 1, "a" 1, "b" 1, "c" 1}},
;; "040" {:count 1, :indicator {1 {" " 1}, 2 {" " 1}}, :subfields {"a" 1, "c" 1, "d" 1}},
;; "880" {:count 1, :indicator {1 {" " 1}, 2 {" " 1}}, :subfields {"6" 1, "a" 1, "b" 1, "c" 1}},
;; "066" {:count 1, :indicator {1 {" " 1}, 2 {" " 1}}, :subfields {"c" 1}},
;; "035" {:count 1, :indicator {1 {" " 1}, 2 {" " 1}}, :subfields {"a" 1}},
;; "300" {:count 1, :indicator {1 {" " 1}, 2 {" " 1}}, :subfields {"a" 1, "c" 1}},
;; "245" {:count 1, :indicator {1 {"0" 1}, 2 {"0" 1}}, :subfields {"6" 1, "a" 1, "b" 1, "c" 1}},
;; "246" {:count 1, :indicator {1 {"3" 1}, 2 {"0" 1}}, :subfields {"a" 1}},
;; "984" {:count 1, :indicator {1 {" " 1}, 2 {" " 1}}, :subfields {"a" 1, "i" 1, "c" 1}},
;; "700" {:count 1, :indicator {1 {"1" 1}, 2 {" " 1}}, :subfields {"6" 1, "a" 1}},
;; "800" {:count 1, :indicator {1 {"1" 1}, 2 {" " 1}}, :subfields {"6" 1, "a" 1}},
;; "998" {:count 1, :indicator {1 {" " 1}, 2 {" " 1}}, :subfields {"b" 1, "c" 1, "d" 1, "e" 1, "g" 1}},
;; "019" {:count 1, :indicator {1 {"1" 1}, 2 {" " 1}}, :subfields {"a" 1}},
;; "902" {:count 1, :indicator {1 {" " 1}, 2 {" " 1}}, :subfields {"a" 1}}},
;; :controlfields {"001" 1, "005" 1, "008" 1},
;; :leader {6 {"a" 1}, 7 {"m" 1}},
;; :record-count 1}
(defn record-to-counts
  "Build the count structure for a single Marc4J record `rec`.

  Returns a map with:
    :record-count   always 1
    :leader         for leader positions 6 and 7, a map of
                    {position {character-as-string 1}}
    :controlfields  {tag occurrences-in-this-record}
    :datafields     {tag {:count     occurrences-in-this-record
                          :indicator {1 {ind1 n}, 2 {ind2 n}}
                          :subfields {code n}}}

  Data fields that share a tag within the record are summed together, so a
  repeated tag contributes every occurrence (and every occurrence's
  indicators and subfields) instead of only the last one seen. This mirrors
  how control fields are tallied with `frequencies`."
  [rec]
  (let [fields (.getVariableFields rec)
        leader (.toString (.getLeader rec))
        ;; Counts contributed by one data field, keyed by its tag so that
        ;; the per-field maps can be merged tag by tag.
        datafield-counts (fn [df]
                           {(.getTag df)
                            {:count 1
                             :indicator {1 {(str (.getIndicator1 df)) 1}
                                         2 {(str (.getIndicator2 df)) 1}}
                             :subfields (frequencies (map #(str (.getCode %))
                                                          (.getSubfields df)))}})]
    {:record-count 1
     :leader (into {} (map (fn [position]
                             [position {(str (nth leader position)) 1}])
                           [6 7]))
     :controlfields (frequencies
                     (map #(str (.getTag %))
                          (filter #(instance? org.marc4j.marc.ControlField %)
                                  fields)))
     ;; Start from {} so a record without data fields still yields an empty
     ;; map; deep-merge-with sums the numeric leaves of same-tag fields.
     :datafields (reduce (partial deep-merge-with +)
                         {}
                         (map datafield-counts
                              (filter #(instance? org.marc4j.marc.DataField %)
                                      fields)))}))
;; Fold one record's counts into an accumulated count structure
(defn add-counts-to-map
  "Return `results` with the counts derived from `rec` (via
  `record-to-counts`) merged in; values found under the same nested keys
  are combined with `+`."
  [results rec]
  (let [record-counts (record-to-counts rec)]
    (deep-merge-with + results record-counts)))
;; Turn a count structure into XML
(defn result-to-xml
  "Print the count structure `result` as a <marcstats> XML document on
  *out* using clojure.xml/emit.

  `result` has the shape produced by merging `record-to-counts` maps:
  :record-count, :leader, :controlfields and :datafields. Any of these may
  be missing (for example when `result` is {} because no record was ever
  counted); missing sections are emitted as empty elements and a missing
  record count is reported as 0 instead of throwing.

  NOTE(review): values are handed to clojure.xml/emit as-is; confirm
  whether tags, codes or indicator values can contain characters that need
  XML escaping."
  [result]
  (letfn [;; One <value count='n'>v</value> element per [v n] entry.
          (value-elements [counts]
            (map (fn [[value n]]
                   {:tag :value :attrs {:count n} :content [value]})
                 counts))]
    (clojure.xml/emit
     {:tag :marcstats
      :attrs {:record_count (:record-count result 0)}
      :content
      [;; Leader stats
       {:tag :leader :attrs nil
        :content (map (fn [position]
                        {:tag :position :attrs {:index position}
                         ;; get-in tolerates an absent :leader map, where
                         ;; calling the (nil) map as a function would throw.
                         :content (value-elements
                                   (get-in result [:leader position]))})
                      [6 7])}
       ;; Control field stats
       {:tag :controlfields :attrs nil
        :content (map (fn [[tag n]]
                        {:tag :field :attrs {:tag tag :count n}})
                      (:controlfields result))}
       ;; Data field stats
       {:tag :datafields :attrs nil
        :content (map (fn [[tag df]]
                        {:tag :field
                         :attrs {:tag tag :count (:count df)}
                         :content
                         [{:tag :indicators
                           :content (map (fn [indicator]
                                           {:tag :indicator
                                            :attrs {:code indicator}
                                            :content (value-elements
                                                      (get-in df [:indicator indicator]))})
                                         [1 2])}
                          {:tag :subfields
                           :content (map (fn [[code n]]
                                           {:tag :subfield
                                            :attrs {:code code :count n}})
                                         (:subfields df))}]})
                      (:datafields result))}]})))
;;; The definition for our MARC destination
;;
;; `writer` is where the XML report is printed when the destination is
;; closed; `counts` is an atom holding the count structure accumulated so
;; far. MarcDestination is presumably the protocol supplied by
;; marcgrep.protocols -- confirm there.
(deftype CountingDestination [^BufferedWriter writer counts]
  MarcDestination
  ;; Nothing to set up.
  (init [this])
  ;; Fold this record's counts into the running totals.
  (write [this record]
    (swap! counts add-counts-to-map record))
  ;; Nothing is written before close, so there is nothing to flush.
  (flush [this])
  ;; Print the accumulated counts as XML to the writer, then close it.
  ;; The finally clause closes the writer even if producing the XML
  ;; throws, so the file handle is never leaked.
  (close [this]
    (try
      (binding [*out* writer]
        (result-to-xml (deref counts)))
      (finally
        (.close writer)))))
(defn output-file
  "Return the File named <job id>.txt inside the configured output
  directory. `config` and `job` are dereferenced, reading :output-dir from
  the former and :id from the latter."
  [config job]
  (let [directory (:output-dir (deref config))
        filename (str (:id (deref job)) ".txt")]
    (file directory filename)))
(defn get-output-for
  "Return the output File for `job` when it exists on disk, otherwise nil."
  [config job]
  (let [candidate (output-file config job)]
    (if (.exists candidate)
      candidate
      nil)))
(defn get-destination-for
  "Open a writer on the output file for `job` and return a new
  CountingDestination that starts from an empty count structure."
  [config job]
  (CountingDestination. (writer (output-file config job))
                        (atom {})))
(defn delete-job
  "Delete the output file for `job` if it exists. Returns the result of
  the delete call, or nil when there was no file to remove."
  [config job]
  (let [target (output-file config job)]
    (if (.exists target)
      (.delete target)
      nil)))
;; Hand marcgrep.core the descriptor for this destination: a display name
;; plus the functions defined in this namespace for creating a destination,
;; locating a job's output and deleting it.
(let [destination {:description "Counts"
                   :get-destination-for get-destination-for
                   :get-output-for get-output-for
                   :delete-job delete-job}]
  (marcgrep.core/register-destination destination))
<?xml version='1.0' encoding='utf-8'?>
<marcstats record_count='9285'>
<leader>
<position index='6'>
<value count='7971'>a</value>
<value count='111'>c</value>
<value count='1'>d</value>
<value count='1059'>e</value>
<value count='2'>f</value>
<value count='3'>g</value>
<value count='20'>i</value>
<value count='10'>j</value>
<value count='77'>k</value>
<value count='20'>m</value>
<value count='3'>o</value>
<value count='8'>t</value>
</position>
<position index='7'>
<value count='1'>i</value>
<value count='40'>a</value>
<value count='461'>c</value>
<value count='49'>d</value>
<value count='354'>s</value>
<value count='8380'>m</value>
</position>
</leader>
<controlfields>
<field tag='006' count='35' />
<field tag='007' count='6416' />
<field tag='001' count='9285' />
<field tag='005' count='9285' />
<field tag='008' count='9285' />
</controlfields>
<datafields>
<field tag='245' count='9285'>
<indicators>
<indicator code='1'>
<value count='94'></value>
<value count='2094'>0</value>
<value count='7097'>1</value>
</indicator>
<indicator code='2'>
<value count='1'>5</value>
<value count='1'>1</value>
<value count='367'>3</value>
<value count='585'>2</value>
<value count='1281'>4</value>
<value count='7050'>0</value>
</indicator>
</indicators>
<subfields>
<subfield code='n' count='63' />
<subfield code='f' count='5' />
<subfield code='p' count='69' />
<subfield code='h' count='6659' />
<subfield code='6' count='141' />
<subfield code='b' count='3738' />
<subfield code='a' count='9285' />
<subfield code='c' count='5200' />
</subfields>
</field>
</datafields>
</marcstats>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment