;; Created February 24, 2019 22:28
;; Clojure compress / decompress data examples
;; NOTE(review): the map below looks like the contents of a deps.edn file rather
;; than part of this namespace -- presumably it belongs in a separate file; confirm.
;; It pins the Maven versions of Clojure, nippy and commons-compress.
{:deps {org.clojure/clojure {:mvn/version "1.10.0"}
com.taoensso/nippy {:mvn/version "2.14.0"}
org.apache.commons/commons-compress {:mvn/version "1.18"}}}
(ns lzw
  (:require [clojure.set]
            [clojure.java.io :refer [file output-stream input-stream] :as io]
            [taoensso.nippy :as nippy])
  (:import (java.io DataOutputStream DataInputStream File)
           [java.util.zip GZIPInputStream GZIPOutputStream ZipEntry ZipOutputStream ZipInputStream]
           (org.apache.commons.compress.compressors CompressorStreamFactory)
           (org.apache.commons.compress.archivers.sevenz SevenZOutputFile SevenZFile)
           (org.apache.commons.compress.archivers ArchiveInputStream)))
(defn gunzip
  "Decompress gzip data.
  `input`  - gzip-compressed source; anything `io/input-stream` can open.
  `output` - destination; anything `io/copy` can write to (e.g. a file).
  `opts`   - extra options, forwarded unchanged to `io/copy`."
  [input output & opts]
  (let [raw (io/input-stream input)]
    ;; Closing the gzip wrapper also closes the underlying stream.
    (with-open [unpacked (GZIPInputStream. raw)]
      (apply io/copy unpacked output opts))))
(defn gzip
  "Compress data with gzip.
  `input`  - source; anything `io/copy` can read from (e.g. a file).
  `output` - destination; anything `io/output-stream` can open.
  `opts`   - extra options, forwarded unchanged to `io/copy`.
  The bytes written to `output` are gzip compressed."
  [input output & opts]
  (let [sink (io/output-stream output)]
    ;; Closing the gzip wrapper writes the trailer and closes the underlying stream.
    (with-open [packed (GZIPOutputStream. sink)]
      (apply io/copy input packed opts))))
;; REPL examples for gzip / gunzip. Wrapped in `comment` so that loading the
;; namespace does not touch the file system.
(comment
  (gzip (file "big.txt") (file "big.gz"))
  (gunzip (file "big.gz") (file "big.1.txt")))
;; ZIP
(defn zip-file
  "Compress a file or a whole folder into a zip archive.
  `input-file-or-folder` - file or folder to be compressed.
  `out-file`             - name of the archive to create.
  Every regular file found under the input is stored under its own path;
  directories themselves get no entry."
  [input-file-or-folder out-file]
  (with-open [archive (ZipOutputStream. (io/output-stream out-file))]
    (doseq [src (file-seq (io/file input-file-or-folder))]
      (when (.isFile src)
        (let [entry (ZipEntry. (.getPath src))]
          (.putNextEntry archive entry)
          (io/copy src archive)
          (.closeEntry archive))))))
;; An empty archive name cannot be opened for writing; "test.zip" is a
;; placeholder -- NOTE(review): confirm the intended archive name.
(comment (zip-file "./test" "test.zip"))
(defn unzip-file
  "Uncompress a zip archive.
  `input`  - zip archive to extract; anything `io/input-stream` can open.
  `output` - name of the folder to extract into.
  Missing directories are created as needed. Returns nil.
  Throws `ex-info` when an entry name would resolve outside of `output`."
  [input output]
  (let [^File root (.getCanonicalFile (io/file output))]
    (with-open [stream (-> input io/input-stream ZipInputStream.)]
      (loop [entry (.getNextEntry stream)]
        (when entry
          ;; Resolve through java.io.File so both "/" and the platform
          ;; separator in entry names are understood.
          (let [^File target (.getCanonicalFile
                               (File. root ^String (.getName entry)))]
            ;; Archives are external input: refuse names such as "../x"
            ;; that would be written outside the output folder.
            (when-not (.startsWith (.toPath target) (.toPath root))
              (throw (ex-info "zip entry escapes the output folder"
                              {:entry (.getName entry) :output (str root)})))
            (if (.isDirectory entry)
              (.mkdirs target)
              (do
                (io/make-parents target)
                ;; Copies up to the end of the current entry only.
                (io/copy stream target))))
          (recur (.getNextEntry stream)))))))
;; An empty archive name cannot be opened for reading; "test.zip" is a
;; placeholder -- NOTE(review): confirm the intended archive name.
(comment (unzip-file "test.zip" "./test2"))
;; 7-zip
(defn compress7z-file
  "Compress a single file into a new 7-zip archive.
  `input-filename` - path of the file to compress; also used as the entry name.
  `archive-name`   - path of the 7z archive to create.
  The input stream and the archive are both closed, even when an error
  occurs while copying. Returns nil."
  [input-filename archive-name]
  (let [in-file (file input-filename)]
    (with-open [in      (input-stream in-file)
                seven-z (SevenZOutputFile. (file archive-name))]
      (.putArchiveEntry seven-z
                        (.createArchiveEntry seven-z in-file input-filename))
      (let [buf (byte-array 1024)]
        ;; `.read` reports -1 at end of input, which ends the loop.
        (loop [n (.read in buf)]
          (when (pos? n)
            (.write seven-z buf 0 n)
            (recur (.read in buf)))))
      (.closeArchiveEntry seven-z))))
;; REPL example; uses the name the function is actually defined under and is
;; wrapped in `comment` so that loading the namespace does not write files.
(comment (compress7z-file "big.txt" "big.7z"))
(defn decompress-7zip
  "Decompress a 7-zip archive.
  `input`  - name of the 7-zip archive to be uncompressed.
  `output` - name of the folder to extract into.
  Missing directories are created before a file is opened for writing, and
  every output stream is closed even when copying fails. Returns nil.
  Throws `ex-info` when an entry name would resolve outside of `output`."
  [input output]
  (let [^File root (.getCanonicalFile (file output))]
    (with-open [s7-zip-archive (-> (file input) SevenZFile.)]
      (loop [entry (.getNextEntry s7-zip-archive)]
        (when entry
          (let [^File target (.getCanonicalFile
                               (File. root ^String (.getName entry)))]
            ;; Archives are external input: refuse names such as "../x"
            ;; that would be written outside the output folder.
            (when-not (.startsWith (.toPath target) (.toPath root))
              (throw (ex-info "7z entry escapes the output folder"
                              {:entry (.getName entry) :output (str root)})))
            (if (.isDirectory entry)
              (.mkdirs target)
              (let [buf (byte-array 1024)]
                ;; The parent folder must exist before the file is opened.
                (io/make-parents target)
                (with-open [out (output-stream target)]
                  ;; `.read` reports -1 once the current entry is exhausted.
                  (loop [n (.read s7-zip-archive buf)]
                    (when (pos? n)
                      (.write out buf 0 n)
                      (recur (.read s7-zip-archive buf))))))))
          (recur (.getNextEntry s7-zip-archive)))))))
;; REPL example, wrapped in `comment` so that loading the namespace does not
;; extract anything.
(comment (decompress-7zip "big.7z" "./"))
;; LZW
(defn make-dict
  "Build the initial LZW dictionary: every one-element vector `[n]`,
  for n from 0 to 255, is mapped to the code n."
  []
  (into {}
        (map (fn [code] [[code] code]))
        (range 256)))
(defn compress-
  "Reducing step of the LZW encoder.
  The state map holds:
    `:dict`  - map from byte-value vector to code,
    `:index` - the next free code,
    `:w`     - the sequence matched so far,
    `:out`   - the codes emitted so far.
  `b` is the next input byte. JVM bytes are signed (-128..127) while the
  dictionary is keyed by 0..255, so `b` is first mapped to its unsigned value.
  Returns the updated state."
  [{:keys [dict index w out] :as a} b]
  (let [b      (bit-and b 0xff)
        buffer (conj w b)]
    (if (contains? dict buffer)
      ;; Still a known sequence: keep extending the match.
      (assoc a :w buffer)
      ;; Unknown sequence: emit the code of the match so far, register the
      ;; new sequence, and start a fresh match from the current byte.
      {:dict  (assoc dict buffer index)
       :index (inc index)
       :out   (conj out (get dict w))
       :w     [b]})))
(defn compress
  "LZW-compress `data` (a byte array or any sequence of byte values).
  Returns a vector of integer codes; empty input yields an empty vector."
  [data]
  (let [initial-data {:dict (make-dict) :index 256 :w [] :out []}
        {:keys [dict w out]} (reduce compress- initial-data (seq data))]
    ;; Flush the pending match. It is empty only when there was no input at
    ;; all, in which case there is nothing to emit.
    (if (seq w)
      (conj out (get dict w))
      out)))
(defn decompress-
  "Reducing step of the LZW decoder.
  The state map holds:
    `:dict`  - map from code to byte-value vector,
    `:index` - the next free code,
    `:w`     - the previously decoded sequence,
    `:out`   - everything decoded so far.
  `code` is the next code to decode. Returns the updated state."
  [{:keys [dict index w out]} code]
  (let [known (find dict code)
        ;; A code that is not in the dictionary yet can only stand for the
        ;; previous sequence followed by its own first element.
        entry (if known
                (val known)
                (conj w (first w)))
        learned (conj w (first entry))]
    {:w     entry
     :out   (conj out entry)
     :index (inc index)
     :dict  (assoc dict index learned)}))
(defn decompress
  "Decode a sequence of LZW codes, as produced by `compress`.
  Returns a flat sequence of byte values in the range 0..255; empty input
  yields an empty sequence."
  [data]
  (if (empty? data)
    ;; Without this guard the first code would be nil and end up in the output.
    ()
    (let [f [(first data)]
          initial-data {:dict (clojure.set/map-invert (make-dict)) :index 256 :w f :out f}
          result (reduce decompress- initial-data (rest data))]
      ;; `:out` holds the first value followed by one vector per decoded code.
      (flatten (:out result)))))
;; end of LZW
;; In-memory round trip of a short literal.
(-> "ABACABACABADE" .getBytes compress decompress byte-array String.)
;; The same round trip, going through a nippy-frozen file.
(->> "ABACABACABADE" .getBytes compress (nippy/freeze-to-file "a.lzw"))
(-> "a.lzw" nippy/thaw-from-file decompress byte-array String.)
;; Round trip of this source file through a nippy-frozen file.
(def s (slurp "src/lzw.clj"))
(def sb (.getBytes s))
(->> sb compress (nippy/freeze-to-file "lzw.lzw"))
(def ds (-> "lzw.lzw" nippy/thaw-from-file decompress byte-array String.))
(= s ds)
;; The same round trip in memory, then compare the code count to the byte count.
(def cb (compress sb))
(def db (decompress cb))
(def ds (-> db byte-array String.))
(= ds s)
(count cb)
(count db)
;; Store the codes as 16-bit words and read them back.
;; NOTE: this only works while every code fits in 16 bits.
(def cb (short-array (compress sb)))
(alength cb)
(with-open [out (DataOutputStream. (output-stream (file "lzw.lzw")))]
  (run! (fn [w] (.writeShort out w)) cb))
(def in (DataInputStream. (input-stream (file "lzw.lzw"))))
;;(def buf (short-array (/ (.length ^File (file "lzw.lzw")) 2)))
;;(alength buf)
(def rb (loop [buf []
               avail (> (.available in) 0)]
          (if-not avail
            ;; Nothing left to read: the collected codes are the result.
            buf
            ;; Read unsigned: `.readShort` would return codes above 32767
            ;; as negative numbers.
            (recur (conj buf (.readUnsignedShort in)) (> (.available in) 0)))))
;; Release the file handle once everything has been read.
(.close in)
(count rb)
(def dd (decompress rb))
(String. (byte-array dd))
;; Same experiment on a bigger input, storing every code as a 32-bit integer.
(def s (slurp "big.txt"))
(def sb (.getBytes s))
(alength sb)
(def cb (int-array (compress sb)))
(count cb)
(with-open [out (DataOutputStream. (output-stream (file "big.lzw")))]
  (run! (fn [w] (.writeInt out w)) cb))
(def in (DataInputStream. (input-stream (file "big.lzw"))))
;;(def buf (short-array (/ (.length ^File (file "lzw.lzw")) 2)))
;;(alength buf)
(def rb (loop [buf []
               avail (> (.available in) 0)]
          (if-not avail
            ;; Nothing left to read: the collected codes are the result.
            buf
            (recur (conj buf (.readInt in)) (> (.available in) 0)))))
;; Release the file handle once everything has been read.
(.close in)
(count rb)
(def dd (decompress rb))
(String. (byte-array dd))
;; See also: LZW compression

