Skip to content

Instantly share code, notes, and snippets.

@ikarth
Created August 1, 2014 19:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ikarth/7994dd7899e4c6ffa74a to your computer and use it in GitHub Desktop.
Save ikarth/7994dd7899e4c6ffa74a to your computer and use it in GitHub Desktop.
Callisto Email CSV Redacting and Formatting in Clojure
(ns callisto-tools.core
  (:require [clojure.math.numeric-tower]
            [clojure.pprint]
            [clojure.string]
            [clojure-csv.core]))
(defn personal-info
  "Return the redaction table: a vector of two-element vectors, each holding
  the string to delete followed by the string to replace it with."
  []
  [["string to delete" "string to replace it with"]])
;(defn replace-info [file & replacements]
; (let [replacement-list (partition 2 replacements)]
; replacement-list))
; ;(reduce #(apply clojure.string/replace %1 %2) file replacement-list)))
(defn redact-personal-info
  "Apply each [target replacement] pair of parsing-list to file, in order,
  feeding the result of one replacement into the next, and return the final
  text."
  [file parsing-list]
  (reduce (fn [text [target replacement]]
            (clojure.string/replace text target replacement))
          file
          parsing-list))
(defn redact-header
  "Strip header residue from file and return the cleaned text.

  Three regex passes run in sequence:
  1. delete a Subject: run of 70-320 characters (newlines included) ending
     in <The Moderator>;
  2. delete a Subject: run of 70-420 characters ending in The Moderator;
  3. drop the angle brackets around any <...: the...> span, keeping its
     contents."
  [file]
  (-> file
      (clojure.string/replace #"(?s)Subject:.{70,320}?<The Moderator>" "")
      (clojure.string/replace #"(?s)Subject:.{70,420}?The Moderator" "")
      (clojure.string/replace #"<(.+?: the.+?)>" "$1")))
(defn redact-email-chains
  "Remove quoted reply material from file and return the cleaned text.

  Two passes run in sequence:
  1. every line that starts with > is deleted together with its trailing
     line break;
  2. every `On ... wrote:` attribution at the start of a line is deleted
     (its line break is kept).

  The line break after a quoted line may be CRLF or a bare LF, so files with
  Unix line endings are handled the same way as Windows ones. A quoted line
  at the very end of the text with no trailing line break is left in place."
  [file]
  (-> file
      ;; \r?\n rather than \r\n: otherwise quoted lines in LF-only input
      ;; would never match and would survive redaction.
      (clojure.string/replace #"(?m)^>.*?$\r?\n" "")
      (clojure.string/replace #"(?m)^On .*?wrote:" "")))
(defn mark-linebreaks
  "Replace every line break in source-text with the marker character ¶ so the
  break positions can be found for later splitting.

  Both Windows (CRLF) and Unix (bare LF) line breaks are recognised; a CRLF
  pair becomes a single marker."
  [source-text]
  (clojure.string/replace source-text #"\r?\n" "¶"))
(defn remove-duplication
  "Collapse text that consists of the same string written twice.

  The text is cut at its midpoint (length divided by two, rounded down). If
  the two halves are equal after trimming surrounding whitespace, the trimmed
  second half is returned; otherwise text is returned unchanged."
  [text]
  (let [midpoint (quot (count text) 2)
        front    (clojure.string/trim (subs text 0 midpoint))
        back     (clojure.string/trim (subs text midpoint))]
    (if (= back front)
      back
      text)))
(defn remove-qtd-space
  "Tidy quoting around text and return the result.

  First, the first occurrence of a double quote followed by a space at the
  start of a line loses that space. Then, if the whole text starts and ends
  with a double quote (and has at least one character in between), those two
  outer quotes are removed."
  [text]
  (-> text
      (clojure.string/replace-first #"(?m)^\" " "\"")
      (clojure.string/replace #"^\"(.+?)\"$" "$1")))
(defn process-csv
  "Turn parsed CSV rows into cleaned output rows, preceded by a header row.

  csv-file is a sequence of rows, each row an indexed collection of strings.
  For every row with more than five fields the result row is
  [subject source target date body], built from fields 0, 1, 2, 3 and 5
  (field 4 is not used):
  - subject: trimmed, then passed through remove-qtd-space;
  - source / target: trimmed, de-duplicated, then remove-qtd-space;
  - date: passed through untouched;
  - body: trimmed, line breaks marked, then moderator signature and
    original/forwarded message trailers removed.

  Rows with five or fewer fields produce nil in the result sequence."
  [csv-file]
  (let [clean-party (fn [field]
                      (-> field
                          clojure.string/trim
                          remove-duplication
                          remove-qtd-space))
        ;; The trailer patterns run after mark-linebreaks has rewritten the
        ;; line breaks it recognises, so `.*$` extends to the next remaining
        ;; line terminator or the end of the body.
        clean-body  (fn [field]
                      (-> field
                          clojure.string/trim
                          mark-linebreaks
                          (clojure.string/replace #"\" The Moderator" "")
                          (clojure.string/replace #"(?m)----- Original Message.*$" "")
                          (clojure.string/replace #"(?m)----- Forwarded.*$" "")))
        clean-row   (fn [row]
                      (when (> (.length row) 5)
                        [(remove-qtd-space (clojure.string/trim (nth row 0)))
                         (clean-party (nth row 1))
                         (clean-party (nth row 2))
                         (nth row 3)
                         (clean-body (nth row 5))]))]
    ;; conj on a seq prepends, so the header ends up as the first row.
    (conj (map clean-row csv-file)
          ["Subject" "Source" "Target" "Date" "Body"])))
(defn read-file
  "Load the CSV file at filename and return its parsed rows.

  The raw text is redacted before parsing, in this order: personal
  information (using the table from personal-info), header residue, then
  quoted e-mail chains. The redacted text is handed to
  clojure-csv.core/parse-csv."
  [filename]
  (-> filename
      slurp
      (redact-personal-info (personal-info))
      redact-header
      redact-email-chains
      clojure-csv.core/parse-csv))
;; Scratch check: push a literal string through the three redaction passes.
;; The result is not stored anywhere.
(-> "string to completely erase"
    (redact-personal-info (personal-info))
    redact-header
    redact-email-chains)

;; Read and process the input file; the resulting rows are not stored.
(-> "data//index_c2_B.csv"
    read-file
    process-csv)

;; Read and process the input file again, serialise the rows back to CSV
;; text and write that text to the output file.
(->> (read-file "data//index_c2_B.csv")
     process-csv
     clojure-csv.core/write-csv
     (apply str)
     (spit "data//output_c2_2.csv"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment