Created
August 1, 2014 19:28
-
-
Save ikarth/7994dd7899e4c6ffa74a to your computer and use it in GitHub Desktop.
Callisto Email CSV Redacting and Formatting in Clojure
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns callisto-tools.core | |
(:require [clojure.pprint] | |
[clojure.math.numeric-tower] | |
[clojure-csv.core]) | |
) | |
(defn personal-info
  "Return the redaction table: a vector of two-element vectors, each holding
  the text to look for followed by the text to put in its place."
  []
  [["string to delete" "string to replace it with"]])
;(defn replace-info [file & replacements] | |
; (let [replacement-list (partition 2 replacements)] | |
; replacement-list)) | |
; ;(reduce #(apply clojure.string/replace %1 %2) file replacement-list))) | |
(defn redact-personal-info
  "Apply every substitution in `parsing-list` to `file`, in order, and return
  the resulting string. The first item of each entry is what to match and the
  second item is what to replace it with."
  [file parsing-list]
  (letfn [(substitute [text entry]
            (clojure.string/replace text (first entry) (second entry)))]
    (reduce substitute file parsing-list)))
(defn redact-header
  "Strip moderator header blocks from `file` and unwrap bracketed names.

  Three passes run in order:
  1. delete everything from `Subject:` through the nearest `<The Moderator>`
     that follows 70 to 320 characters later (newlines included);
  2. the same for a bare `The Moderator`, 70 to 420 characters later;
  3. drop the angle brackets around any `<X: the Y>` span."
  [file]
  (-> file
      (clojure.string/replace #"(?s)Subject:.{70,320}?<The Moderator>" "")
      (clojure.string/replace #"(?s)Subject:.{70,420}?The Moderator" "")
      (clojure.string/replace #"<(.+?: the.+?)>" "$1")))
(defn redact-email-chains
  "Remove quoted reply text from `file`.

  Deletes every line that starts with `>` together with its line ending
  (CRLF or bare LF), then deletes `On ... wrote:` attribution text that
  begins at the start of a line. Returns the resulting string."
  [file]
  (-> file
      ;; \r?\n rather than \r\n, so quoted lines are also removed from
      ;; LF-only input; CRLF input is matched exactly as before.
      (clojure.string/replace #"(?m)^>.*?$\r?\n" "")
      (clojure.string/replace #"(?m)^On .*?wrote:" "")))
(defn mark-linebreaks
  "Replace every line break in `source-text` with a pilcrow (¶) so the break
  positions can be found for later splitting. Both Windows (CRLF) and Unix
  (LF) line endings are recognised; a CRLF pair becomes a single marker."
  [source-text]
  (clojure.string/replace source-text #"\r?\n" "¶"))
(defn remove-duplication
  "If `text` is the same string written twice (comparing the two halves after
  trimming whitespace from each), return one trimmed copy; otherwise return
  `text` unchanged. For odd lengths the split point is rounded down."
  [text]
  (let [;; quot keeps the split index an integer; (/ n 2) is a Ratio for odd n.
        midpoint    (quot (count text) 2)
        first-half  (clojure.string/trim (subs text 0 midpoint))
        second-half (clojure.string/trim (subs text midpoint))]
    (if (= first-half second-half)
      second-half
      text)))
(defn remove-qtd-space
  "Tidy a quoted field. First, the first occurrence of a double quote followed
  by a space at the start of a line loses the space. Then, if the string begins
  and ends with a double quote, that enclosing pair is removed."
  [text]
  (let [tightened (clojure.string/replace-first text #"(?m)^\" " "\"")]
    (clojure.string/replace tightened #"^\"(.+?)\"$" "$1")))
(defn process-csv
  "Turn parsed CSV rows into cleaned rows with a header row in front.

  Each row with more than five fields becomes
  [subject source target date body], built from fields 0, 1, 2, 3 and 5
  (field 4 is not used). A row with five or fewer fields becomes nil."
  [csv-file]
  (letfn [(clean-body [raw]
            ;; Trim, mark line breaks, then drop the moderator signature and
            ;; the remainder of any original-message / forwarded marker line.
            (-> raw
                clojure.string/trim
                mark-linebreaks
                (clojure.string/replace #"\" The Moderator" "")
                (clojure.string/replace #"(?m)----- Original Message.*$" "")
                (clojure.string/replace #"(?m)----- Forwarded.*$" "")))
          (clean-party [raw]
            (-> raw
                clojure.string/trim
                remove-duplication
                remove-qtd-space))
          (clean-row [row]
            (when (> (.length row) 5)
              [(remove-qtd-space (clojure.string/trim (nth row 0)))
               (clean-party (nth row 1))
               (clean-party (nth row 2))
               (nth row 3)
               (clean-body (nth row 5))]))]
    (let [header ["Subject" "Source" "Target" "Date" "Body"]]
      ;; conj on the seq returned by map puts the header first.
      (conj (map clean-row csv-file) header))))
(defn read-file
  "Slurp `filename`, apply the personal-info, header and email-chain
  redactions in that order, and parse the redacted text as CSV."
  [filename]
  (-> (slurp filename)
      (redact-personal-info (personal-info))
      redact-header
      redact-email-chains
      clojure-csv.core/parse-csv))
;; Scratch check: run the three redaction passes over a literal string.
(-> "string to completely erase"
    (redact-personal-info (personal-info))
    redact-header
    redact-email-chains)

;; Read and clean the input CSV; the value is not used by anything below.
(-> "data//index_c2_B.csv"
    read-file
    process-csv)

;; Read and clean the input CSV again, serialise it, and write the output file.
(->> (read-file "data//index_c2_B.csv")
     process-csv
     clojure-csv.core/write-csv
     (apply str)
     (spit "data//output_c2_2.csv"))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment