Skip to content

Instantly share code, notes, and snippets.

@dmitric
Created February 21, 2011 01:51
Show Gist options
  • Save dmitric/836560 to your computer and use it in GitHub Desktop.
Save dmitric/836560 to your computer and use it in GitHub Desktop.
hadoop to xml pipes example
(import '(java.io BufferedReader InputStreamReader FileReader File BufferedWriter FileWriter))
(defn cmd
  "Starts the external command `p` (coerced to a string with `str`) through
  the JVM's Runtime and returns the resulting java.lang.Process."
  [p]
  (let [command-line (str p)
        runtime      (Runtime/getRuntime)]
    (.exec runtime command-line)))
(defn cmdout
  "Prints everything available on the input stream of `o`, one line at a
  time, to *out*.

  `o` is any object exposing a `getInputStream` method (for example the
  Process returned by `cmd`). The call blocks until the stream reaches
  end-of-file. The reader, and with it the underlying stream, is closed
  when the function returns or throws. Returns nil."
  [o]
  ;; with-open guarantees the reader is closed even if printing fails;
  ;; the original left the stream open.
  (with-open [rdr (BufferedReader.
                   (InputStreamReader.
                    (.getInputStream o)))]
    ;; doseq is the eager, side-effect-oriented form of (dorun (map ...)).
    (doseq [line (line-seq rdr)]
      (println line))))
(defn markup
  "Wraps `value` in an opening and closing tag named `tag-name` and returns
  the result as a string, e.g. (markup \"age\" 3) => \"<age>3</age>\".
  No escaping is applied to either argument."
  [tag-name value]
  (let [opening (str "<" tag-name ">")
        closing (str "</" tag-name ">")]
    (str opening value closing)))
(defn build-indexes
  "Runs the example Hadoop (Cascalog) job, shards its output and runs the
  Sphinx indexer.

  Steps:
    1. The query writes one `<id>TAB<sphinx:document ...>` text line per
       record under `output-path`.
    2. Every line of every regular, non-hidden file found in `output-path`
       is appended to `<output-path>/<id mod num-shards>.xml`.
    3. `indexer napoli` is executed and its output is printed.

  Parameters:
    output-path - directory the job writes to and the shard files go to.
    num-shards  - number of shard files to distribute the documents over.

  Throws NumberFormatException when a line's id is not an integer.
  Returns nil."
  [output-path num-shards]
  ;;run your hadoop job, example of one
  (?<- (hfs-textline output-path) [?i ?xml] (id ?p ?i)(age ?p ?a)(gender ?p ?g)
       (markup "name" ?p :> ?nx)(markup "age" ?a :> ?ax)(markup "gender" ?g :> ?gx)
       (str "<sphinx:document id=\""?i"\">" ?nx ?ax ?gx "</sphinx:document>":> ?xml))
  (println (str "organize entries into " num-shards " files"))
  ;;create num-shard many files with schema at head of file TODO
  ;;distribute map reduce results
  ;; The helpers are local (letfn) instead of nested defns: nested defns
  ;; leak global vars, and the original referenced write-lines before it was
  ;; defined, which does not compile in a fresh namespace.
  (letfn [(append-text [file-name text]
            ;; FileWriter in append mode; the whole string is written in one
            ;; call instead of character by character.
            (with-open [^BufferedWriter wtr (BufferedWriter. (FileWriter. (str file-name) true))]
              (.write wtr (str text))))
          (shard-of [^String id]
            ;; Long/parseLong instead of read-string: read-string can
            ;; evaluate reader forms found in the data and treats a leading
            ;; zero as octal.
            (mod (Long/parseLong (.trim id)) num-shards))
          (process-line [^String line]
            ;; Blank lines carry no id and are skipped.
            (when-not (.isEmpty (.trim line))
              ;; Limit of 2: only the first run of tabs separates id from
              ;; payload, so tabs inside the payload are preserved.
              (let [tokens  (seq (.split line "[\t]+" 2))
                    id      (first tokens)
                    payload (last tokens)
                    shard   (shard-of id)]
                (print (str "id:" id " to " shard " "))
                (append-text (str output-path "/" shard ".xml") (str payload "\n"))
                (println payload))))
          (process-file [^File file]
            (with-open [rdr (BufferedReader. (FileReader. file))]
              (doseq [line (line-seq rdr)]
                (process-line line))))]
    ;; listFiles is snapshotted before any shard file is created, so the
    ;; freshly written .xml files are not re-read. Hidden entries (name
    ;; starting with ".") are skipped as before; directories are skipped
    ;; because FileReader cannot open them.
    (let [files (filter (fn [^File f]
                          (and (.isFile f)
                               (not (.startsWith (.getName f) "."))))
                        (seq (.listFiles (File. (str output-path)))))]
      (doseq [f files]
        (process-file f))))
  ;;close off docset tag TODO
  (println "run indexer")
  ;; for each xml file, run indexer. need to have each index described in sphinx.conf TODO
  (cmdout (cmd "indexer napoli")))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment