Created
March 2, 2017 23:57
-
-
Save halgari/1ce239addc964b3bca0eb73601e05589 to your computer and use it in GitHub Desktop.
Word Count (MMap)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; On my box, `wc` processes the 11GB JSON file in 50 sec.
;; This code does it in 11 sec.
;; Compiled with `lein uberjar` and run with `java -server -jar word-count-STANDALONE.jar`.
(ns word-count.core
  "Counts line-feed bytes in a large file by memory-mapping it in chunks."
  (:import (java.io RandomAccessFile File)
           (java.nio.channels FileChannel$MapMode)
           (java.nio ByteBuffer))
  (:gen-class))

;; Report every interop call the compiler cannot resolve statically.
(set! *warn-on-reflection* true)

;; Not really needed, just checking for anything I missed:
;; use unchecked primitive math and warn wherever arithmetic falls back to boxing.
(set! *unchecked-math* :warn-on-boxed)
(defn count-lines
  "Returns the number of bytes equal to 10 (ASCII line feed) found in `buf`
  at indices `start` (inclusive) through `end` (exclusive).

  Bytes are read with the indexed form of `.get`, so each read names the
  index it wants rather than relying on the buffer's current position."
  [^ByteBuffer buf ^long start ^long end]
  (loop [idx start
         lines 0]
    (if (< idx end)
      (if (= 10 (.get buf idx))
        (recur (inc idx) (inc lines))
        (recur (inc idx) lines))
      lines)))
(defn do-it
  "Counts the line-feed bytes in a file by memory-mapping it in chunks of at
  most 1 GiB and scanning each chunk in its own future with `count-lines`.

  With no arguments the original hard-coded path is used, so existing callers
  are unaffected; pass `path` to count a different file. Returns the total as
  a long (0 for an empty file)."
  ([] (do-it "/Users/tim/Downloads/json/out.json"))
  ([^String path]
   ;; Open read-only: the file is never written, and mode "rw" would create a
   ;; missing file and demand write permission. with-open closes the file
   ;; (and with it the channel) even if a worker throws.
   (with-open [raf (RandomAccessFile. path "r")]
     (let [chan (.getChannel raf)
           file-size (.length raf)
           ;; Byte buffers can only be Integer/MAX_VALUE long, so we have to chunk them
           chunk-size (* 1024 1024 1024)
           ;; mapv is eager: every chunk is mapped and every future is started
           ;; before the first result is awaited, independent of how the
           ;; underlying seq happens to be chunked.
           chunks (mapv (fn [offset]
                          (let [offset (long offset)
                                buf-size (min chunk-size
                                              (- file-size offset))
                                chunk (.map chan FileChannel$MapMode/READ_ONLY
                                            offset buf-size)]
                            (future (count-lines chunk 0 buf-size))))
                        (range 0 file-size chunk-size))]
       ;; Deref inside with-open so all workers finish before the file closes.
       (reduce (fn [^long acc f]
                 (+ acc ^long @f))
               0 chunks)))))
(defn -main
  "Command-line entry point (the namespace is compiled with :gen-class).
  Prints a banner, runs `do-it` under `time`, prints the result, and always
  shuts down the agent thread pools before returning."
  []
  (println "Running...")
  (try
    (println "RESULT:" (time (do-it)))
    (println "Done")
    (finally
      ;; `do-it` starts futures, which run on the agent thread pool. Shut the
      ;; pool down even when `do-it` throws, so leftover pool threads do not
      ;; keep the JVM alive after a failure.
      (shutdown-agents))))
;; REPL scratch form: never evaluated on load; evaluate the inner expression
;; by hand to time a run.
(comment
  (time (do-it)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
For those wondering about line 19 (the `(= 10 (.get buf idx))` comparison): 10 is the ASCII code for a line feed.