-
-
Save pithyless/44ce8908cee196fd3f5ace685b8db2f4 to your computer and use it in GitHub Desktop.
ngrams with transducers
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns demo | |
(:require [clojure.java.io :as io] | |
[net.cgrand.xforms :as x] | |
[net.cgrand.xforms.io :as xio] | |
[clojure.string :as str])) | |
(defn ngrams
  "Builds the word n-grams of `sent` for every size from `n` through `N`
  (inclusive).

  `sent` is split on runs of whitespace. Returns a lazy seq with one element
  per size, in ascending order; each element is a lazy seq of the n-grams of
  that size, each rendered as a single space-joined string. Sizes larger than
  the number of words yield an empty seq, and `n` > `N` yields an empty result."
  [n N sent]
  ;; The namespace aliases clojure.string as `str`; there is no `cstr` alias.
  (let [words (str/split sent #"\s+")]
    (for [size (range n (inc N))]
      ;; Step 1 gives overlapping (sliding) windows of `size` words.
      (map #(str/join " " %) (partition size 1 words)))))
(defn ng-count
  "Counts the n-grams of every size from `n` through `N` (inclusive) in `file`.

  `file` is handed to `clojure.java.io/reader`. Each line is tokenised on its
  own by `ngrams`, so no n-gram spans a line break.

  Returns a fully realised seq holding one frequency map (n-gram string ->
  occurrence count) per size, ordered from `n` to `N`. A file without any
  lines yields one empty map per size instead of throwing."
  [n N file]
  (with-open [rdr (io/reader file)]
    (let [;; One empty seq per requested size. Seeding the transposition with
          ;; it keeps `map` on a collection arity even when there are no lines;
          ;; a bare `(map concat)` returns a transducer, which fails as soon as
          ;; it is treated as a seq.
          seed     (map (constantly ()) (range n (inc N)))
          per-line (map (partial ngrams n N) (line-seq rdr))]
      (->> (apply map concat seed per-line) ; transpose: group n-grams by size
           (map frequencies)
           ;;(map (partial into (sorted-map)))
           ;; Realise everything before with-open closes the reader.
           doall))))
(defn ngrams2
  "Returns one flat lazy seq of the word n-grams of `text` for every size from
  `min` through `max` (inclusive), smaller sizes first.

  `text` is split on runs of whitespace. Each n-gram is a collection of words
  (not yet joined into a string); windows slide one word at a time, matching
  what `(partition size 1 words)` would produce."
  [min max text]
  (let [words (str/split text #"\s+")]
    ;; An explicit step of 1 is required: with a single argument x/partition
    ;; uses the window size as the step and yields non-overlapping chunks,
    ;; which silently drops most n-grams.
    (mapcat #(sequence (x/partition % 1) words)
            (range min (inc max)))))
(defn ng-count2
  "Transducer-based n-gram counter.

  Streams the lines of `file` via `xio/lines-in`, expands each line into its
  n-grams of sizes `min` through `max` with `ngrams2`, joins every n-gram into
  a space-separated string and tallies occurrences.

  Returns a single map of n-gram string -> count covering all sizes."
  [min max file]
  (let [line->grams (mapcat #(ngrams2 min max %))
        gram->str   (map (fn [words] (str/join " " words)))
        tally       (x/by-key identity x/count)]
    (into {}
          (comp line->grams gram->str tally)
          (xio/lines-in file))))
(comment
  ;; REPL scratchpad: rough timing of the two implementations.
  ;; Testing with Mark Twain - http://www.gutenberg.org/files/74/74-0.txt
  ;; The text is looked up with io/resource as "twain.txt".

  ;; ng-count returns a seq of maps (one per n-gram size), so `ffirst` picks
  ;; the first entry of the first map.
  (time (ffirst (ng-count 2 3 (io/resource "twain.txt"))))
  ;; "Elapsed time: 262.708325 msecs"

  ;; ng-count2 returns a single map, so `first` picks its first entry.
  (time (first (ng-count2 2 3 (io/resource "twain.txt"))))
  ;; "Elapsed time: 163.145644 msecs"
  )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment