Created
May 31, 2013 15:34
-
-
Save thattommyhall/5685808 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns groupings-clj.core | |
(:use [clojure.java.io :as io]) | |
(:require [clojure.string :as str]) | |
(:import (org.apache.lucene.analysis Analyzer PorterStemFilter PorterStemmer) | |
(org.apache.lucene.analysis.standard StandardAnalyzer) | |
(org.apache.lucene.util Version) | |
(org.apache.lucene.analysis.tokenattributes TermAttribute) | |
(org.apache.lucene.analysis.standard StandardTokenizer) | |
;; (org.apache.lucene.analysis.en PorterStemFilter PorterStemmer) | |
;; (org.apache.lucene.analysis.de GermanAnalyzer GermanStemFilter) | |
;; (org.apache.lucene.analysis.fr FrenchAnalyzer) | |
;; (org.apache.lucene.analysis.es SpanishAnalyzer) | |
;; (org.apache.lucene.analysis.snowball SnowballFilter) | |
;; (org.apache.lucene.analysis.tokenattributes CharTermAttribute) | |
;; (org.tartarus.snowball.ext FrenchStemmer, SpanishStemmer) | |
(java.io StringReader ByteArrayInputStream) | |
(java.util.zip InflaterInputStream) | |
(sun.misc BASE64Decoder))) | |
(defn lines-from-gzip
  "Returns a lazy sequence of the text lines contained in the gzip-compressed
  file `filename`. The underlying reader is not closed by this function."
  [filename]
  (-> filename
      io/input-stream
      java.util.zip.GZIPInputStream.
      io/reader
      line-seq))
(defn keywords-from-file
  "Returns a lazy sequence holding the fifth tab-separated field (index 4)
  of every line in the gzip-compressed file `filename`."
  [filename]
  (map (fn [row]
         (nth (str/split row #"\t") 4))
       (lines-from-gzip filename)))
(defn tokenize
  "Wraps the string `str` in a StringReader and returns a Lucene 3.0
  StandardTokenizer reading from it."
  [str]
  (->> str
       StringReader.
       (StandardTokenizer. Version/LUCENE_30)))
(defn porterstem-filter
  "Wraps `tokenstream` in a PorterStemFilter and returns the filter."
  [tokenstream]
  (new PorterStemFilter tokenstream))
(defn tokens
  "Returns a lazy sequence of the term strings produced by `tokenstream`.

  The TermAttribute is looked up once, eagerly, when this function is called;
  each realised element advances the stream with `.incrementToken`. When the
  stream reports that it is exhausted, `.end` and `.close` are called on it,
  following the TokenStream consumer workflow, and the sequence terminates.
  If the sequence is never fully realised the stream is left open."
  [tokenstream]
  (let [termAttr (.getAttribute tokenstream TermAttribute)
        ;; The attribute instance is shared across all tokens, so it is
        ;; fetched once here instead of on every recursive step.
        step (fn step []
               (lazy-seq
                 (if (.incrementToken tokenstream)
                   (cons (.term termAttr) (step))
                   (do
                     ;; Exhausted: finalise and release the stream.
                     (.end tokenstream)
                     (.close tokenstream)
                     nil))))]
    (step)))
(defn stem
  "Tokenizes the string `input`, passes the tokens through the Porter stem
  filter, and returns the resulting terms as a fully realised sequence of
  strings."
  [input]
  (let [tokenStream (porterstem-filter (tokenize input))]
    ;; doall forces the tokens here. Returning an unrealised lazy seq would
    ;; let callers such as (dorun (map stem ...)) finish without ever
    ;; advancing the token stream, so no stemming would run, and under
    ;; pmap the work would not happen on the worker threads.
    (doall (tokens tokenStream))))
(def filename "/home/thattommyhall/Downloads/all_kw.tsv.gz") | |
(defn -main
  "Prints timings for running `stem` over every line of `filename`, with
  `map` and `pmap` alternating twice, then prints the time taken just to read
  the lines."
  []
  ;; Same order as a hand-written sequence: map, pmap, map, pmap.
  (doseq [mapper [map pmap map pmap]]
    (time (dorun (mapper stem (lines-from-gzip filename)))))
  ;; Baseline: reading and decompressing the lines only.
  (time (dorun (lines-from-gzip filename))))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment