Skip to content

Instantly share code, notes, and snippets.

@thattommyhall
Last active December 18, 2015 00:29
Show Gist options
  • Save thattommyhall/5697116 to your computer and use it in GitHub Desktop.
Save thattommyhall/5697116 to your computer and use it in GitHub Desktop.
(ns groupings-clj.core
(:use [clojure.java.io :as io])
(:require [clojure.string :as str])
(:import (org.apache.lucene.analysis Analyzer PorterStemFilter PorterStemmer)
(org.apache.lucene.analysis.standard StandardAnalyzer)
(org.apache.lucene.util Version)
(org.apache.lucene.analysis.tokenattributes TermAttribute)
(org.apache.lucene.analysis.standard StandardTokenizer)
;; (org.apache.lucene.analysis.en PorterStemFilter PorterStemmer)
;; (org.apache.lucene.analysis.de GermanAnalyzer GermanStemFilter)
;; (org.apache.lucene.analysis.fr FrenchAnalyzer)
;; (org.apache.lucene.analysis.es SpanishAnalyzer)
;; (org.apache.lucene.analysis.snowball SnowballFilter)
;; (org.apache.lucene.analysis.tokenattributes CharTermAttribute)
;; (org.tartarus.snowball.ext FrenchStemmer, SpanishStemmer)
(java.io StringReader ByteArrayInputStream)
(java.util.zip InflaterInputStream)
(sun.misc BASE64Decoder)))
(defn lines-from-gzip
  "Returns the lines of the gzip-compressed text file `filename` as a lazy
  sequence of strings, or nil when the file contains no lines.

  `filename` may be anything `clojure.java.io/input-stream` accepts.
  The file is opened, and the first line read, at call time. The underlying
  reader is closed once the sequence has been consumed to the end, or as soon
  as a read fails. A sequence that is abandoned part-way keeps the reader
  open until it is garbage collected."
  [filename]
  (let [^java.io.BufferedReader rdr (io/reader
                                     (java.util.zip.GZIPInputStream.
                                      (io/input-stream filename)))
        read-line! (fn []
                     (try
                       (.readLine rdr)
                       (catch java.io.IOException e
                         ;; Do not leak the file handle when decompression fails.
                         (.close rdr)
                         (throw e))))
        step (fn step []
               (lazy-seq
                (if-let [line (read-line!)]
                  (cons line (step))
                  ;; End of input: release the file handle.
                  (do (.close rdr) nil))))]
    ;; `seq` forces the first read, matching line-seq: nil for an empty file.
    (seq (step))))
(defn keywords-from-file
  "Lazily yields, for every line of the gzipped file `filename`, the fifth
  tab-separated field (index 4) of that line.

  Realizing an element whose line has fewer than five fields throws, because
  `nth` is called without a not-found value."
  [filename]
  (let [fifth-column (fn [row]
                       (nth (str/split row #"\t") 4))]
    (map fifth-column (lines-from-gzip filename))))
(defn tokenize
  "Builds a Lucene StandardTokenizer (Version/LUCENE_30) that reads the
  string `str` through a StringReader. No tokens are consumed here."
  [str]
  (let [source (StringReader. str)]
    (StandardTokenizer. Version/LUCENE_30 source)))
(defn porterstem-filter
  "Wraps `tokenstream` in a Lucene PorterStemFilter and returns the filter."
  [tokenstream]
  (new PorterStemFilter tokenstream))
(defn tokens
  "Returns a lazy sequence of the term strings produced by `tokenstream`.

  `tokenstream` is a Lucene TokenStream that carries a TermAttribute. The
  attribute is looked up once, at call time, and then read after every
  successful `incrementToken`. When the stream reports that it has no more
  tokens it is finished with `end` and released with `close`, so a fully
  consumed sequence leaves no open stream behind."
  [tokenstream]
  (let [;; Type hints keep the per-token interop calls non-reflective.
        ^org.apache.lucene.analysis.TokenStream stream tokenstream
        ^TermAttribute term-attr (.getAttribute stream TermAttribute)
        step (fn step []
               (lazy-seq
                (if (.incrementToken stream)
                  (cons (.term term-attr) (step))
                  (do
                    ;; Lucene consumer workflow: end() then close() once
                    ;; incrementToken has returned false.
                    (.end stream)
                    (.close stream)
                    nil))))]
    (step)))
(defn stem
  "Tokenizes the string `input`, runs the tokens through the Porter stem
  filter, and returns the resulting terms as a lazy sequence of strings.

  The tokenizer and filter are created at call time. Tokens are only pulled
  through them as the returned sequence is realized, so callers that need the
  work done immediately must force it (e.g. with `doall`)."
  [input]
  (-> input
      tokenize
      porterstem-filter
      tokens))
;; Path of the gzip-compressed input file that -main passes to
;; lines-from-gzip. It is an absolute, machine-specific path; change it to
;; point at a local copy before running.
(def filename "/home/thattommyhall/Downloads/all_kw.tsv.gz")
(defn -main
  "Benchmark entry point. Prints five elapsed times, in this order:
  stemming every line of `filename` with `map`, then with `pmap`, then both
  again, and finally just reading the lines without stemming (the I/O
  baseline). Returns nil."
  []
  (let [;; `stem` returns a lazy sequence; without forcing it per line the
        ;; timings would only cover constructing the tokenizer chain and no
        ;; token would ever be stemmed.
        stem-fully (comp dorun stem)]
    (time (dorun (map stem-fully (lines-from-gzip filename))))
    (time (dorun (pmap stem-fully (lines-from-gzip filename))))
    (time (dorun (map stem-fully (lines-from-gzip filename))))
    (time (dorun (pmap stem-fully (lines-from-gzip filename))))
    (time (dorun (lines-from-gzip filename)))))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment