(ns sciloj.smile-nlp-example
  (:require [clojure.set]
            [clojure.string :as str]
            [pppmap.core :as ppp]
            [tablecloth.api :as tc]
            [tablecloth.api.split :as split])
  (:import smile.classification.Maxent
           smile.nlp.normalizer.SimpleNormalizer
           smile.nlp.stemmer.PorterStemmer
           [smile.nlp.tokenizer SimpleSentenceSplitter SimpleTokenizer]
           [smile.validation Accuracy ConfusionMatrix]))

;; Converts the token counts into the format Smile's Maxent expects:
;; a native int array in which every entry is an index into the vocabulary.
;; If an index is present, the corresponding token occurs in the text.
(defn bow->sparse-indices [bow vocab->index-map]
  (->>
   (merge-with
    (fn [index count]
      [index count])
    vocab->index-map
    bow)
   vals
   (filter vector?)
   (map first)
   (into-array Integer/TYPE)))
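
;; A minimal usage sketch with made-up values: only tokens that appear in
;; both the bag of words and the vocabulary keep their vocabulary index.
(comment
  (vec (bow->sparse-indices {"good" 2 "taste" 1}
                            {"good" 0 "bad" 1 "food" 2}))
  ;; => [0]  ("good" sits at index 0; "taste" is not in the vocabulary)
  )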

;; Converts text into token counts (a map of token -> count).
(defn default-text->bow [text]
  (let [normalizer (SimpleNormalizer/getInstance)
        tokenizer (SimpleTokenizer.)
        sentence-splitter (SimpleSentenceSplitter/getInstance)
        stemmer (PorterStemmer.)]
    (->> text
         (.normalize normalizer)
         (.split sentence-splitter)
         (map #(.split tokenizer %))
         (map seq)
         flatten
         (remove nil?)
         (map #(.stem stemmer %))
         (map str/lower-case)
         frequencies)))
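
;; Rough illustration of the normalize/tokenize/stem/count pipeline; the
;; exact tokens depend on Smile's tokenizer and the Porter stemmer.
(comment
  (default-text->bow "Great taste great price")
  ;; => roughly {"great" 2, "tast" 1, "price" 1}
  )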

;;; Takes the top n occurring terms across all token-frequency tables
;;; (the bow column) and builds a global vocabulary from them.
(defn ->vocabulary-top-n [ds bow-col n]
  (let [vocabulary
        (->>
         (apply merge-with + (get ds bow-col))
         (sort-by second)
         reverse
         (take n)
         keys)
        vocab->index-map (zipmap vocabulary (range))]
    {:vocab vocabulary
     :vocab->index-map vocab->index-map
     :index->vocab-map (clojure.set/map-invert vocab->index-map)}))
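
;; Sketch with a tiny hypothetical in-memory dataset: the term counts are
;; summed over all rows before the top n terms are taken.
(comment
  (->vocabulary-top-n
   (tc/dataset {:bow [{"good" 3 "bad" 1} {"good" 1 "food" 2}]})
   :bow 2)
  ;; => {:vocab ("good" "food")
  ;;     :vocab->index-map {"good" 0, "food" 1}
  ;;     :index->vocab-map {0 "good", 1 "food"}}
  )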

;; Vectorizes the text via text->bow-fn.
(defn count-vectorize [ds text-col bow-col text->bow-fn]
  (tc/add-or-replace-column ds bow-col
                            (fn [ds]
                              (ppp/ppmap-with-progress
                               "text->bow"
                               1000
                               text->bow-fn
                               (get ds text-col)))))

;; Converts the bow column into a sparse-indices column.
(defn bow->sparse-array [ds bow-col indices-col vocab->index-map]
  (tc/add-or-replace-column ds
                            indices-col
                            (fn [ds]
                              (ppp/ppmap-with-progress
                               "bow->sparse"
                               1000
                               #(bow->sparse-indices % vocab->index-map)
                               (get ds bow-col)))))

;; Creates the final native arrays for Maxent. into-array over rows of int[]
;; (from bow->sparse-indices) yields an int[][], so no type hints are needed.
(defn train-test-arrays [train-test-split bow-sparse target]
  (let [train-array
        (into-array
         (get (train-test-split :train) bow-sparse))
        test-array
        (into-array
         (get (train-test-split :test) bow-sparse))
        train-score-array
        (into-array
         Integer/TYPE
         (get (train-test-split :train) target))
        test-score-array
        (into-array
         Integer/TYPE
         (get (train-test-split :test) target))]
    {:x-train train-array
     :x-test test-array
     :y-train train-score-array
     :y-test test-score-array}))
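
;; Illustrative shape check, assuming the train-test-split defined in the
;; user code below: :x-train/:x-test are int[][] of sparse term indices per
;; document, :y-train/:y-test are int[] of labels.
(comment
  (let [{:keys [x-train y-train]} (train-test-arrays train-test-split
                                                     :bow-sparse :Score)]
    [(alength x-train) (alength y-train)])
  ;; => two equal counts, one entry per training document
  )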

;;; Everything above is a small reusable library.
;; -----------------------------------------
;; user code

;; Load the reviews, keep text and score, and turn each text into a bag of words.
(def reviews
  (->
   (tc/dataset "./Reviews.csv" {:key-fn keyword})
   (tc/select-columns [:Text :Score])
   (tc/drop-missing)
   (tc/head 10000)
   (count-vectorize :Text :bow default-text->bow)))

;; Global vocabulary of the 10000 most frequent terms.
(def vocabulary
  (->vocabulary-top-n reviews :bow 10000))

;; Sparse vocabulary indices per review.
(def reviews
  (bow->sparse-array reviews :bow :bow-sparse (:vocab->index-map vocabulary)))

(def train-test-split (first (split/split reviews :holdout {:ratio 0.3})))

;; p is the number of features, i.e. the vocabulary size.
(def p (count (vocabulary :vocab->index-map)))

(def t-t-arrays (train-test-arrays train-test-split :bow-sparse :Score))

;; Train the maximum entropy classifier and predict on the held-out data.
(def maxent (Maxent/multinomial p (:x-train t-t-arrays) (:y-train t-t-arrays)))

(def predictions (.predict maxent (:x-test t-t-arrays)))

;; Smile's validation metrics take the truth first, then the predictions.
(println
 (ConfusionMatrix/of (:y-test t-t-arrays) predictions))

(println
 (Accuracy/of (:y-test t-t-arrays) predictions))