Skip to content

Instantly share code, notes, and snippets.

@vseloved
Created April 19, 2017 14:41
Show Gist options
  • Save vseloved/85408fff4e4559555665e3acd906a0e8 to your computer and use it in GitHub Desktop.
Save vseloved/85408fff4e4559555665e3acd906a0e8 to your computer and use it in GitHub Desktop.
Unfinished code for NNSE calculation
(in-package #:nlp.embeddings)
(named-readtables:in-readtable rutilsx-readtable)
(eval-always
(rename-package "BKNR.SKIP-LIST" "BKNR.SKIP-LIST" '("SKLIST")))
(defun gather-freq-dict (vecs dir &key (cutoff 0) (dump-file "/tmp/dict.txt"))
(let ((dict #h(equal))
(idxs #h(equal))
(total 0)
(i -1))
(dolist (file (uiop:directory-files dir))
(dolines (line file)
(dolist (word (split #\Space line))
(:+ (get# (normalize vecs word) dict 0))
(:+ total))))
(rem# nil dict)
(format *debug-io* "Freq dict - total words: ~A, distinct words: ~A, ~
after frequency cutoff (~A): ~A~%"
total (ht-count dict) cutoff
(dotable (word freq dict
(ht-count dict))
(when (<= freq cutoff)
(rem# word dict))))
(dotable (word freq dict)
(:= (? idxs word) (:+ i)))
(when dump-file
(with-out-file (out dump-file)
(format out "~A~%" total)
(dotable (word idx idxs)
(format out "~A ~A ~A~%" word idx (? dict word)))))
(values dict
idxs
total)))
(defun gather-cooc-mat (vecs dir dict idxs total
&key (window 10) (weighting 'identity)
(dump-file "/tmp/cooc.txt"))
(let ((coocs (make-array (ht-count dict) :initial-contents
(maptimes (ht-count dict)
^(make 'sklist:skip-list))))
(min most-positive-fixnum)
(max 0)
(total-neighbours 0)
(cc -1))
(dolist (file (uiop:directory-files dir))
(dolines (line file)
(when (zerop (rem (:+ cc) 100000)) (princ "."))
(let ((words (coerce (remove nil
(mapcar ^(? idxs (normalize vecs %))
(split #\Space line)))
'vector)))
(dotimes (i (length words))
(let ((cooc (? coocs (? words i))))
(iter (:for j :from (max 0 (- i window))
:to (min (1- (length words))))
(unless (= i j)
(sklist:skip-list-insert
cooc j (1+ (or (sklist:skip-list-search cooc j)
0))))))))))
(when dump-file
(with-out-file (out dump-file)
(dovec (cooc coocs)
(let (cur)
(sklist:map-skip-list ^(push (fmt "~A:~A" % %%) cur)
cooc)
(format out "~{~A~^ ~}~%" cur)))))
(dovec (cooc coocs)
(let ((count (sklist:skip-list-length cooc)))
(when (< count min) (:= min count))
(when (> count max) (:= max count))
(:+ total-neighbours count)))
(format *debug-io* "Coocurences - max: ~A, min: ~A, mean: ~$~%"
max min (/ total-neighbours (length coocs)))
coocs))
(defun calc-ppmi-mat (total dict idxs coocs)
(let ((freqs #h())
(ppmis (make-array (length coocs))))
(dotable (word idx idxs)
(:= (? freqs idx) (? dict word)))
(dotable (_ idx idxs)
(with ((cooc (? coocs idx))
(ppmi #h()))
(sklist:map-skip-list ^(:= (? ppmi %)
(if (= % idx) 0
(max (log (/ (* %% total)
(* (? freqs idx)
(? freqs %)))
2)
0)))
cooc)
(:= (? ppmis idx) ppmi)))
ppmis))
;;; SVD
(defclass sparse-mat ()
((data :initarg :data :accessor mat-data)))
(defmethod fsvd:height-of ((mat sparse-mat) &key densep)
(declare (ignore densep))
(length @mat.data))
(defmethod fsvd:width-of ((mat sparse-mat) &key densep)
(declare (ignore densep))
(length @mat.data))
(defmethod fsvd:size-of ((mat sparse-mat))
(let ((rez 0))
(dovec (ht @mat.data)
(:+ rez (ht-count ht)))
rez))
(defmethod fsvd:map-matrix (fn (mat sparse-mat))
(let ((h (fsvd:height-of mat))
(w (fsvd:width-of mat))
(cnt -1))
(loop :for i :below h :do
(let ((row (aref (slot-value mat 'data) i)))
(when (zerop (rem i 1000)) (princ "."))
(loop :for j :below w :do
(when-it (get# j row)
(call fn i j it (:+ cnt))))))))
(defmethod fsvd:do-matrix-macro-name ((mat sparse-mat))
'do-sparse-mat)
(defmacro do-sparse-mat (((i j val dense-index) mat)
&body body)
(with ((width (gensym))
(matrix (gensym))
(row (gensym))
(declarations body (fsvd::split-body body)))
`(let* ((,matrix ,mat)
(,dense-index 0)
(,width (the fixnum (fsvd:width-of ,matrix))))
(declare (type (integer 0 #.(1- most-positive-fixnum)) ,dense-index))
(dotimes (,i (the fixnum (fsvd:height-of ,matrix)))
(when (zerop (rem ,i 1000)) (princ "."))
(let ((,row (aref (slot-value ,matrix 'data) ,i)))
(dotimes (,j ,width)
(let ((,val (get# ,j ,row)))
,@declarations
(when ,val
,@body
(:+ ,dense-index)))))))))
;;; NNSE
(defclass nnse (mem-vecs)
()
(:documentation
"NNSE word embeddings."))
(defun nnse-cost (mat a d)
(mat:nrm2 (mat:m- mat
(mat:m* a d))))
(defun col-unit-norm! (mat)
"Rescale each column of MAT to unit L2 norm."
(dotimes (j (? (mat:mat-dimensions mat) 1))
(let ((total 0))
(dotimes (i (? (mat:mat-dimensions mat) 0))
(:+ total (expt (mat:mref mat i j) 2)))
(dotimes (i (? (mat:mat-dimensions mat) 0))
(:/ (mat:mref mat i j) (sqrt total)))))
mat)
(defun row-unit-norm! (mat)
"Rescale each row of MAT to unit L2 norm."
(dotimes (i (? (mat:mat-dimensions mat) 0))
(let ((total 0))
(dotimes (j (? (mat:mat-dimensions mat) 1))
(:+ total (expt (mat:mref mat i j) 2)))
(dotimes (j (? (mat:mat-dimensions mat) 1))
(:/ (mat:mref mat i j) total))))
mat)
(defun calc-nnsc (mat k &key (sparsity 0.7) (inner-steps 100)
(ep 1e-3) (max-steps 1000))
(with ((step 0)
(d (? (mat:mat-dimensions mat) 0))
(n (? (mat:mat-dimensions mat) 1))
(W (mat:make-mat (list d k)))
(H (mat:make-mat (list k n)))
(cost 0)
(prev 0))
(dotimes (i k)
(dotimes (j n)
(:= (mat:mref H i j) (+ 1e-10 (random 0.1)))))
(:= cost (nnse-cost mat W H))
;; optimize
(loop :while (and (< step max-steps)
(> (abs (- cost prev)) ep)) :do
(format *debug-io* "Step ~A cost: ~A~%" step cost)
(dotimes (i d)
(dotimes (j k)
(:= (mat:mref W i j) 1)));(+ 1e-10 (random 0.1)))))
;; initial A optimization
(loop :repeat inner-steps :do
(mat:.*! (mat:.*! (mat:m* mat H :transpose-b? t)
(mat:.expt! (mat:m* (mat:m* W H)
H :transpose-b? t)
-1))
W))
;; enforce sparcity constraints
(dotimes (j k)
(let (vals)
(dotimes (i d)
(push (pair i (mat:mref W i j)) vals))
(:= vals (coerce (sort vals '< :key 'rt) 'vector))
(dotimes (i (floor (* sparsity d)))
(:= (mat:mref W (? vals i 0) j) 0))))
;; stepwise W & H optimization
(loop :repeat inner-steps :do
(let ((div (mat:m* W
(mat:m* H H
:transpose-b? t))))
(dotimes (i d)
(dotimes (j k)
(when (zerop (mat:mref div i j))
(:= (mat:mref div i j) 1))))
(mat:.*! (mat:.*! (mat:m* mat H :transpose-b? t)
(mat:.expt! div -1))
W))
(mat:.*! (mat:.*! (mat:m* W mat :transpose-a? t)
(mat:.expt! (mat:m* (mat:m* W W :transpose-a? t)
H)
-1))
H)
(print (list W H)))
;; (mat:axpy! (- learning-rate)
;; (mat:m* (mat:m- (mat:m* a d)
;; mat)
;; d
;; :transpose-b? t)
;; a)
;; (dotimes (i (? (mat:mat-dimensions mat) 0))
;; (dotimes (j dims)
;; (when (minusp (mat:mref a i j))
;; (:= (mat:mref a i j) 0))))
;; (col-unit-norm! a)
;; (mat:axpy! (- learning-rate)
;; (mat:m* a
;; (mat:m- (mat:m* a s)
;; mat)
;; :transpose-a? t)
;; s)
;; (dotimes (i dims)
;; (dotimes (j (? (mat:mat-dimensions mat) 1))
;; (when (minusp (mat:mref s i j))
;; (:= (mat:mref s i j) 0))))
;; (row-unit-norm! s)
;; (mat:axpy! learning-rate
;; (mat:m- (mat:m* a mat :transpose-a? t)
;; (mat:.+! la
;; (mat:m* (mat:m* a a :transpose-a? t)
;; d)))
;; d)
;; (mat:.*! (mat:.*! (mat:m* a mat :transpose-a? t)
;; (mat:.expt! (mat:.+! la
;; (mat:m* (mat:m* a a :transpose-a? t)
;; d))
;; -1))
;; d)
;; (row-unit-norm! d)
(:+ step)
(:= prev cost
cost (nnse-cost mat W H)))
(format *debug-io* "Final cost: ~A~%" cost)
(values W
H)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment