Skip to content

Instantly share code, notes, and snippets.

@ktakashi
Created June 25, 2012 18:15
Show Gist options
  • Save ktakashi/2990302 to your computer and use it in GitHub Desktop.
Save ktakashi/2990302 to your computer and use it in GitHub Desktop.
Levenshtein distance
#|
Calculate levenshtein distance network.
|#
(defvar *distance* 1)
(defun levenshtein-distance (word1 word2)
(declare (optimize (safety 0) (speed 3) (debug 0))
(type simple-string word1 word2))
(flet ((make-table (word1 word2)
(declare (optimize (safety 0) (speed 3) (debug 0))
(type string word1 word2))
(let* ((x (length word1))
(y (length word2))
(table (make-array (list (+ x 1) (+ y 1)))))
(declare (type fixnum x y))
(dotimes (i x)
(declare (type fixnum i))
(setf (aref table (+ i 1) 0) (+ i 1)))
(dotimes (i y)
(declare (type fixnum i))
(setf (aref table 0 (+ i 1)) (+ i 1)))
table)))
(let ((d (make-table word1 word2))
(w1 (length word1))
(w2 (length word2)))
(declare (type fixnum w1 w2))
(dotimes (i w1)
(declare (type fixnum i))
(dotimes (j w2)
(declare (type fixnum j))
(if (eql (aref word1 i) (aref word2 j))
(setf (aref d (+ i 1) (+ j 1)) (aref d i j))
(setf (aref d (+ i 1) (+ j 1))
(let ((x (aref d i (+ j 1)))
(y (aref d (+ i 1) j))
(z (aref d i j)))
(declare (type fixnum x y z))
(let ((v (min (+ x 1) (+ y 1) (+ z 1))))
(declare (type fixnum v))
v))))))
(aref d w1 w2))))
(defun levenshtein-search (word word-list previous)
(declare (optimize (safety 0) (speed 3) (debug 0))
(type simple-string word))
(flet ((check (s1 s2)
(declare (optimize (safety 0) (speed 3) (debug 0))
(type simple-string s1 s2))
(let ((v (abs (- (length s1) (length s2)))))
(declare (type fixnum v *distance*))
(or (= v 0) (= v *distance*)))))
(let ((r nil))
(loop :for list :of-type simple-string :in word-list
:do (let* ((precheck (and (check word list)
(not (gethash list previous nil))))
(w (if precheck
(levenshtein-distance word list)
-1)))
(declare (type fixnum w *distance*))
(if (= w *distance*)
(setf r (cons (cons word list) r)))))
r)))
(defun levenshtein (seed word-list previous)
(declare (optimize (safety 0) (speed 3) (debug 0)))
(loop :for word :of-type simple-string :in seed
:append (levenshtein-search word word-list previous)))
(defun file->string-list (file)
(with-open-file (in file :direction :input)
(let ((r nil)
(line nil))
(loop :while (setq line (read-line in nil))
:do (setf r (cons line r)))
(reverse r))))
(defun print-network (generation network)
(format t "~d generation (~d)~%" generation (length network))
(loop :for s :in network
:do (format t "~a:~a~%" (car s) (cdr s))))
(defun split-at (list count)
(loop
:for rest :on list
:repeat count
:collect (car rest) :into left
:finally (return (values left rest))))
(defun drop (xs n)
(loop :repeat (1+ n )
:for x :on xs
:finally (return x)))
(defun show-network (word data-file &key max-generation threads)
(declare (type simple-string word data-file))
(let* ((word-list (file->string-list data-file))
(first (list word))
(previous (make-hash-table :test #'equalp))
(seed first)
(network nil))
;; set first word to previous
(setf (gethash word previous) t)
(flet ((runner (creater retriever)
(let ((generation 1) (ts (funcall creater)))
(loop :while (let ((r (funcall retriever ts)))
(cond ((and max-generation
(= max-generation generation))
(print-network generation r)
nil)
(t (setf network r) network)))
:do (progn
(print-network generation network)
(setf seed (mapcar #'cdr network)
generation (+ 1 generation))
(loop :for p :in seed :do (setf (gethash p previous) t))
(setf ts (funcall creater)))))))
#+sbcl
(if (and threads (> threads 1))
(flet ((split-zip (word-list count)
(declare (type fixnum count))
(let ((count (ceiling (/ (length word-list) count))))
(declare (type fixnum count))
(loop :for rest :on word-list :by (lambda (l) (drop l count))
:collect (multiple-value-bind (left)
(split-at rest count)
left))))
(create-thread (seed wlist prev)
(sb-thread:make-thread
(lambda ()
(levenshtein seed wlist prev)))))
(let ((zip (split-zip word-list threads)))
(flet ((create-threads ()
(loop :for z :in zip
:collect (create-thread seed z previous))))
(runner #'create-threads
#'(lambda (ts)
(loop :for th :in ts
:append (sb-thread:join-thread th)))))))
(runner #'(lambda () (levenshtein seed word-list previous))
#'(lambda (r) r)))
#+(not sbcl)
(runner #'(lambda () (levenshtein seed word-list previous))
#'(lambda (r) r)))))
#|
To run
(show-network "your word" "your word list file")
|#
;;
;; Works on Sagittarius 0.3.4 alpha
;; can be 0.3.3 if you change (match rest ...) to (match (reverse! rest) ...)
;;
(import (rnrs)
(match)
(getopt)
(shorten)
(util file)
(srfi :1 lists)
(srfi :18 multithreading)
(srfi :26 cut)
(srfi :39 parameters)
(sagittarius control))
(define *distance* (make-parameter 1))
;; 2 demension table version
(define (levenshtein-distance word1 word2)
(define (table-ref t i j)
(vector-ref (vector-ref t i) j))
(define (table-set! t i j v)
(vector-set! (vector-ref t i) j v))
(define (make-table word1 word2)
(let1 table (make-vector (+ (string-length word1) 1) 0)
(dotimes (i (+ (string-length word1) 1))
(vector-set! table i (make-vector (+ (string-length word2) 1) 0)))
(dotimes (i (string-length word1))
(table-set! table (+ i 1) 0 (+ i 1)))
(dotimes (i (string-length word2))
(table-set! table 0 (+ i 1) (+ i 1)))
table))
(let ((d (make-table word1 word2))
(word1-size (string-length word1))
(word2-size (string-length word2)))
(dotimes (i word1-size)
(dotimes (j word2-size)
(if (char=? (string-ref word1 i) (string-ref word2 j))
(table-set! d (+ i 1) (+ j 1) (table-ref d i j))
(table-set! d (+ i 1) (+ j 1)
(min (+ (table-ref d i (+ j 1)) 1)
(+ (table-ref d (+ i 1) j) 1)
(+ (table-ref d i j) 1))))))
(table-ref d word1-size word2-size)))
(define (levenshtein seed word-list distance :key (check #f) (previous '()))
(define (search word word-list)
(let loop ((word-list word-list) (r '()))
(if (null? word-list)
(reverse! r)
(let1 w (and (or (not check)
(check word (car word-list)))
(not (hashtable-ref previous (car word-list) #f))
(levenshtein-distance word (car word-list)))
(loop (cdr word-list) (if (and w (= w distance))
(acons word (car word-list) r)
r))))))
(let1 net (map (cut search <> word-list) seed)
(let loop ((net net) (r '()))
(if (null? net)
r
(loop (cdr net) (append r (car net)))))))
(define (show-network word data-file :key (max-generation #f)
(threads 5))
(define (show generation network)
(print generation " generations (" (length network) ")")
(let ((sorted (list-sort (^(a b) (string<=? (car a) (car b))) network))
(current #f))
(for-each (^s (when (or (not current) (not (string=? current (car s))))
(set! current (car s)))
(print current ": " (cdr s)))
network))
(newline))
(define (create-thread seed word-list prev check distance)
(let1 t (make-thread
(lambda () (levenshtein seed word-list distance
:check check
:previous prev)))
(thread-start! t)
t))
(define (split-zip word-list count)
(let1 count (div (length word-list) count)
(let loop ((r '())
(current word-list))
(if (< (length current) count)
(if (null? current)
r
(cons current r))
(let*-values (((new rest) (split-at current count)))
(loop (cons new r) rest))))))
(let* ((word-list (file->string-list data-file))
(first (list word))
(previous (make-string-hashtable))
(seed first)
(distance (*distance*)))
;; not to caluculate obvious case
(define (check word1 word2)
(let1 v (abs (- (string-length word1) (string-length word2)))
(or (zero? v) (= distance v))))
(define (runner thread-creater result-retriever)
(let loop ((generations 1)
(ts (thread-creater)))
(let1 r (result-retriever ts)
(set! seed (map cdr r))
(for-each (cut hashtable-set! previous <> #t) seed)
(show generations r)
(unless (or (and max-generation
(>= generations max-generation))
(null? r))
(loop (+ generations 1) (thread-creater))))))
(hashtable-set! previous word #t)
(if (and threads (> threads 1))
(let ((zip (split-zip word-list threads)))
(define (create-threads)
(map (cut create-thread seed <> previous check distance) zip))
(runner create-threads
(lambda (ts)
(let loop ((r '()) (ts ts))
(if (null? ts)
r
(loop (append (thread-join! (car ts)) r)
(cdr ts)))))))
(runner (cut levenshtein seed word-list distance
:check check
:previous previous)
(^r r)))))
(define (usage name)
(print name " word data")
(exit 1))
(define (main args)
(with-args args
((check? (#\c "check") #f #f)
(generation (#\g "generation") #t "")
(threads (#\t "threads") #t "5")
. rest)
(if check?
(print (levenshtein-distance (car rest) (cadr rest)))
(match rest
((_ word in-file)
(show-network word in-file
:max-generation (string->number generation)
:threads (string->number threads)))
(_ (usage (car args)))))))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment