Skip to content

Instantly share code, notes, and snippets.

@llibra
Created September 1, 2011 08:14
Show Gist options
  • Save llibra/1185691 to your computer and use it in GitHub Desktop.
Save llibra/1185691 to your computer and use it in GitHub Desktop.
Sunday Quick Search
;; char-code-limit - 1の大きさの配列を表として使うバージョン
;; 0x10ffffという巨大な表を作るため、空間効率が非常に悪い
;; 表を外部に出せば、同じパターンを繰り返し検索する場合には効果的と思われるが、
;; 検索する回数が少ないと元を取れない
(defun quick-search/array (string-x string-y)
(declare (optimize speed (debug 0) (safety 0))
(type simple-string string-x string-y))
(let* ((length-x (length string-x))
(length-y (length string-y))
(boundary (- length-y length-x)))
(declare (type fixnum length-x length-y))
(labels ((compare (start)
(dotimes (n length-x t)
(unless (eql (aref string-x n) (aref string-y (+ start n)))
(return nil))))
(make-shift-table ()
(do ((n 0 (1+ n))
(table (make-array #.(1- char-code-limit)
:element-type 'fixnum
:initial-element (1+ length-x))))
((= n length-x) table)
(setf (aref table (char-code (aref string-x n)))
(- length-x n))))
(shift-length (char table)
(aref table (char-code char))))
(do* ((table (make-shift-table))
(n 0 (+ n (shift-length (aref string-y next) table)))
(next (+ n length-x) (+ n length-x)))
((> n boundary) nil)
(declare (type fixnum n))
(when (compare n) (return n))))))
;; ハッシュテーブルを表として使うバージョン
;; メモリ消費量は少ないが、実行時のハッシュテーブル参照にコストがかかる
(defun quick-search/hash (string-x string-y)
(declare (optimize speed (debug 0) (safety 0))
(type simple-string string-x string-y))
(let* ((length-x (length string-x))
(length-y (length string-y))
(boundary (- length-y length-x)))
(declare (type fixnum length-x length-y))
(labels ((compare (start)
(dotimes (n length-x t)
(unless (eql (aref string-x n) (aref string-y (+ start n)))
(return nil))))
(make-shift-table ()
(do ((n 0 (1+ n))
(table (make-hash-table)))
((= n length-x) table)
(setf (gethash (char-code (aref string-x n)) table)
(- length-x n))))
(shift-length (char table)
(let* ((code (char-code char))
(value (gethash code table)))
(the fixnum (if value value (1+ length-x))))))
(do* ((table (make-shift-table))
(n 0 (+ n (shift-length (aref string-y next) table)))
(next (+ n length-x) (+ n length-x)))
((> n boundary) nil)
(declare (type fixnum n))
(when (compare n) (return n))))))
;; 最下位オクテットだけに注目した表を作るバージョン
;; 配列を使うが、大きさは256なのでメモリ消費が少なく、参照のコストも低い
;; シフトするときに本来より移動量が少なくなる可能性がある折衷案
(defun quick-search/fuzzy (string-x string-y)
(declare (optimize speed (debug 0) (safety 0))
(type simple-string string-x string-y))
(let* ((length-x (length string-x))
(length-y (length string-y))
(boundary (- length-y length-x)))
(declare (type fixnum length-x length-y))
(labels ((compare (start)
(dotimes (n length-x t)
(unless (eql (aref string-x n) (aref string-y (+ start n)))
(return nil))))
(lowest-octet (i)
(logand i #xff))
(make-shift-table ()
(do ((n 0 (1+ n))
(table (make-array 256 :element-type 'fixnum
:initial-element (1+ length-x))))
((= n length-x) table)
(let ((code (char-code (aref string-x n))))
(setf (aref table (lowest-octet code))
(- length-x n)))))
(shift-length (char table)
(aref table (lowest-octet (char-code char)))))
(do* ((table (make-shift-table))
(n 0 (+ n (shift-length (aref string-y next) table)))
(next (+ n length-x) (+ n length-x)))
((> n boundary) nil)
(declare (type fixnum n))
(when (compare n) (return n))))))
;; 総当たり
(defun brute-force (string-x string-y)
(declare (optimize speed (debug 0) (safety 0))
(type simple-string string-x string-y))
(let* ((length-x (length string-x))
(length-y (length string-y))
(boundary (- length-y length-x)))
(declare (type fixnum length-x length-y))
(labels ((compare (start)
(dotimes (n length-x t)
(unless (eql (aref string-x n) (aref string-y (+ start n)))
(return nil)))))
(dotimes (n (1+ boundary) nil)
(when (compare n) (return n))))))
;; SBCL 1.0.50での計測結果
;; 郵便番号データ(http://www.post.japanpost.jp/zipcode/download.html)
;; UTF-8/LFに変換したものをサンプルデータに使う
(defparameter *csv*
(with-open-file (s "13tokyo.csv" :external-format :utf-8)
(let ((buf (make-string (file-length s))))
(read-sequence buf s)
buf)))
;; 短い文字列の検索
(progn
(time (loop for n below 100
maximize (quick-search/array "三宅島" *csv*)))
(time (loop for n below 100
maximize (quick-search/hash "三宅島" *csv*)))
(time (loop for n below 100
maximize (quick-search/fuzzy "三宅島" *csv*)))
(time (loop for n below 100
maximize (brute-force "三宅島" *csv*)))
(time (loop for n below 100
maximize (search "三宅島" *csv*))))
#|
Evaluation took:
1.015 seconds of real time
0.984375 seconds of total run time (0.703125 user, 0.281250 system)
[ Run times consist of 0.310 seconds GC time, and 0.675 seconds non-GC time. ]
96.95% CPU
2,375,367,484 processor cycles
445,645,600 bytes consed
Evaluation took:
0.563 seconds of real time
0.546875 seconds of total run time (0.546875 user, 0.000000 system)
97.16% CPU
1,333,861,536 processor cycles
65,464 bytes consed
Evaluation took:
0.125 seconds of real time
0.125000 seconds of total run time (0.125000 user, 0.000000 system)
100.00% CPU
285,249,090 processor cycles
95,952 bytes consed
Evaluation took:
0.312 seconds of real time
0.312500 seconds of total run time (0.312500 user, 0.000000 system)
100.00% CPU
722,638,280 processor cycles
0 bytes consed
Evaluation took:
1.391 seconds of real time
1.390625 seconds of total run time (1.390625 user, 0.000000 system)
100.00% CPU
3,223,313,660 processor cycles
0 bytes consed
|#
;; 長い文字列の検索
(progn
(time (loop for n below 100
maximize (quick-search/array "江戸川(1〜3丁目、4丁目1〜14番)"
*csv*)))
(time (loop for n below 100
maximize (quick-search/hash "江戸川(1〜3丁目、4丁目1〜14番)"
*csv*)))
(time (loop for n below 100
maximize (quick-search/fuzzy "江戸川(1〜3丁目、4丁目1〜14番)"
*csv*)))
(time (loop for n below 100
maximize (brute-force "江戸川(1〜3丁目、4丁目1〜14番)"
*csv*)))
(time (loop for n below 100
maximize (search "江戸川(1〜3丁目、4丁目1〜14番)" *csv*))))
#|
Evaluation took:
0.953 seconds of real time
0.953125 seconds of total run time (0.703125 user, 0.250000 system)
[ Run times consist of 0.340 seconds GC time, and 0.614 seconds non-GC time. ]
100.00% CPU
2,242,954,602 processor cycles
445,645,600 bytes consed
Evaluation took:
0.094 seconds of real time
0.093750 seconds of total run time (0.093750 user, 0.000000 system)
100.00% CPU
202,415,262 processor cycles
65,400 bytes consed
Evaluation took:
0.015 seconds of real time
0.015625 seconds of total run time (0.015625 user, 0.000000 system)
106.67% CPU
42,310,968 processor cycles
96,216 bytes consed
Evaluation took:
0.172 seconds of real time
0.171875 seconds of total run time (0.171875 user, 0.000000 system)
100.00% CPU
406,886,326 processor cycles
0 bytes consed
Evaluation took:
0.828 seconds of real time
0.812500 seconds of total run time (0.812500 user, 0.000000 system)
98.07% CPU
1,931,473,950 processor cycles
0 bytes consed
|#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment