Created
May 16, 2020 13:46
-
-
Save christophejunke/a0458d87ff7bd48c344c8deeade9c767 to your computer and use it in GitHub Desktop.
Map lines over an internal buffer - the callback must copy the string if needed, otherwise the underlying data changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(defpackage :map-lines (:use :cl)) | |
(in-package :map-lines) | |
(defparameter *max-line-size* 65536) | |
(defparameter *default-buffer-size* 16384) | |
(declaim (inline map-lines-on-shared-buffer)) | |
(define-condition internal-buffer-overflow (error) | |
((buffer :reader internal-buffer-overflow/buffer :initarg :buffer) | |
(variable :reader internal-buffer-overflow/variable :initarg :variable) | |
(size :reader internal-buffer-overflow/size :initarg :size))) | |
(defun map-lines-on-shared-buffer | |
(stream function &key (buffer-size *default-buffer-size*)) | |
"Call FUNCTION for each line read from STREAM, using an internal buffer. | |
FUNCTION should accept a single argument, a string which content is | |
only available while the callback is processing it. The same | |
underlying buffer is used for successive invocations of FUNCTION." | |
(check-type stream stream) | |
(check-type function (or function symbol)) | |
(check-type buffer-size fixnum) | |
(when (symbolp function) | |
(setf function (symbol-function function))) | |
(let ((view (make-array 0 | |
:element-type 'character | |
:adjustable t)) | |
(buffer (make-array buffer-size :element-type 'character)) | |
(buffer-start 0) ;; where to start filling the buffer from stream | |
(buffer-end 0) ;; where buffer filling from stream ended | |
(start 0) ;; start of current line within buffer (w.r.t. #\newline) | |
(end 0)) ;; end of current line within buffer | |
(declare (type function function) | |
(type string buffer) | |
(type fixnum buffer-end buffer-start start end) | |
(dynamic-extent buffer-end buffer-start start end)) | |
(flet ((callback (start end) | |
(funcall function | |
(adjust-array view | |
(- end start) | |
:displaced-to buffer | |
:displaced-index-offset start)))) | |
(declare (inline callback)) | |
(block nil | |
(tagbody | |
:buffer | |
;; Fill buffer with characters from the stream. When | |
;; nothing is read, we can exit the state machine. It is | |
;; however still possible that there are are characters | |
;; left between position zero and buffer-start: if so, | |
;; process that region with the callback function. | |
(setf buffer-end | |
(read-sequence buffer stream :start buffer-start)) | |
(when (= buffer-end buffer-start) | |
(when (plusp buffer-start) | |
(callback 0 buffer-start)) | |
(return)) | |
:next-line | |
;; Try to find the end of line. If we reach the end of the | |
;; buffer, go instead to :COPY, which shifts the content on | |
;; the "left" to make room for more characters to buffer. | |
(setf end | |
(or (position #\newline | |
buffer | |
:test #'char= | |
:start start | |
:end buffer-end) | |
(go :copy))) | |
;; The whole line fits in the buffer, from start to end. | |
(callback start end) | |
(setf start (1+ end)) | |
(go :next-line) | |
:copy | |
;; not enough room, copy the latest line fragment (the one | |
;; being currently read) at the beginning of the buffer. If | |
;; we already are at the beginning (ZEROP START), then we | |
;; try to extend to buffer instead. | |
(when (zerop start) | |
(go :extend)) | |
(replace buffer buffer :start2 start :end2 buffer-end) | |
(setf buffer-start (- buffer-end start)) | |
(setf start 0) | |
;; We shifted the current line on the left, because we did | |
;; not yet found a newline. Buffer more characters. | |
(go :buffer) | |
:extend | |
(let ((size (array-total-size buffer))) | |
(setf buffer | |
(adjust-array | |
buffer | |
(if (>= size *max-line-size*) | |
(restart-case | |
(error 'internal-buffer-overflow | |
:buffer buffer | |
:variable '*max-line-size* | |
:size size) | |
(ignore () | |
:report "Ignore the limit and extend the buffer." | |
(* size 2))) | |
(min *max-line-size* (* size 2))))) | |
(setf buffer-start size) | |
(go :buffer)))))))) |
I tried this code on my log file.
It takes 180ms to loop through lines when counting them:
(defun read-lines-4 (filename)
(with-open-file (in filename)
(let ((count 0))
(declare (type fixnum count))
(map-lines-on-shared-buffer
in
(lambda (s)
(declare (ignorable s))
(incf count)))
count)))
And 300-400ms when collecting strings:
(defun read-lines-4 (filename)
(with-open-file (in filename)
(uiop:while-collecting (collect-string)
(map-lines-on-shared-buffer
in
(lambda (s)
(collect-string (copy-seq s)))))))
Thanks for your (time)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
See https://github.com/christophejunke/map-lines