Skip to content

Instantly share code, notes, and snippets.

@aisamanra
Created January 28, 2011 08:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save aisamanra/799987 to your computer and use it in GitHub Desktop.
Save aisamanra/799987 to your computer and use it in GitHub Desktop.
;; guile-utf8.scm
;; A quickly hacked together set of functions to turn
;; utf-8-encoded text (which Guile sees as raw bytes) into
;; ASCII-encoded HTML, with a few other functions for
;; getting the code values out of a UTF-8 string.
;; anon-let is a hacky macro to create a new scope while
;; allowing definitions within it to bind in its enclosing
;; scope. It also allows define-local which only defines
;; within that scope. It is used here to close over constants
;; which needn't clutter the global namespace and to keep
;; small utility functions private.
(define-macro (anon-let clauses . body)
(define (forward body)
(map
(lambda (clause)
(if (list? (cadr clause))
`(define ,(caadr clause) #f)
`(define ,(cadr clause) #f)))
(filter
(lambda (clause)
(and (list? clause)
(equal? (car clause) 'define)))
body)))
(define (later body)
(map
(lambda (clause)
(cond ((and (list? clause)
(equal? (car clause) 'define-local))
`(define ,@(cdr clause)))
((or (not (list? clause))
(not (equal? (car clause) 'define)))
clause)
((list? (cadr clause))
`(set! ,(caadr clause)
(lambda ,(cdadr clause)
,@(cddr clause))))
(else
`(set! ,(cadr clause)
,@(cddr clause)))))
body))
`(begin
,@(forward body)
(let ,clauses
,@(later body))))
;; returns the first n characters of the string s
(define (string-take s n)
(substring s 0 n))
;; returns the string s without the first n characters
(define (string-drop s n)
(substring s n))
;; returns the integer value of the first character of s
(define (first-char s)
(char->integer (string-ref s 0)))
(anon-let
((_1b-mask #b+10000000) ;; All of these are masks used
(_mb-mask #b+11000000) ;; to identify whether a given byte
(_2b-mask #b+11100000) ;; begins a 1- to 4-byte codepoint
(_3b-mask #b+11110000) ;; or is is in the middle of a
(_4b-mask #b+11111000)) ;; codepoint.
;; These, in conjunction with the masks above, are used
;; to determine what part of a codepoint a given byte
;; constitutes.
(define-local (one-byte? s)
(= (logand (first-char s) _1b-mask) #b+00000000))
(define-local (two-byte? s)
(= (logand (first-char s) _2b-mask) #b+11000000))
(define-local (three-byte? s)
(= (logand (first-char s) _3b-mask) #b+11100000))
(define-local (four-byte? s)
(= (logand (first-char s) _4b-mask) #b+11110000))
(define-local (mid-byte? s)
(= (logand (first-char s) _mb-mask) #b+10000000))
;; Returns a list of strings where each (possibly multibyte)
;; string represents a single UTF-8 codepoint.
(define (ustring->list s)
(define (helper s l)
(cond ((equal? s "")
(reverse l))
((one-byte? s)
(helper (string-drop s 1)
(cons (string-take s 1) l)))
((two-byte? s)
(helper (string-drop s 2)
(cons (string-take s 2) l)))
((three-byte? s)
(helper (string-drop s 3)
(cons (string-take s 3) l)))
((four-byte? s)
(helper (string-drop s 4)
(cons (string-take s 4) l)))))
(helper s '()))
;; flips the bits in a single byte
(define-local (flip-byte x)
(logxor x #b+11111111))
;; Takes a string representing a single UTF-8 codepoint
;; and returns the integer representation of it. If the
;; string contains multiple codepoints, it returns the
;; last.
(define (uchar->integer s)
(define (helper s n)
(cond ((equal? s "")
n)
((one-byte? s)
(first-char s))
((two-byte? s)
(helper (string-drop s 1)
(logand (flip-byte _2b-mask)
(first-char s))))
((three-byte? s)
(helper (string-drop s 1)
(logand (flip-byte _3b-mask)
(first-char s))))
((four-byte? s)
(helper (string-drop s 1)
(logand (flip-byte _4b-mask)
(first-char s))))
((mid-byte? s)
(helper (string-drop s 1)
(logior (logand (flip-byte _mb-mask)
(first-char s))
(ash n 6))))))
(helper s 0))
;; Returns a list of integers representing the unicode
;; values of the characters in the UTF-8 string.
(define (ustring->intlist s)
(map uchar->integer (ustring->list s)))
;; Takes a UTF-8 string and converts all non-ASCII
;; codepoints to HTML escape sequences.
(define (utf-8-to-html s)
(string-join
(map (lambda (char)
(cond ((= char 10)
"<br/>")
((< char 128)
(string (integer->char char)))
(else
(format #f "&#~a;"
(number->string char)))))
(ustring->intlist s))
""))
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment