Skip to content

Instantly share code, notes, and snippets.

@youz
Created November 18, 2010 15:16
Show Gist options
  • Save youz/705102 to your computer and use it in GitHub Desktop.
Save youz/705102 to your computer and use it in GitHub Desktop.
htmlからDOMツリーっぽいリストを作成 (要 www-mode) #xyzzy
;;; -*- mode:lisp; package:dom -*-
(eval-when (:compile-toplevel :load-toplevel :execute)
(require "xml-http-request")
(require "www/www"))
(defpackage dom
(:use :lisp :editor))
(in-package "dom")
(export '(get-doc-from-url
get-doc-from-buffer
get-doc-from-file
tag attr contents
dumptree
find-node
find-node-by-tag
find-node-by-class)
)
(defvar *single-tags* '(:meta :link :img :input :br :hr :param :embed))
(defmacro accum (accfn &body body)
`(let (#1=#:gacc)
(flet ((,accfn (#2=#:ga) (push #2# #1#)))
,@body)
(nreverse #1#)))
(defmacro whenlet (var test &body body)
`(let ((,var ,test)) (when ,var ,@body)))
(defun make-tree (parsed)
(labels
((kw (sym) (intern (string-downcase (symbol-name sym)) "keyword"))
(singletagp (elm) (or (find (kw (car elm)) *single-tags*)
(string-match "/$" (symbol-name (car elm)))))
(rec (tag)
(list* (kw (car tag))
(mapcar #1=#'(lambda (attr) (cons (kw (car attr)) (cdr attr)))
(cadr tag))
(accum a
(loop for elm = (pop parsed) do
(cond ((null elm)
(a `(:unmatched-open-tag ((:tag ,(kw (car tag))))))
;(warn "unmatched open tag: <~A>~%" (car tag))
(loop-finish))
((stringp elm)
(when (string-match "[^ \t\n]" elm) (a elm)))
((consp elm)
(if (cadr (assoc 'close (cadr elm)))
(if (eq (car tag) (car elm))
(loop-finish)
(a `(:unmatched-close-tag ((:tag ,(kw (car elm)))))))
;(warn "unmatched close tag: </~A>~%" (car elm)))
(a (if (singletagp elm)
`(,(kw (car elm)) ,(mapcar #1# (cadr elm)))
(rec elm)))))
(t (warn "~S~%" elm))))))))
(when (setf parsed (member 'www::html parsed :key #'safe-car))
(rec (pop parsed)))))
(defun parse-html-string (html)
(save-excursion
(let ((buf (create-new-buffer "*temp*")))
(set-buffer buf)
(insert html)
(www::www-delete-comment)
(make-local-variable 'www::www-charset)
(setq www::www-charset (www::www-get-encode html))
(prog1
(make-tree (nreverse (www::www-parse-html)))
(delete-buffer buf)))))
(defun get-doc-from-url (url)
(let ((html (xhr:xhr-get url :key #'xhr:xhr-response-text :since :epoch)))
(when html
(parse-html-string html))))
(defun get-doc-from-buffer (buf)
(save-excursion
(set-buffer buf)
(parse-html-string (buffer-substring (point-min) (point-max)))))
(defun get-doc-from-file (path)
(save-excursion
(let ((buf (create-new-buffer "*html*")))
(set-buffer buf)
(setq need-not-save t)
(insert-file path)
(prog1 (get-doc-from-buffer buf)
(kill-buffer buf)))))
(defun dumptree (tree &optional (stream t) (indent 0))
(format t "~&~VT" (* indent 2))
(cond ((atom tree) (format stream "~S" tree))
((listp tree)
(format stream "(~A (~{(~{~A ~S~})~^ ~})" (car tree) (cadr tree))
(if (cddr tree)
(progn
(format stream "~%")
(dolist (child (cddr tree))
(dumptree child stream (1+ indent)))))
(format stream ")"))
(t (format stream "~S" tree))))
(defmacro tag (obj)
`(car ,obj))
(defmacro attr (obj key)
`(cadr (assoc ,key (cadr ,obj))))
(defmacro contents (obj)
`(cddr ,obj))
(defun tagp (obj)
(and (listp obj) (keywordp (tag obj))))
(defun find-node (tree pred &key (recursive t))
(accum a
(labels
((rec (obj)
(cond ((tagp obj)
(when (funcall pred obj)
(a obj))
(whenlet elms (and recursive (contents obj))
(mapc #'rec elms)))
((listp obj) (mapc #'rec obj))
((atom obj) nil))))
(rec tree))))
(defun find-node-by-tag (tree tag)
(find-node tree
(lambda (obj) (eql (car obj) tag))))
(defun find-node-by-class (tree pattern)
(find-node
tree
(lambda (obj)
(whenlet class (attr obj :class)
(string-match pattern class)))))
(provide "dom")
; (let ((moss-menu (get-html "http://www.mos.co.jp/menu/all/")))
; (find-node-by-class moss-menu "^heightLine-1"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment