Skip to content

Instantly share code, notes, and snippets.

@html
Last active November 8, 2017 04:47
Show Gist options
  • Save html/5283629 to your computer and use it in GitHub Desktop.
Save html/5283629 to your computer and use it in GitHub Desktop.
Utilities I use for parsing, it depends on "php funcs for cl" https://gist.github.com/html/4707958
(defvar *cache-dir* '(:relative "cache"))
(defvar *cache-enabled-p* nil)
(defvar *drakma-request-max-tries* nil) ;
(defvar *character-for-wrong-utf-8-chars* #\@)
; V5
(defun drakma-request (&rest args)
"Gets url contents using drakma:http-request.
Tries several times to do it on error.
When *drakma-request-max-tries* is NIL tries forever
Otherwise tries *drakma-request-max-tries* times
Replaces wrong utf-8 characters with *character-for-wrong-utf-8-chars* upon encoding error
"
; posible errors 'drakma::drakma-simple-error
; usocket:timeout-error
(let ((result)
(attempt 0))
(loop
while (or (not *drakma-request-max-tries*)
(< attempt *drakma-request-max-tries*))
do
(incf attempt)
(setf result (ignore-errors
(multiple-value-list
(handler-bind ((flexi-streams:external-format-encoding-error
#'(lambda (c)
(use-value *character-for-wrong-utf-8-chars*))))
(apply #'drakma:http-request args)))))
(if result
(return-from drakma-request (apply #'values result))
(progn
(format t "Failed to get result from ~A, trying again~%" (car args))
(sleep 3))))))
; V3
(defun get-content-from-url(&rest args)
"Gets content from url, caches it in *cache-dir* directory if *cache-enabled-p* is true. Parameters are similar to drakma-request"
(when (not *cache-enabled-p*)
(return-from get-content-from-url (apply #'drakma-request args)))
(let* ((url (car args))
(cached-filename (make-pathname
:directory *cache-dir*
:name (cl-ppcre:regex-replace-all "^[a-zA-Z0-9]" url "-")))
(content))
(if (probe-file cached-filename)
(file-get-contents cached-filename)
(progn
(setf content (apply #'drakma-request args))
(when content (file-put-contents cached-filename content))
content))))
(defun pretty-print-dom-recursive (object &optional (stream t) (indent-level 0))
(unless (zerop indent-level)
(format stream "~%"))
(dotimes (i indent-level)
(format stream " "))
(cond
((consp object)
(loop for i in object do
(pretty-print-dom-recursive i stream (1+ indent-level))))
((subtypep (type-of object) 'array )
(loop for i across object do
(pretty-print-dom-recursive i stream (1+ indent-level))))
((equal (type-of object) 'rune-dom::text)
(format stream "~A" (dom:node-value object)))
(t (progn
(print-unreadable-object (object stream)
(princ (dom:tag-name object) stream)
(dom:do-node-map (i (dom:attributes object))
(format stream " ~A=\"~A\"" (dom:name i) (dom:value i)))
(dom:do-node-list (i (dom:child-nodes object))
(pretty-print-dom-recursive i stream (1+ indent-level))))))))
(defun pretty-print-dom-recursive-to-string (object)
(with-output-to-string (s)
(pretty-print-dom-recursive object s)))
(defun get-node-text (dom)
"Recursive function for getting node text"
(if (dom:text-node-p dom)
(dom:node-value dom)
(join
(loop for i across (dom:child-nodes dom) collect (get-node-text i)))))
(defun get-nodes-text (list)
(join (mapcar #'get-node-text list)))
(defun replace-utf-8-sequences(string)
(cl-ppcre:regex-replace-all
"\\\\u[A-Fa-f0-9]{4}"
string
(lambda (target-string start end match-start match-end &rest args)
(string (code-char
(parse-integer
(subseq target-string (+ match-start 2) match-end)
:radix 16))))))
(defun pprint-json (json &optional stream &key (indent-expr #\Tab))
(let ((level 0)
(prev-char )
(in-quotes-p)
(ends-line-level)
(ret))
(setf ret (with-output-to-string (result)
(loop for char across json do
(let ((new-line-level)
(post ""))
(when ends-line-level
(setf new-line-level ends-line-level)
(setf ends-line-level nil))
(cond
((and
(char= char #\")
(or (not prev-char)
(not (char= prev-char #\\))))
(setf in-quotes-p (not in-quotes-p)))
((not in-quotes-p)
(case char
((#\} #\])
(decf level)
(setf ends-line-level nil)
(setf new-line-level level))
((#\{ #\[)
(incf level)
(setf ends-line-level level))
(#\, (setf ends-line-level level))
(#\: (setf post " "))
((#\Space #\Tab #\Newline #\Return)
(setf char nil)
(setf ends-line-level new-line-level)
(setf new-line-level nil)))))
(when new-line-level
(format result "~%")
(dotimes (i new-line-level)
(format result "~a" indent-expr)))
(format result "~A~A" char post)
(setf prev-char char)))))
(if stream
(write-string ret stream)
ret)))
(defun string-trim-spaces (string)
; !!IMPORTANT, below are two different spaces
(string-trim "  " string))
(defun dom-element-empty-p (dom)
(zerop (length (string-trim-spaces (get-node-text dom)))))
; v2
(defun normalize-string-spaces (str)
(ppcre:regex-replace-all "[\\s ]+" str " "))
(defun remove-newlines (string)
(ppcre:regex-replace-all "\\n" string ""))
(defun transform-to-cells-without-spans (trs)
(let* ((cells-length
(loop for i across (dom:child-nodes (first trs))
sum (if (dom:has-attribute i "colspan")
(parse-integer (dom:get-attribute i "colspan"))
1)))
(cells-vertical-length (length trs))
(result (make-array (list cells-vertical-length cells-length )
:element-type 'dom:node)))
(let ((x 0)
(y 0)
(row)
(cell))
(flet ((maybe-shift-x ()
(loop while (and (not (= x cells-length))
(not (numberp (aref result y x))))
do
(incf x))))
(loop for row in trs
do
(loop for cell across (dom:child-nodes row) do
;(format t "~A ~A ~A~%" x y cell)
(unless cell
(return))
(cond
((and
(dom:has-attribute cell "colspan")
(dom:has-attribute cell "rowspan"))
;(format t "Has rowspan and colspan ~A ~A~%" x y)
(loop for i from 1
to (parse-integer (dom:get-attribute cell "colspan")) do
(loop for j from 0
to (1- (parse-integer (dom:get-attribute cell "rowspan"))) do
(setf (aref result (+ y j) x) cell))
(incf x)))
((dom:has-attribute cell "colspan")
;(format t "Has colspan ~A ~A~%" x y)
(loop for i from 1 to (parse-integer (dom:get-attribute cell "colspan")) do
(maybe-shift-x)
(or
(ignore-errors
(setf (aref result y x) cell)
(incf x))
(progn
(warn "Step out of bounds on rowspan with x=~A y=~A" x y)
(return)))))
((dom:has-attribute cell "rowspan")
(progn
;(format t "Has rowspan ~A ~A~%" x y)
(maybe-shift-x)
(loop for i from 0 to (1- (parse-integer (dom:get-attribute cell "rowspan"))) do
(or
(ignore-errors (setf (aref result (+ y i) x) cell))
(progn
(warn "Step out of bounds on rowspan with x=~A y=~A" x y)
(return))))
(incf x)))
(t
(progn
(maybe-shift-x)
(or
(ignore-errors
(setf (aref result y x) cell)
(incf x))
(warn "Step out of bounds x=~A y=~A" x y))))))
(incf y)
(setf x 0))))
result))
(defun matches-count (re str)
(length (ppcre:all-matches-as-strings re str)))
(defun render-html-to-string (document)
(with-output-to-string (out)
(dom:map-document
(cxml:make-character-stream-sink
out
:indentation 2
:canonical nil
:omit-xml-declaration-p t)
document)))
; V2
(defun nodes-list-to-document (nodes)
"Converts dom elements to separate document"
(let ((document (rune-dom:create-document (first nodes))))
(loop for i in (cdr nodes) do
(dom:append-child document (dom:import-node document i t)))
document))
;
(defun dom-list-to-document (dom-list)
(error "Use nodes-list-to-document instead"))
(defun write-html (file document)
(with-open-file (out file :direction :output :if-does-not-exist :create :if-exists :supersede)
(dom:map-document
(cxml:make-character-stream-sink
out
:indentation 2
:canonical nil
:omit-xml-declaration-p t)
document)))
(defun json-single-to-double-quotes (string)
(ppcre:regex-replace-all
"\\\\'"
(ppcre:regex-replace-all
"([^\\\\])'"
(ppcre:regex-replace-all "\"" string "\\\"")
"\\1\"")
"'"))
(defun parse-string(string)
(chtml:parse string (cxml-dom:make-dom-builder)))
(defun parse-xml-string (string)
(cxml:parse string (cxml-dom:make-dom-builder)))
; V2
(defun parse-url(&rest args)
"Parses url contents to cxml-dom object. Parameters are similar to drakma:http-request. Uses get-content-from-url instead"
(chtml:parse
(apply #'get-content-from-url args)
(cxml-dom:make-dom-builder)))
; V1
(defun parse-url-using-curl (url)
(chtml:parse
(with-output-to-string (s)
(external-program:run "curl" (list url) :output s :wait t))
(cxml-dom:make-dom-builder)))
(defun parse-xml-url (&rest args)
(cxml:parse
(apply #'get-content-from-url args)
(cxml-dom:make-dom-builder)))
(defun id-equals-selector (id)
(lambda (item)
(and
(dom:has-attribute item "id")
(string= (dom:get-attribute item "id") id))))
(defun remove-first-col (trs)
(loop for i in trs do
(let ((tds (loop for j across (dom:child-nodes i) collect j)))
(dom:remove-child
(dom:parent-node (first tds))
(first tds))))
trs)
(defun remove-last-col (trs)
(loop for i in trs do
(let ((tds (loop for j across (dom:child-nodes i) collect j)))
(dom:remove-child
(dom:parent-node (car (last tds)))
(car (last tds)))))
trs)
; V1
(defun string-empty-p (str)
(or (not str)
(zerop
(length
(string-trim (list #\Newline #\Space #\Return) str)))))
; V2
; Todo: support for ./some-link with page uri parameter
(defun get-links-from-html (html &key page-uri)
"Extracts links from page.
Extracts only links from a elements with href attribute set.
Requires html base tag.
Skips malformed urls and prints message to standard OUTPUT
Uses quri"
(let ((base-uri)
(doc (parse-string html))
(uri-temp-var))
; Setting base uri
(progn
(setf base-uri (css:query "base" doc))
(when base-uri
(assert (= (length base-uri) 1))
(setf base-uri (quri:uri (dom:get-attribute (car base-uri) "href")))))
(remove-if
#'null
(loop for i in (css:query "a" doc)
if (dom:has-attribute i "href")
collect (handler-case
(quri:merge-uris (quri:uri (dom:get-attribute i "href")) base-uri)
(quri:uri-malformed-string (err)
(progn
(format t "Skipping wrong url ~A~%" (dom:get-attribute i "href"))
nil)))))))
; Todo: support for ./some-link with page uri parameter
(defun get-links-from-html-2 (html &key page-uri)
"Extracts links from page.
Extracts only links from a elements with href attribute set.
Requires html base tag.
Skips malformed urls and prints message to standard OUTPUT
Uses puri package for uris"
(let ((base-uri)
(doc (parse-string html))
(uri-temp-var))
; Setting base uri
(progn
(setf base-uri (css:query "base" doc))
(when base-uri
(assert (= (length base-uri) 1))
(setf base-uri (puri:uri (dom:get-attribute (car base-uri) "href")))))
(remove-if
#'null
(loop for i in (css:query "a" doc)
if (dom:has-attribute i "href")
collect (handler-case
(puri:merge-uris (puri:uri (dom:get-attribute i "href")) base-uri)
(puri:uri-parse-error ()
(progn
(format t "Skipping wrong url ~A~%" (dom:get-attribute i "href"))
nil)))))))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment