Skip to content

Instantly share code, notes, and snippets.

@sabof
Last active December 20, 2015 21:59
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sabof/6201962 to your computer and use it in GitHub Desktop.
Save sabof/6201962 to your computer and use it in GitHub Desktop.
es-scrape-web-listing
;; -*- lexical-binding: t -*-
(require 'cl-lib)
(defun es-scrape-web-listing
    (start-url collect-function next-url-function done-function
               &optional pages-limit silent)
  "Retrieve a list of things from a multi-page web document.
START-URL is the location from which to start scraping.
COLLECT-FUNCTION should return a list of collected things.  It is
called with the retrieved page's buffer current and point at the
beginning of the buffer.
NEXT-URL-FUNCTION should be a function that will return the
link to the next page, or nil, when on the last page.
DONE-FUNCTION will be called once processing is finished, with
one argument - the results list.
One can limit the number of retrieved pages by setting PAGES-LIMIT
to a number.
When SILENT is non-nil, no progress messages will be displayed.
The function returns a function that will stop the scraping process."
  (cl-assert (and start-url collect-function next-url-function done-function))
  (let ((current-url start-url)          ; URL of the page being processed
        (visited-urls (list start-url))  ; guards against next-URL loops
        retrieve-recursively
        collected
        next-url)
    (setq retrieve-recursively
          (lambda (&rest _args)
            ;; Called asynchronously by `url-retrieve' with the response
            ;; buffer current.  NOTE(review): the retrieval STATUS in _args
            ;; is not inspected, so HTTP/network errors are silently treated
            ;; as pages -- confirm whether that is acceptable for callers.
            (goto-char (point-min))
            (setq collected
                  (nconc collected (funcall collect-function)))
            (goto-char (point-min))
            (unless silent
              ;; Pass the format string straight to `message'.  The original
              ;; wrapped it in `format', which made `message' re-interpret
              ;; any "%" contained in the URL or counts, signalling an error.
              ;; Also report the URL just scraped instead of `next-url',
              ;; which is still nil on the first page.
              (message "Scraped \"%s\". Collected so far: %s"
                       current-url (length collected)))
            (cond ((and pages-limit (<= (cl-decf pages-limit) 0))
                   (funcall done-function collected))
                  ((and (setq next-url (funcall next-url-function))
                        (not (member next-url visited-urls)))
                   (push next-url visited-urls)
                   (setq current-url next-url)
                   ;; Asynchronous: schedules the next page, then falls
                   ;; through to kill the current response buffer.
                   (url-retrieve next-url retrieve-recursively))
                  (t (funcall done-function collected)))
            ;; Clean up the response buffer `url-retrieve' left current.
            (kill-buffer)))
    (url-retrieve start-url retrieve-recursively))
  ;; Stopper closure: relies on lexical binding -- replacing
  ;; NEXT-URL-FUNCTION with `ignore' makes the next callback take the
  ;; "no next page" branch and finish via DONE-FUNCTION.
  (lambda ()
    (setq next-url-function #'ignore)))
;; Example: nczonline.net
(defvar ncz-posts nil
  "Alist of (URL . TITLE) pairs scraped from nczonline.net.")
(defvar ncz-scraper-stop nil
  "Function that stops the scrape started below when called.")
(setq ncz-scraper-stop
      (es-scrape-web-listing
       "http://www.nczonline.net/"
       ;; COLLECT-FUNCTION: for each post snippet on the page, grab the
       ;; first link target and its anchor text.
       (lambda ()
         (cl-loop with link-holder
                  with name-holder
                  while (and (search-forward "class=\"post-snippet" nil t)
                             (re-search-forward "href=\"\\(?1:.+?\\)\"" nil t)
                             (setq link-holder (match-string 1))
                             (re-search-forward ">\\(?1:.+?\\)<" nil t)
                             (setq name-holder (match-string 1)))
                  collecting (cons link-holder name-holder)))
       ;; NEXT-URL-FUNCTION: first link inside the navigation div, or nil
       ;; when the searches fail (`ignore-errors' converts the search
       ;; errors into a nil return, ending the scrape).
       (lambda ()
         (ignore-errors
           (search-forward "<div class=\"navigation\">")
           (re-search-forward "<a href=\"\\(?1:.+?\\)\" >")
           (match-string 1)))
       ;; DONE-FUNCTION: store the results in `ncz-posts'.  The original
       ;; assigned the undeclared free variable `scrape-result' and left
       ;; the `ncz-posts' defvar unused; it also built the message with
       ;; `concat', risking "%" re-interpretation by `message'.
       (lambda (result)
         (setq ncz-posts result)
         (message "Done. %d items found." (length result)))))
(defun es-alist-to-org-links (alist)
  "Insert each (LINK . DESCRIPTION) pair of ALIST as an org checkbox link.
One \"- [ ] [[LINK][DESCRIPTION]]\" line is inserted at point per pair."
  (dolist (pair alist)
    (insert (format "- [ ] [[%s][%s]]\n" (car pair) (cdr pair)))))
;; To convert the retrieved list to org format
;; (es-alist-to-org-links ncz-posts)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment