Skip to content

Instantly share code, notes, and snippets.

@vonavi
Created November 16, 2020 13:54
Show Gist options
  • Save vonavi/bd8d5d72d005108f7cbdd1f835fe1f78 to your computer and use it in GitHub Desktop.
Save vonavi/bd8d5d72d005108f7cbdd1f835fe1f78 to your computer and use it in GitHub Desktop.
Fetch blog archive
;;; fetch-blog.el --- Fetch blog archive
;; Copyright (C) 2020 Vladimir Ivanov
;; Author: Vladimir Ivanov <ivvl82@gmail.com>
;; Keywords:
;;; Commentary:
;;
;;; Code:
(require 'dom)
(require 'url)
(require 'url-expand)
(require 'url-parse)
;; Alist of known blogs.  Each entry is (BASE-URL . PLIST); the PLIST may
;; carry :page-fmt (format string used by `format-page-url') and
;; :page-max (highest page number, used by `fetch-blog-archive').
;; Empty by default -- intended to be filled in by the user.
;; NOTE(review): `defconst' re-evaluates on every reload, which would
;; wipe any entries added at runtime -- confirm this is intended.
(defconst website-list
'())
;; Directory where fetched archives are written as <host>.org files.
;; NOTE(review): `defvar-local' makes these automatically buffer-local;
;; they read like global settings -- confirm buffer-local is intended.
(defvar-local blog-directory "blogs")
;; Number of posts grouped under one Org heading (see `insert-articles').
(defvar-local page-padding 25)
(defun format-page-url (website page-num)
  "Return the URL of page PAGE-NUM of WEBSITE.
WEBSITE is a cons of (BASE-URL . PLIST).  Page 1 is the base URL
itself; any later page is built from the :page-fmt format string
in PLIST, falling back to \"%s/page/%d/\"."
  (let ((base-url (car website))
        (page-fmt (or (plist-get (cdr website) :page-fmt)
                      "%s/page/%d/")))
    (if (= page-num 1)
        base-url
      (format page-fmt base-url page-num))))
(defun node-to-article (page-url node)
  "Extract a (URL . TITLE) pair for the article in DOM NODE.
The link is the first <a> inside the first h1/h2/h3 heading found
in NODE; its href is resolved against PAGE-URL and the anchor text
is whitespace-trimmed to form the title."
  (let* ((heading (car (or (dom-by-tag node 'h1)
                           (dom-by-tag node 'h2)
                           (dom-by-tag node 'h3))))
         (anchor (car (dom-by-tag heading 'a)))
         (href (dom-attr anchor 'href))
         (title (string-trim (dom-texts anchor))))
    (cons (url-expand-file-name href page-url) title)))
(defun insert-articles (status website page-seq count buf)
  "Parse a fetched blog page and append its articles to buffer BUF.
Invoked by `url-retrieve' as a callback: STATUS is the retrieval
status plist and the current buffer holds the HTTP response.
WEBSITE is a (BASE-URL . PLIST) entry, PAGE-SEQ the remaining page
numbers with the just-fetched page first, and COUNT the number of
posts written so far.  Signals an error when STATUS reports a
retrieval failure, and finally schedules the next page via
`fetch-page-sequence'."
  ;; STATUS is nil on a plain successful fetch; failures are recorded
  ;; under the :error key.  The previous check `(when (not status) ...)'
  ;; signalled the error on success and ignored real failures.
  (when (plist-get status :error)
    (error "Could not reach %s" (car website)))
  (let* ((page-url (format-page-url website (car page-seq)))
         ;; The response body starts just past `url-http-end-of-headers'.
         (page-dom
          (libxml-parse-html-region (1+ url-http-end-of-headers) (point-max)))
         (articles (mapcar (apply-partially #'node-to-article page-url)
                           (or (dom-by-tag page-dom 'article)
                               (dom-by-class page-dom "\\<post\\>")))))
    (with-current-buffer buf
      ;; Insert the parsed articles in reverse document order.
      (dolist (post (nreverse articles))
        (let ((item (1+ (mod count page-padding))))
          ;; Open a new Org heading every `page-padding' posts.
          (when (= item 1)
            (org-insert-heading)
            (insert (format "Post numbers %d-%d [0/%d]\n"
                            (1+ count)
                            (+ count page-padding)
                            page-padding)))
          (insert (format "%d. [ ] [[%s][%s]]\n" item (car post) (cdr post))))
        ;; `incf' belongs to the obsolete cl.el and is unbound in modern
        ;; Emacs; `cl-incf' is the supported spelling (see cl-lib require).
        (cl-incf count))
      (fetch-page-sequence website (cdr page-seq) count))))
(defun fetch-page-sequence (website page-seq count)
  "Asynchronously fetch the next page of WEBSITE, if any remain.
PAGE-SEQ is the list of page numbers still to fetch (next page
first); COUNT is the number of posts inserted so far.  The
`url-retrieve' callback `insert-articles' continues with the rest
of PAGE-SEQ in the current buffer.  Does nothing when PAGE-SEQ is
empty."
  (when page-seq
    (let ((page-num (car page-seq)))
      ;; `message' takes format arguments directly; wrapping the string
      ;; in `format' first makes a literal `%' in the URL be treated as
      ;; a format directive on the second pass.
      (message "Fetching page %d from %s..." page-num (car website))
      ;; Do not retrieve URL too frequently: wait 5-14 seconds.
      (sleep-for (+ 5 (random 10)))
      (url-retrieve (format-page-url website page-num)
                    #'insert-articles
                    (list website page-seq count (current-buffer))))))
;;;###autoload
(defun fetch-blog-archive (url)
  "Fetch the archive of the blog at URL into an Org file.
Prompt for a page range, then asynchronously walk the pages from
the highest number down to the lowest, appending each post as a
checkbox item to <host>.org under `blog-directory'.  URL should be
a key of `website-list'; an unknown URL falls back to a bare entry
with the default page-URL format and no page limit."
  (interactive (list (completing-read "Website: "
                                      (mapcar #'car website-list)
                                      nil
                                      nil
                                      "https://")))
  (let* (;; Previously an unknown URL left WEBSITE nil, so PAGE-MAX was
         ;; nil and the `min' below raised wrong-type-argument.  A bare
         ;; (URL) entry keeps `format-page-url' working.
         (website (or (assoc url website-list) (list url)))
         (page-max (plist-get (cdr website) :page-max))
         ;; Without :page-max, offer "(1 . 1)" rather than "(1 . nil)".
         (default-range (prin1-to-string `(1 . ,(or page-max 1))))
         (prompt (format "Page range [default: %s]: " default-range))
         (page-range (read-from-minibuffer prompt nil nil t nil default-range))
         page-seq)
    ;; Clamp the requested range to [1, page-max]; when no :page-max is
    ;; known, trust the upper bound the user entered.
    (setq page-range
          (cons (max 1 (car page-range))
                (if page-max
                    (min page-max (cdr page-range))
                  (cdr page-range))))
    ;; Reverse so pages are fetched from the highest number downwards.
    (setq page-seq
          (nreverse (number-sequence (car page-range) (cdr page-range))))
    (let* ((basename (replace-regexp-in-string
                      "\\."
                      "_"
                      (url-host (url-generic-parse-url url))))
           (blog-file
            (concat (file-name-as-directory blog-directory) basename ".org")))
      (switch-to-buffer (find-file-noselect blog-file))
      (goto-char (point-max))
      (fetch-page-sequence website page-seq 0))))
(provide 'fetch-blog)
;;; fetch-blog.el ends here
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment