Created
November 16, 2020 13:54
-
-
Save vonavi/bd8d5d72d005108f7cbdd1f835fe1f78 to your computer and use it in GitHub Desktop.
Fetch blog archive
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;;; fetch-blog.el --- Fetch blog archive | |
;; Copyright (C) 2020 Vladimir Ivanov | |
;; Author: Vladimir Ivanov <ivvl82@gmail.com> | |
;; Keywords: | |
;;; Commentary: | |
;; | |
;;; Code: | |
(require 'dom) | |
(require 'url) | |
(require 'url-expand) | |
(require 'url-parse) | |
(defconst website-list | |
'()) | |
(defvar-local blog-directory "blogs") | |
(defvar-local page-padding 25) | |
(defun format-page-url (website page-num) | |
(if (= page-num 1) | |
(car website) | |
(format (or (plist-get (cdr website) :page-fmt) "%s/page/%d/") | |
(car website) | |
page-num))) | |
(defun node-to-article (page-url node) | |
(let* ((hline (car (or (dom-by-tag node 'h1) | |
(dom-by-tag node 'h2) | |
(dom-by-tag node 'h3)))) | |
(link (car (dom-by-tag hline 'a)))) | |
(cons (url-expand-file-name (dom-attr link 'href) page-url) | |
(string-trim (dom-texts link))))) | |
(defun insert-articles (status website page-seq count buf) | |
(when (not status) | |
(error (format "Could not reach %s" (car website)))) | |
(let* ((page-url (format-page-url website (car page-seq))) | |
(page-dom | |
(libxml-parse-html-region (1+ url-http-end-of-headers) (point-max))) | |
(articles (mapcar (apply-partially #'node-to-article page-url) | |
(or (dom-by-tag page-dom 'article) | |
(dom-by-class page-dom "\\<post\\>"))))) | |
(with-current-buffer buf | |
(dolist (post (nreverse articles)) | |
(let ((item (1+ (mod count page-padding)))) | |
(when (= item 1) | |
(org-insert-heading) | |
(insert (format "Post numbers %d-%d [0/%d]\n" | |
(1+ count) | |
(+ count page-padding) | |
page-padding))) | |
(insert (format "%d. [ ] [[%s][%s]]\n" item (car post) (cdr post)))) | |
(incf count)) | |
(fetch-page-sequence website (cdr page-seq) count)))) | |
(defun fetch-page-sequence (website page-seq count) | |
(when page-seq | |
(let ((page-num (car page-seq))) | |
(message (format "Fetching page %d from %s..." page-num (car website))) | |
;; Do not retrieve URL too frequently | |
(sleep-for (+ 5 (random 10))) | |
(url-retrieve (format-page-url website page-num) | |
#'insert-articles | |
(list website page-seq count (current-buffer)))))) | |
;;;###autoload | |
(defun fetch-blog-archive (url) | |
(interactive (list (completing-read "Website: " | |
(mapcar #'car website-list) | |
nil | |
nil | |
"https://"))) | |
(let* ((website (assoc url website-list)) | |
(page-max (plist-get (cdr website) :page-max)) | |
(default-range (prin1-to-string `(1 . ,page-max))) | |
(prompt (format "Page range [default: %s]: " default-range)) | |
(page-range (read-from-minibuffer prompt nil nil t nil default-range)) | |
page-seq) | |
(setq page-range | |
(cons (max 1 (car page-range)) (min page-max (cdr page-range)))) | |
(setq page-seq | |
(nreverse (number-sequence (car page-range) (cdr page-range)))) | |
(let* ((basename (replace-regexp-in-string | |
"\\." | |
"_" | |
(url-host (url-generic-parse-url url)))) | |
(blog-file | |
(concat (file-name-as-directory blog-directory) basename ".org"))) | |
(switch-to-buffer (find-file-noselect blog-file)) | |
(goto-char (point-max)) | |
(fetch-page-sequence website page-seq 0)))) | |
(provide 'fetch-blog) | |
;;; fetch-blog.el ends here |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment