Last active
June 1, 2020 06:48
-
-
Save atomontage/fdc7c6f3b84214b31a8ea294695c5b90 to your computer and use it in GitHub Desktop.
Download PDFs from dl.acm.org
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;;; acm.el --- Download PDFs from dl.acm.org -*- lexical-binding: t; -*- | |
;; This is free and unencumbered software released into the public domain. | |
;; xristos@sdf.org | |
;; Modified: 2020-06-01 | |
(require 'url) | |
(require 'dom) | |
(require 'seq) | |
(require 'cl-lib) | |
(require 'subr-x) | |
(defvar acm-directory #'acm-find-directory | |
"Use this directory to save all PDFs. Must be a string or a function | |
that accepts one argument (url string) and returns a directory.") | |
(defvar acm-iterate t | |
"If non-nil, iterate over all 'next issue' links.") | |
(defvar acm-overwrite nil | |
"If non-nil, overwrite existing files otherwise skip download.") | |
(defun acm-find-directory (url) | |
(concat "/tmp/" (if (string-match "/toc/sigplan-lisppointers/\\(.*\\)" url) | |
(concat "acm/" (match-string 1 url)) | |
"acm"))) | |
;;; | |
;;; | |
;;; | |
(defvar-local acm--pos 0) | |
(defvar-local acm--next nil) | |
(defun acm--insert (fmt &rest args) | |
(save-excursion | |
(goto-char acm--pos) | |
(if fmt | |
(let ((str (concat (apply 'format fmt args) "\n"))) | |
(if (string-prefix-p "[+]" str) | |
(insert str) | |
(insert (concat " " str)))) | |
(insert "\n")) | |
(setq acm--pos (point)))) | |
(defun acm--error (target fmt &rest args) | |
(with-current-buffer (alist-get :buffer target) | |
(let ((str (apply 'format fmt args))) | |
(acm--insert "%s" str)))) | |
(defun acm--url (target &optional more) | |
(when (and more (not (string-prefix-p "/" more))) | |
(setq more (concat "/" more))) | |
(let ((parsed-url (alist-get :parsed target))) | |
(format "%s://%s%s" | |
(url-type parsed-url) | |
(url-host parsed-url) | |
(or more "")))) | |
(defun acm--title-to-file (title) | |
(let ((pairs '(("/" . " ") ("\n" . "") ("\\." . "") (" +" . " ")))) | |
(concat (seq-reduce (lambda (string pair) | |
(replace-regexp-in-string | |
(car pair) (cdr pair) string)) | |
pairs | |
title) | |
".pdf"))) | |
(defun acm--download-pdf (status target path pdf-url) | |
(unwind-protect | |
(if-let ((err (plist-get status :error))) | |
(progn (acm--error target "%s [%s]" (error-message-string err) pdf-url) | |
(acm--get-pdf target)) | |
(make-directory (file-name-directory path) t) | |
(let ((write-region-inhibit-fsync nil) | |
(coding-system-for-write 'no-conversion)) | |
(write-region url-http-end-of-headers (point-max) path nil 'nomsg)) | |
(with-current-buffer (alist-get :buffer target) | |
(let ((attrs (file-attributes path))) | |
(acm--insert "%8s %s %s" (file-attribute-size attrs) pdf-url path)) | |
(cl-incf (cdr (assoc :count target)))) | |
(acm--get-pdf target)) | |
(kill-buffer))) | |
(defun acm--get-pdf (target) | |
(with-current-buffer (alist-get :buffer target) | |
(if-let ((url (alist-get :url target)) | |
(titles (assoc :titles target)) | |
(title (pop (cdr titles))) | |
(file (car title)) | |
(pdf-url (cdr title))) | |
(let* ((dir (file-name-as-directory | |
(if (functionp acm-directory) | |
(funcall acm-directory url) | |
acm-directory))) | |
(path (concat dir file))) | |
(if (and (file-exists-p path) | |
(not acm-overwrite)) | |
;; Skip | |
(let ((attrs (file-attributes path))) | |
(acm--insert "%8s exists, skipping %s" (file-attribute-size attrs) path) | |
(acm--get-pdf target)) | |
;; Download | |
(condition-case err | |
(url-retrieve pdf-url #'acm--download-pdf (list target path pdf-url) t) | |
('error (acm--error target "%s [%s]" (error-message-string err) pdf-url) | |
(acm--get-pdf target))))) | |
;; No more links left for this target | |
(acm--insert "[+] Done [%d/%d]" | |
(alist-get :count target) | |
(alist-get :total target)) | |
(when (and acm-iterate acm--next) | |
(acm--insert nil) | |
(acm--retrieve (prog1 acm--next (setq acm--next nil))))))) | |
(defun acm--fetch-all-urls (status target) | |
(unwind-protect | |
(if-let ((err (plist-get status :error))) | |
(progn (acm--error target "%s" (error-message-string err)) | |
(acm--get-pdf target)) | |
(let* ((dom (libxml-parse-html-region url-http-end-of-headers (point-max))) | |
(dom-title (dom-by-class dom "issue-item__title")) | |
(dom-front-back (dom-by-class dom "issue-downloads__item")) | |
(next-url (dom-attr (dom-by-class dom "content-navigation__btn--next") 'href))) | |
(with-current-buffer (alist-get :buffer target) | |
(when (string-prefix-p "/toc/" next-url) | |
(setq acm--next (acm--url target next-url))) | |
(cl-loop | |
for len from 0 | |
for dt in dom-title | |
for a = (dom-by-tag dt 'a) | |
with titles = (assoc :titles target) do | |
(push (cons (acm--title-to-file (dom-text a)) | |
(acm--url target (replace-regexp-in-string | |
"abs" "pdf" (dom-attr a 'href)))) | |
(cdr titles)) | |
finally do | |
(acm--insert "Got %2d DOI links" len) | |
(cl-incf (cdr (assoc :total target)) len)) | |
(cl-loop | |
for len from 0 | |
for dfb in dom-front-back | |
for a = (dom-by-tag dfb 'a) | |
with titles = (assoc :titles target) do | |
(push (cons (acm--title-to-file (dom-attr a 'title)) | |
(acm--url target (dom-attr a 'href))) | |
(cdr titles)) | |
finally do | |
(when (> len 0) (acm--insert "Got %2d extra links" len)) | |
(cl-incf (cdr (assoc :total target)) len)) | |
(if-let ((more (dom-attr (dom-by-tag (dom-by-class dom "proceedingsLazyLoad") | |
'a) | |
'href))) | |
(acm--extract-urls target (acm--url target more)) | |
;; Begin downloading PDFs | |
(acm--get-pdf target))))) | |
(kill-buffer))) | |
(defun acm--extract-urls (target &optional url) | |
(unless url (setq url (alist-get :url target))) | |
(condition-case err | |
(url-retrieve url #'acm--fetch-all-urls (list target) t) | |
('error (acm--error target "%s [%s]" | |
(error-message-string err) url)))) | |
(defun acm--retrieve (url) | |
(let* ((parsed-url (url-generic-parse-url url)) | |
(path-and-query (url-path-and-query parsed-url)) | |
(base-url (format "%s://%s" | |
(url-type parsed-url) | |
(url-host parsed-url))) | |
(url (concat base-url (car path-and-query))) | |
(target (list (cons :url url) | |
(cons :base base-url) | |
(cons :parsed parsed-url) | |
(cons :buffer (current-buffer)) | |
(cons :count 0) | |
(cons :total 0) | |
(cons :titles nil)))) | |
(acm--insert "[+] %s" url) | |
(acm--extract-urls target))) | |
;;; | |
;;; (acm-retrieve "https://dl.acm.org/toc/sigplan-lisppointers/1987/1/1") | |
;;; | |
(defun acm-retrieve (url) | |
(interactive "sURL: ") | |
(let ((buf (generate-new-buffer "*acm*"))) | |
(switch-to-buffer buf) | |
(setq acm--pos 0 acm--next nil) | |
(acm--retrieve url))) | |
(provide 'acm) | |
;;; acm.el ends here |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment