Skip to content

Instantly share code, notes, and snippets.

@atomontage
Last active June 1, 2020 06:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save atomontage/fdc7c6f3b84214b31a8ea294695c5b90 to your computer and use it in GitHub Desktop.
Save atomontage/fdc7c6f3b84214b31a8ea294695c5b90 to your computer and use it in GitHub Desktop.
Download PDFs from dl.acm.org
;;; acm.el --- Download PDFs from dl.acm.org -*- lexical-binding: t; -*-
;; This is free and unencumbered software released into the public domain.
;; xristos@sdf.org
;; Modified: 2020-06-01
(require 'url)
(require 'dom)
(require 'seq)
(require 'cl-lib)
(require 'subr-x)
(defvar acm-directory #'acm-find-directory
"Use this directory to save all PDFs. Must be a string or a function
that accepts one argument (url string) and returns a directory.")
(defvar acm-iterate t
"If non-nil, iterate over all 'next issue' links.")
(defvar acm-overwrite nil
"If non-nil, overwrite existing files otherwise skip download.")
(defun acm-find-directory (url)
(concat "/tmp/" (if (string-match "/toc/sigplan-lisppointers/\\(.*\\)" url)
(concat "acm/" (match-string 1 url))
"acm")))
;;;
;;;
;;;
(defvar-local acm--pos 0)
(defvar-local acm--next nil)
(defun acm--insert (fmt &rest args)
(save-excursion
(goto-char acm--pos)
(if fmt
(let ((str (concat (apply 'format fmt args) "\n")))
(if (string-prefix-p "[+]" str)
(insert str)
(insert (concat " " str))))
(insert "\n"))
(setq acm--pos (point))))
(defun acm--error (target fmt &rest args)
(with-current-buffer (alist-get :buffer target)
(let ((str (apply 'format fmt args)))
(acm--insert "%s" str))))
(defun acm--url (target &optional more)
(when (and more (not (string-prefix-p "/" more)))
(setq more (concat "/" more)))
(let ((parsed-url (alist-get :parsed target)))
(format "%s://%s%s"
(url-type parsed-url)
(url-host parsed-url)
(or more ""))))
(defun acm--title-to-file (title)
(let ((pairs '(("/" . " ") ("\n" . "") ("\\." . "") (" +" . " "))))
(concat (seq-reduce (lambda (string pair)
(replace-regexp-in-string
(car pair) (cdr pair) string))
pairs
title)
".pdf")))
(defun acm--download-pdf (status target path pdf-url)
(unwind-protect
(if-let ((err (plist-get status :error)))
(progn (acm--error target "%s [%s]" (error-message-string err) pdf-url)
(acm--get-pdf target))
(make-directory (file-name-directory path) t)
(let ((write-region-inhibit-fsync nil)
(coding-system-for-write 'no-conversion))
(write-region url-http-end-of-headers (point-max) path nil 'nomsg))
(with-current-buffer (alist-get :buffer target)
(let ((attrs (file-attributes path)))
(acm--insert "%8s %s %s" (file-attribute-size attrs) pdf-url path))
(cl-incf (cdr (assoc :count target))))
(acm--get-pdf target))
(kill-buffer)))
(defun acm--get-pdf (target)
(with-current-buffer (alist-get :buffer target)
(if-let ((url (alist-get :url target))
(titles (assoc :titles target))
(title (pop (cdr titles)))
(file (car title))
(pdf-url (cdr title)))
(let* ((dir (file-name-as-directory
(if (functionp acm-directory)
(funcall acm-directory url)
acm-directory)))
(path (concat dir file)))
(if (and (file-exists-p path)
(not acm-overwrite))
;; Skip
(let ((attrs (file-attributes path)))
(acm--insert "%8s exists, skipping %s" (file-attribute-size attrs) path)
(acm--get-pdf target))
;; Download
(condition-case err
(url-retrieve pdf-url #'acm--download-pdf (list target path pdf-url) t)
('error (acm--error target "%s [%s]" (error-message-string err) pdf-url)
(acm--get-pdf target)))))
;; No more links left for this target
(acm--insert "[+] Done [%d/%d]"
(alist-get :count target)
(alist-get :total target))
(when (and acm-iterate acm--next)
(acm--insert nil)
(acm--retrieve (prog1 acm--next (setq acm--next nil)))))))
(defun acm--fetch-all-urls (status target)
(unwind-protect
(if-let ((err (plist-get status :error)))
(progn (acm--error target "%s" (error-message-string err))
(acm--get-pdf target))
(let* ((dom (libxml-parse-html-region url-http-end-of-headers (point-max)))
(dom-title (dom-by-class dom "issue-item__title"))
(dom-front-back (dom-by-class dom "issue-downloads__item"))
(next-url (dom-attr (dom-by-class dom "content-navigation__btn--next") 'href)))
(with-current-buffer (alist-get :buffer target)
(when (string-prefix-p "/toc/" next-url)
(setq acm--next (acm--url target next-url)))
(cl-loop
for len from 0
for dt in dom-title
for a = (dom-by-tag dt 'a)
with titles = (assoc :titles target) do
(push (cons (acm--title-to-file (dom-text a))
(acm--url target (replace-regexp-in-string
"abs" "pdf" (dom-attr a 'href))))
(cdr titles))
finally do
(acm--insert "Got %2d DOI links" len)
(cl-incf (cdr (assoc :total target)) len))
(cl-loop
for len from 0
for dfb in dom-front-back
for a = (dom-by-tag dfb 'a)
with titles = (assoc :titles target) do
(push (cons (acm--title-to-file (dom-attr a 'title))
(acm--url target (dom-attr a 'href)))
(cdr titles))
finally do
(when (> len 0) (acm--insert "Got %2d extra links" len))
(cl-incf (cdr (assoc :total target)) len))
(if-let ((more (dom-attr (dom-by-tag (dom-by-class dom "proceedingsLazyLoad")
'a)
'href)))
(acm--extract-urls target (acm--url target more))
;; Begin downloading PDFs
(acm--get-pdf target)))))
(kill-buffer)))
(defun acm--extract-urls (target &optional url)
(unless url (setq url (alist-get :url target)))
(condition-case err
(url-retrieve url #'acm--fetch-all-urls (list target) t)
('error (acm--error target "%s [%s]"
(error-message-string err) url))))
(defun acm--retrieve (url)
(let* ((parsed-url (url-generic-parse-url url))
(path-and-query (url-path-and-query parsed-url))
(base-url (format "%s://%s"
(url-type parsed-url)
(url-host parsed-url)))
(url (concat base-url (car path-and-query)))
(target (list (cons :url url)
(cons :base base-url)
(cons :parsed parsed-url)
(cons :buffer (current-buffer))
(cons :count 0)
(cons :total 0)
(cons :titles nil))))
(acm--insert "[+] %s" url)
(acm--extract-urls target)))
;;;
;;; (acm-retrieve "https://dl.acm.org/toc/sigplan-lisppointers/1987/1/1")
;;;
(defun acm-retrieve (url)
(interactive "sURL: ")
(let ((buf (generate-new-buffer "*acm*")))
(switch-to-buffer buf)
(setq acm--pos 0 acm--next nil)
(acm--retrieve url)))
(provide 'acm)
;;; acm.el ends here
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment