Skip to content

Instantly share code, notes, and snippets.

@Jach
Last active May 6, 2023 08:49
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Jach/85b8b1af50996366b4059016b02259d2 to your computer and use it in GitHub Desktop.
Save Jach/85b8b1af50996366b4059016b02259d2 to your computer and use it in GitHub Desktop.
;;;; For years I had a script that used the twitter free API to get my account's (@jachy) likes, and if they were
;;;; images or videos, download a local copy of them because I was sick of accounts being suspended and losing some memes/art
;;;; forever.
;;;; Anyway, because the free tier is no more, here is a very dirty Common Lisp script to replace it.
;;;; It requires Selenium (expected to be running before this script, e.g. with `java -jar selenium-server-4.1.2.jar standalone`
;;;; and I'm assuming Firefox rather than Chrome is being used) to load up the likes page and continue to scroll down the page
;;;; for a fixed duration of time, frequently polling the page source and extracting media URLs from it. It handles pictures,
;;;; mp4 videos, and gifs. It uses `yt-dlp` (fork of youtube-dl) to get the mp4 URLs, uses `ls` to check for already-downloaded
;;;; existence (needed for lazy glob patterns), and `wget` to do the actual download.
;;;; By default it runs for 5 minutes. My cronjob is set up to run the script daily, but for the first run I let it go for about
;;;; 10 hours to try to get as much as possible. It actually found stuff not returned by the API call... Additionally, it gets
;;;; stuff through quote-tweets as a side effect.
;; Package holding the whole script; it inherits only the standard CL symbols.
(defpackage #:get-likes
(:use #:cl))
(in-package :get-likes)
;; Third-party libraries, loaded through Quicklisp at load time.
(ql:quickload :cl-webdriver-client)
(ql:quickload :cl-ppcre)
(ql:quickload :com.inuoe.jzon)
;; Make the external symbols of WEBDRIVER-CLIENT usable without a package prefix
;; (presumably the source of unqualified names used later in the file, such as
;; MAKE-COOKIE and PAGE-SOURCE -- confirm against the library).
(use-package :webdriver-client)
(defun parse-bool (s)
  "Return T when S is exactly the string \"TRUE\" (case-sensitive), otherwise NIL.
Anything else -- \"FALSE\", other strings, non-strings -- yields NIL."
  (and (equal s "TRUE") t))
(defmacro updatef (thing function &environment env)
  "Replace the value of the place THING with the result of calling FUNCTION on
its current value, and return the new value.

THING is any SETF-able place; FUNCTION is a form evaluating to a function
designator of one argument.

The subforms of THING are evaluated exactly once, left to right, before
FUNCTION is evaluated, so places with side-effecting subforms such as
(aref v (incf i)) are read and written at the same location."
  (multiple-value-bind (temps value-forms stores setter getter)
      (get-setf-expansion thing env)
    ;; Bind the place's subforms once, then read through GETTER and write
    ;; through SETTER so both refer to the very same location.
    `(let* ,(mapcar #'list temps value-forms)
       (multiple-value-bind ,stores (funcall ,function ,getter)
         ,setter))))
(defun parse-netscape-cookies (path)
  "Parse the Netscape cookies.txt file at PATH
(http://fileformats.archiveteam.org/wiki/Netscape_cookies.txt) and return a
list with one entry per cookie line, each of the form

  (host include-subdomains? path secure? expiry name value)

where the two flags are converted with PARSE-BOOL and EXPIRY is an integer.

Blank lines and comment lines (starting with #) are skipped wherever they
occur; lines starting with the #HttpOnly_ marker are cookie lines and are kept.
The #HttpOnly_ marker is removed from the host, and a leading dot is removed
from hosts of the form .twitter..., so .twitter.com becomes twitter.com."
  (labels ((cookie-line-p (line)
             (let ((trimmed (string-trim '(#\Space #\Tab #\Return) line)))
               (and (plusp (length trimmed))
                    (or (char/= (char trimmed 0) #\#)
                        (uiop:string-prefix-p "#HttpOnly_" trimmed)))))
           (normalize-host (host)
             ;; Both patterns are anchored and escape the dot so that hosts
             ;; such as api.twitter.com are left untouched.
             (cl-ppcre:regex-replace
              "^\\.twitter"
              (cl-ppcre:regex-replace "^#HttpOnly_" host "")
              "twitter"))
           (parse-line (line)
             ;; :limit keeps a trailing empty value field, which SPLIT would
             ;; otherwise drop, leaving only six fields for the caller.
             (let ((parts (cl-ppcre:split "\\t"
                                          (string-right-trim '(#\Return) line)
                                          :limit 7)))
               (updatef (elt parts 0) #'normalize-host)
               (updatef (elt parts 1) #'parse-bool)
               (updatef (elt parts 3) #'parse-bool)
               (updatef (elt parts 4) #'parse-integer)
               parts)))
    (mapcar #'parse-line
            (remove-if-not #'cookie-line-p (uiop:read-file-lines path)))))
(defun set-cookies (cookies)
  "Add every entry of COOKIES to the current WebDriver session.
Each entry is a seven-element list
  (host include-subdomains? path secure? expiry name value);
the include-subdomains flag is not used.  Returns NIL."
  (loop for entry in cookies
        do (destructuring-bind (domain include-subdomains path secure-p expires name value)
               entry
             (declare (ignore include-subdomains))
             (setf (cookie)
                   (make-cookie name value
                                :path path
                                :domain domain
                                :secure secure-p
                                :expiry expires)))))
;; Directory that receives the downloaded media; cookies.txt is also read from it.
;; Paths are built by plain string concatenation, so keep the trailing slash.
(defparameter *likes-folder* "/path/to/twitter_likes/")
;; Matches src="..." attributes pointing at pbs.twimg.com/media images.
;; Registers: 1 = URL up to and including format=EXT, 2 = media id, 3 = extension.
(defparameter *img-regex* "src=\"(https://pbs.twimg.com/media/([a-zA-Z0-9_-]+?)\\?format=(jpg|jpeg|png))&name=")
;; Matches a tweet status link followed (with no other "status" text in between)
;; by a video poster attribute.
;; Registers: 1 = the href="..." dir="ltr" text, 2 = tweet path (/user/status/id),
;; 3 = poster (thumbnail) URL, 4 = thumbnail extension.
(defparameter *vid-regex* "(href=\"(/[a-zA-Z0-9_-]+?/status/[0-9]+?)\" dir=\"ltr\")(?:(?!status).)+?poster=\"(https://pbs.twimg.com/ext_tw_video_thumb/[a-zA-Z0-9_-]+/pu/img/[a-zA-Z0-9_-]+?.(jpg|jpeg|png))\"")
;; Matches video.twimg.com/tweet_video mp4 URLs (the 'gifs' collected by recollect-imgs).
;; Registers: 1 = full URL, 2 = video id.
(defparameter *gif-regex* "(https://video.twimg.com/tweet_video/([a-zA-Z0-9_-]+?).mp4)")
;; Alternative kept for reference: start a session without explicit capabilities
;; (presumably the library defaults).
;(start-interactive-session)
;; Start the WebDriver session, requesting Firefox with the -headless argument.
(start-interactive-session (make-capabilities :always-match
'((browser-name . "firefox")
("moz:firefoxOptions" . (("args" . #("-headless"))) )
)))
;; Page that is loaded and scrolled to discover liked media.
(defparameter *likes-url* "https://twitter.com/jachy/likes")
;; Accumulators filled while scrolling and consumed by download-resources.
(defparameter *resources* (list)
"List of image resource names, i.e. 123.jpg, unless it's a full mp4 url")
(defparameter *vid-resources* (list)
"List of a pairs of (tweet-url . thumbnail-url)")
(defun recollect-imgs (src)
  "Scan the page source SRC for liked images and 'gifs' and record any new ones
in *RESOURCES*.

Images matched by *IMG-REGEX* are stored as \"ID.EXT\"; matches of *GIF-REGEX*
are stored as the full mp4 URL.  Entries are compared case-sensitively, since
ids that differ only in letter case are distinct resources.  Returns the
updated *RESOURCES*."
  (let ((flat (cl-ppcre:regex-replace-all "\\n" src " ")))
    (cl-ppcre:do-register-groups (full id ext) (*img-regex* flat)
      (declare (ignore full))
      (pushnew (format nil "~a.~a" id ext) *resources* :test #'string=))
    ;; also look for 'gifs'; adding one at a time guarantees that a URL which
    ;; appears several times on the page is stored only once
    (dolist (gif-url (cl-ppcre:all-matches-as-strings *gif-regex* flat))
      (pushnew gif-url *resources* :test #'string=))
    *resources*))
(defun recollect-vids (src)
  "Scan the page source SRC with *VID-REGEX* and record every new
(tweet-url . thumbnail-url) pair in *VID-RESOURCES*.
The tweet path captured by the regex is prefixed with https://twitter.com."
  (let ((flat (cl-ppcre:regex-replace-all "\\n" src " ")))
    (cl-ppcre:do-register-groups (link tweet-path poster-url) (*vid-regex* flat)
      (declare (ignore link))
      (pushnew (cons (uiop:strcat "https://twitter.com" tweet-path) poster-url)
               *vid-resources*
               :test #'equal))))
;; Scrape phase: load the likes page with the saved cookies, keep scrolling for
;; COLLECTION-TIME seconds while harvesting media references from the page
;; source, then harvest one final time.  UNWIND-PROTECT guarantees the WebDriver
;; session is stopped even when a step signals an error, so a failed run does
;; not leave a browser session behind.
(unwind-protect
     (progn
       (setf (url) *likes-url*) ; need to visit domain at least once before we can set cookies...
       (set-cookies (parse-netscape-cookies (uiop:strcat *likes-folder* "cookies.txt")))
       (setf (url) *likes-url*) ; load the page again now that the cookies are set
       (let ((start (get-universal-time))
             (progress 0)                ; seconds elapsed since START
             (last-progress 0)           ; universal time of the last harvest; 0 forces one immediately
             (collection-time (* 60 5))  ; total scrolling time in seconds
             (recollect-every 1)         ; minimum number of seconds between harvests
             (last-page-src ""))
         (loop while (< progress collection-time)
               do
                  (setf progress (- (get-universal-time) start))
                  (when (>= (- (get-universal-time) last-progress) recollect-every)
                    (setf last-progress (get-universal-time))
                    (let ((src (page-source)))
                      ;; skip the regex scans when the page has not changed
                      (unless (equal src last-page-src)
                        (setf last-page-src src)
                        (recollect-imgs src)
                        (recollect-vids src))))
                  (webdriver-client-utils:send-key :down-arrow)))
       ;; final harvest; both scans use the same snapshot of the page
       (let ((final-src (page-source)))
         (recollect-imgs final-src)
         (recollect-vids final-src)))
  (stop-interactive-session))
(defun downloaded? (file-id)
  "Return T when a file whose name ends in \"-FILE-ID\" exists in *LIKES-FOLDER*
or in one of the subdirectories listed in DIRS below, otherwise NIL.

Downloaded files are named PREFIX-FILE-ID, so this is a suffix match on the
file name.  The directories are listed in-process rather than through a shell
`ls` glob, so FILE-ID (which comes from scraped page text) is never
interpolated into a shell command line."
  (let ((dirs (list "")) ; I had a lot of images so started to organize some into subdirs post-download, add them here if you want to do the same like "memes/" "info/" etc.
        (suffix (uiop:strcat "-" file-id)))
    (dolist (dir dirs)
      (when (some (lambda (file)
                    (uiop:string-suffix-p (file-namestring file) suffix))
                  (uiop:directory-files (uiop:strcat *likes-folder* dir)))
        (return-from downloaded? t)))
    nil))
(defun last-prefix-n ()
  "Return the highest numeric prefix among the files directly inside
*LIKES-FOLDER* whose name starts with a digit, or 0 when there is none (for
example on the very first run).

Prefixes are compared numerically, so the result stays correct once they
outgrow the zero-padded width used when naming files."
  ; note if images are moved to subdirs, don't move the most recent image as that is used for the last-prefix-n...
  (let ((highest 0))
    (dolist (file (uiop:directory-files *likes-folder*) highest)
      (let ((name (pathname-name file)))
        (when (and (stringp name)
                   (plusp (length name))
                   (digit-char-p (char name 0)))
          ;; :junk-allowed reads just the leading digits, i.e. the part
          ;; before the dash that separates the prefix from the media name
          (let ((n (parse-integer name :junk-allowed t)))
            (when n
              (setf highest (max highest n)))))))))
(defun thumb-name (url)
  "Return the last slash-separated segment of URL, i.e. the thumbnail's file name."
  (let ((segments (cl-ppcre:split "/" url)))
    (first (last segments))))
(defun get-vid-urls (tweet)
  "Run `yt-dlp -j` on the tweet URL TWEET and return, as a list, the \"url\" of
every entry under \"formats\" in its JSON output whose \"protocol\" is \"https\"."
  (let* ((raw-json (with-output-to-string (out)
                     (uiop:run-program (list "yt-dlp" "-j" tweet) :output out)))
         (info (com.inuoe.jzon:parse raw-json))
         (https-formats (remove-if-not
                         (lambda (fmt) (equal "https" (gethash "protocol" fmt)))
                         (gethash "formats" info))))
    (map 'list
         (lambda (fmt) (gethash "url" fmt))
         https-formats)))
;; FORMAT control string for a download target: folder, 5-digit zero-padded
;; numeric prefix, a dash, then the media file name.
(defparameter *name-format-str* "~a~5,'0d-~a")
(defun save-video (vid-urls prefix)
  "Download every URL in VID-URLS with wget into *LIKES-FOLDER*, numbering the
files consecutively starting at PREFIX.  Returns the number of URLs, i.e. how
far the caller must advance its own prefix counter.

The file name is the last path segment of the URL with any query string or
fragment removed, so .../abc.mp4?tag=12 is saved as PREFIX-abc.mp4 and keeps
its extension."
  (dolist (url vid-urls)
    (let* ((segment (car (last (cl-ppcre:split "/" url))))
           ;; POSITION-IF returns NIL when there is no ? or #, which makes
           ;; SUBSEQ keep the whole segment.
           (name (subseq segment 0 (position-if (lambda (ch) (find ch "?#"))
                                                segment))))
      (format t "Trying vid ~a~%" name)
      (uiop:run-program (list "wget"
                              "-O" (format nil *name-format-str* *likes-folder* prefix name)
                              url))
      (incf prefix)))
  (length vid-urls))
(defun download-resources ()
  "Download everything collected in *RESOURCES* and *VID-RESOURCES* that is not
already present according to DOWNLOADED?.

Files are written by wget into *LIKES-FOLDER* and numbered consecutively,
starting one past LAST-PREFIX-N.  For a video the thumbnail is fetched first
and then every URL returned by GET-VID-URLS; an existing thumbnail means the
video is skipped.  Returns NIL."
  (let ((next-prefix (1+ (last-prefix-n))))
    (flet ((fetch (announcement file-name url)
             ;; Announce, download URL to the next numbered file, advance the counter.
             (format t announcement file-name)
             (uiop:run-program
              (list "wget"
                    "-O" (format nil *name-format-str* *likes-folder* next-prefix file-name)
                    url))
             (incf next-prefix)))
      ;; Images and 'gifs': a resource containing .mp4 is a full URL, anything
      ;; else is an image name of the form ID.EXT.
      (dolist (resource *resources*)
        (cond ((search ".mp4" resource)
               (let ((gif-name (car (last (cl-ppcre:split "/" resource)))))
                 (unless (downloaded? gif-name)
                   (fetch "Trying ~a~%" gif-name resource))))
              ((not (downloaded? resource))
               (fetch "Trying ~a~%"
                      resource
                      (format nil "https://pbs.twimg.com/media/~a?name=orig" resource)))))
      ;; Videos: thumbnail first, then the video files themselves.
      (dolist (entry *vid-resources*)
        (destructuring-bind (tweet-url . thumb-url) entry
          (let ((thumb (thumb-name thumb-url)))
            (unless (downloaded? thumb)
              (fetch "Trying thumb ~a~%" thumb (uiop:strcat thumb-url "?name=orig"))
              (incf next-prefix
                    (save-video (get-vid-urls tweet-url) next-prefix)))))))))
(download-resources)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment