sardisson/Thumbnail image from Micro.blog post.applescript

## Thumbnail image from Micro.blog post.applescript
(*
 * Thumbnail image from Micro.blog post
 * Creates a thumbnail from the first image present in a given Micro.blog post
 * v1.0.7
 * 2019-08-18
 * https://gist.github.com/sardisson/e50286e17c8f2f39bfec9429ed5f46cd
 *)

on run
	-- Entry point.
	-- Asks for a return-delimited list of Micro.blog post URLs, opens each post in the
	-- current (or default) Safari/Chrome-family browser, reads the src and alt of the first
	-- non-emoji image in the post via JavaScript, and finally opens a generated HTML page
	-- of linked thumbnails in that browser using a data: URI.
	-- Raises error -128 (user cancelled) when no supported browser is available or the
	-- input is empty. Posts that cannot be processed are reported in a dialog and skipped.

	-- gruber's get browser routine, with a twist to exclude non-Apple/Google browsers
	set supportedBrowsers to {"Safari", "WebKit", "Safari Technology Preview", "Chrome", "Google Chrome"}
	set _browser to GetCurrentApp()
	if _browser is not in supportedBrowsers then
		set _browser to GetDefaultWebBrowser()
		if _browser is not in supportedBrowsers then
			display dialog "Sorry, this script only works with Safari and Chrome" with icon stop buttons {"Cancel"} default button 1
			error number -128
		end if
	end if
	-- the process short name is "Chrome", but the application is addressed as "Google Chrome"
	if _browser is "Chrome" then set _browser to "Google Chrome"

	-- set up the HTML skeleton in variables
	set theHTML to "<html><head><title>Photos from Micro.blog</title><meta charset='utf-8'></head><body>"
	set theHTMLend to "</body></html>"

	-- takes a return-delimited list of URLs
	set inputText to text returned of (display dialog "Please enter a return-delimited list of URLs of Micro.blog posts to thumbnail" default answer "" with icon note) as string
	if inputText is "" then error number -128

	-- split the input into a List by paragraph
	-- this should never fail, because it's either "" (handled above) or at least 1 paragraph
	set postList to every paragraph of inputText

	-- number of thumbnails actually written; used afterwards to pick a thumbnail size.
	-- The size cannot be chosen up front because bogus URLs are only filtered out inside the loop.
	set theCount to 0

	-- private error number used to fake "continue" (https://stackoverflow.com/a/6007211)
	-- without having to compare error messages
	set skipPostErrNum to 9001
	-- upper bound on page/image polling: 120 polls x 0.25 s = 30 s per post
	set maxPagePolls to 120
	set photoProxyPrefix to "https://photos.micro.blog/"

	-- handle each post URL
	repeat with postRef in postList
		try
			-- work with the plain text rather than a reference into postList
			set thePost to contents of postRef

			-- skip any non-Micro.blog URLs (and blank lines) in the list
			if thePost does not start with "https://micro.blog/" then error "skip" number skipPostErrNum

			-- get the id of the Micro.blog post to construct the HTML div id to look in for the image
			set theTIDs to AppleScript's text item delimiters
			set AppleScript's text item delimiters to "/"
			try
				set thePostID to last text item of thePost as string
				set AppleScript's text item delimiters to theTIDs
			on error theErr number errNum
				set AppleScript's text item delimiters to theTIDs
				-- re-raise the real error instead of masking it as "user cancelled"
				error theErr number errNum
			end try

			-- JavaScript snippets run in the browser; img:not(.wp-smiley) skips WP-Emoji images
			set jsImgSrcCommand to "document.getElementById('post_div_" & thePostID & "').getElementsByClassName('post_content')[0].querySelector('img:not(.wp-smiley)').src;"
			set jsImgAltCommand to "document.getElementById('post_div_" & thePostID & "').getElementsByClassName('post_content')[0].querySelector('img:not(.wp-smiley)').getAttribute('alt');"
			-- if alt is empty, grab the post text/title in order to have a fallback in at least some cases; long strings truncated later
			set jsTitleCommand to "document.getElementsByTagName('title')[0].textContent.trim()"

			-- XXX https://micro.blog/bzz/3537228 fails to fetch image for some reason (but fetches title-as-caption fine)

			set theImg to missing value
			set theAlt to ""

			-- fetch image URL from Google Chrome
			if _browser is in {"Chrome", "Google Chrome"} then
				tell application "Google Chrome"
					open location thePost
					-- https://apple.stackexchange.com/q/343624
					set pollCount to 0
					repeat while (loading of active tab of front window) and (pollCount < maxPagePolls)
						delay 0.25
						set pollCount to pollCount + 1
					end repeat
					tell (active tab of front window) to set theImg to execute javascript jsImgSrcCommand
					tell (active tab of front window) to set theAlt to execute javascript jsImgAltCommand
					-- filter out bogus alt and set to blank to force fetching post content
					-- (missing value is tested first so that "starts with" only ever sees text)
					if (theAlt is missing value) or (theAlt starts with "mp-photo-alt[]=") then set theAlt to ""
					-- if there's no alt, make one up from the post content (as reflected in the page title)
					if theAlt = "" then tell (active tab of front window) to set theAlt to my truncateTitle(execute javascript jsTitleCommand)
					close active tab of front window
				end tell
			end if
			-- end Google Chrome

			-- fetch image URL from Apple's WebKit browsers
			if _browser is in {"Safari", "WebKit", "Safari Technology Preview"} then
				using terms from application "Safari"
					tell application _browser
						open location thePost
						tell current tab of front window
							-- Poll for the image src itself rather than for "document source": the
							-- DOM-specific value is what is needed anyway. The poll is bounded so that
							-- posts without images (e.g. https://micro.blog/numericcitizen/4141716)
							-- no longer hang the script forever.
							-- XXX this fails in some versions of Safari :-(
							set pollCount to 0
							repeat
								set theImg to do JavaScript jsImgSrcCommand
								if (theImg is not missing value) or (pollCount >= maxPagePolls) then exit repeat
								delay 0.25 -- allow a bit of time to breathe
								set pollCount to pollCount + 1
							end repeat
							if theImg is not missing value then
								set theAlt to do JavaScript jsImgAltCommand
								-- filter out bogus alt and set to blank to force fetching post content
								if (theAlt is missing value) or (theAlt starts with "mp-photo-alt[]=") then set theAlt to ""
								-- if there's no alt, make one up from the post content (as reflected in the page title)
								if theAlt = "" then set theAlt to my truncateTitle(do JavaScript jsTitleCommand)
							end if
						end tell
						close current tab of front window
					end tell
				end using terms from
			end if
			-- end Apple's WebKit browsers

			-- report image-less posts clearly instead of failing later on a coercion error
			if (theImg is missing value) or (theImg is "") then error "No image found in post " & thePost

			-- strip any previous photos.m.b url segment (inline thumbs),
			-- e.g. https://photos.micro.blog/50/https://example.com/a.jpg
			if theImg starts with photoProxyPrefix then
				try -- just in case; on any failure theImg is left untouched
					set remainder to text ((length of photoProxyPrefix) + 1) thru -1 of theImg
					if remainder does not start with "http" then
						-- expect a numeric size segment ("50/", "150/", ...) before the real URL
						set slashPos to offset of "/" in remainder
						set sizeSegment to text 1 thru (slashPos - 1) of remainder
						set sizeValue to sizeSegment as integer -- errors unless the segment is a number
						set remainder to text (slashPos + 1) thru -1 of remainder
					end if
					set theImg to remainder
				end try
			end if

			-- create Micro.blog photo thumbnail URL
			set newImg to "https://photos.micro.blog/150/" & theImg as string
			-- and HTML image thumbnail and link; every value is escaped because the attributes
			-- are single-quoted and the alt text/URLs come from arbitrary web pages
			set theHTML to theHTML & return & "<a href='" & my escapeHTMLAttribute(thePost) & "'><img src='" & my escapeHTMLAttribute(newImg) & "' alt='" & my escapeHTMLAttribute(theAlt) & "'></a>"
			set theCount to theCount + 1
		on error theErr number errNum
			-- errNum must be coerced first: integer & text would build a list, not a string
			if errNum is not skipPostErrNum then display dialog (errNum as text) & ": " & theErr
		end try

	end repeat

	--finish HTML document skeleton
	set theHTML to theHTML & return & theHTMLend

	--adjust thumbnail size based on number of photos (& my window width)
	--using else if, the first branch that evaluates to true will be executed
	set replSize to ""
	if theCount > 132 then
		set replSize to "/90/"
	else if theCount > 110 then
		set replSize to "/95/"
	else if theCount > 100 then
		set replSize to "/100/"
	else if theCount > 90 then
		set replSize to "/105/"
	else if theCount > 81 then -- 72+ is slightly cut-off, but useable
		set replSize to "/110/"
	else if theCount > 49 then
		set replSize to "/120/"
	end if
	if replSize is not "" then set theHTML to my replaceThumbSize(theHTML, replSize)

	-- and load it in the browser using data: URI.
	-- "%" and "#" must be percent-encoded first: a raw "#" starts the URI fragment and would
	-- cut the page off, and a raw "%" would be read as the start of an escape sequence.
	set dataBody to my replaceAllText(theHTML, "%", "%25")
	set dataBody to my replaceAllText(dataBody, "#", "%23")
	tell application _browser to open location "data:text/html;charset=utf-8," & dataBody
	-- XXX provide an option to save this to a file (where the returns will be visible)
end run

-- Private helper for the run handler: escape text for use inside an HTML attribute value.
on escapeHTMLAttribute(theText)
	-- "&" has to be handled first so the entities added below are not escaped again
	set theText to my replaceAllText(theText, "&", "&amp;")
	set theText to my replaceAllText(theText, "<", "&lt;")
	set theText to my replaceAllText(theText, ">", "&gt;")
	set theText to my replaceAllText(theText, "\"", "&quot;")
	set theText to my replaceAllText(theText, "'", "&#39;")
	return theText
end escapeHTMLAttribute

-- Private helper for the run handler: replace every occurrence of oldText in theText with newText.
-- Returns theText unchanged if the replacement fails; text item delimiters are always restored.
on replaceAllText(theText, oldText, newText)
	set savedTIDs to AppleScript's text item delimiters
	try
		set AppleScript's text item delimiters to oldText
		set thePieces to text items of theText
		set AppleScript's text item delimiters to newText
		set theText to thePieces as text
	end try
	set AppleScript's text item delimiters to savedTIDs
	return theText
end replaceAllText

-- Returns the short name of the frontmost process, as reported by System Events.
-- Technique from https://daringfireball.net/2009/01/applescripts_targetting_safari_or_webkit
on GetCurrentApp()
	tell application "System Events"
		set frontProcess to first process whose frontmost is true
		return short name of frontProcess
	end tell
end GetCurrentApp

-- Returns the name of the default web browser (the handler for http URLs).
-- The original Perl/Internet Config lookup is tried first. If it fails or prints nothing,
-- a 64-bit-safe lookup asks NSWorkspace (through JavaScript for Automation) which
-- application opens http URLs and returns that application's name without ".app".
-- Any error from the fallback lookup is passed on to the caller.
on GetDefaultWebBrowser()
	-- First line of legacyCmd is a workaround for Snow Leopard issues
	-- with 32-bit Mac:: Carbon modules
	set legacyCmd to "export VERSIONER_PERL_PREFER_32_BIT=yes; " & ¬
		"perl -MMac::InternetConfig -le " & ¬
		"'print +(GetICHelper \"http\")[1]'"
	try
		set browserName to do shell script legacyCmd
		if browserName is not "" then return browserName
	end try -- deliberately ignored: fall through to the NSWorkspace lookup below

	set jxaSource to "ObjC.import('AppKit'); " & ¬
		"$.NSWorkspace.sharedWorkspace.URLForApplicationToOpenURL($.NSURL.URLWithString('http://example.com'))" & ¬
		".lastPathComponent.stringByDeletingPathExtension.js"
	return do shell script "osascript -l JavaScript -e " & quoted form of jxaSource
end GetDefaultWebBrowser

-- Truncate a page title into something vaguely usable as alt text.
-- theAlt: the page title text, or missing value when the title lookup produced no result.
-- Returns the first ten words followed by "…." when the title has more than ten words,
-- the title unchanged when it is shorter, and a generic description when it is empty.
on truncateTitle(theAlt)
	-- a failed lookup would make "count words" error; treat it as an empty title instead
	if theAlt is missing value then set theAlt to ""
	if (count words of theAlt) > 10 then set theAlt to ((text from word 1 to word 10 of theAlt) & "….") as string
	-- if page title is empty (there was no text with the post, only a photo), add a last-ditch alt string
	if theAlt = "" then set theAlt to "Image from Micro.blog; description unavailable."
	return theAlt
end truncateTitle

-- Swap every "/150/" segment in theText for theSize (e.g. "/90/").
-- Returns the rewritten text, or the text as passed in if the replacement fails.
-- AppleScript's text item delimiters are put back to their previous value either way.
on replaceThumbSize(theText, theSize)
	set savedDelims to AppleScript's text item delimiters
	set resizedText to theText -- handed back untouched if anything below fails
	try
		-- split on the default size segment ...
		set AppleScript's text item delimiters to "/150/"
		set thePieces to text items of theText
		-- ... and join the pieces again with the requested one
		set AppleScript's text item delimiters to theSize
		set resizedText to thePieces as text
	on error
		set resizedText to theText
	end try
	set AppleScript's text item delimiters to savedDelims
	return resizedText
end replaceThumbSize
	(*
	* Thumbnail image from Micro.blog post
	* Creates a thumbnail from the first image present in a given Micro.blog post
	* v1.0.7
	* 2019-08-18
	* https://gist.github.com/sardisson/e50286e17c8f2f39bfec9429ed5f46cd
	*)

	on run
		-- Entry point.
		-- Asks for a return-delimited list of Micro.blog post URLs, opens each post in the
		-- current (or default) Safari/Chrome-family browser, reads the src and alt of the first
		-- non-emoji image in the post via JavaScript, and finally opens a generated HTML page
		-- of linked thumbnails in that browser using a data: URI.
		-- Raises error -128 (user cancelled) when no supported browser is available or the
		-- input is empty. Posts that cannot be processed are reported in a dialog and skipped.

		-- gruber's get browser routine, with a twist to exclude non-Apple/Google browsers
		set supportedBrowsers to {"Safari", "WebKit", "Safari Technology Preview", "Chrome", "Google Chrome"}
		set _browser to GetCurrentApp()
		if _browser is not in supportedBrowsers then
			set _browser to GetDefaultWebBrowser()
			if _browser is not in supportedBrowsers then
				display dialog "Sorry, this script only works with Safari and Chrome" with icon stop buttons {"Cancel"} default button 1
				error number -128
			end if
		end if
		-- the process short name is "Chrome", but the application is addressed as "Google Chrome"
		if _browser is "Chrome" then set _browser to "Google Chrome"

		-- set up the HTML skeleton in variables
		set theHTML to "<html><head><title>Photos from Micro.blog</title><meta charset='utf-8'></head><body>"
		set theHTMLend to "</body></html>"

		-- takes a return-delimited list of URLs
		set inputText to text returned of (display dialog "Please enter a return-delimited list of URLs of Micro.blog posts to thumbnail" default answer "" with icon note) as string
		if inputText is "" then error number -128

		-- split the input into a List by paragraph
		-- this should never fail, because it's either "" (handled above) or at least 1 paragraph
		set postList to every paragraph of inputText

		-- number of thumbnails actually written; used afterwards to pick a thumbnail size.
		-- The size cannot be chosen up front because bogus URLs are only filtered out inside the loop.
		set theCount to 0

		-- private error number used to fake "continue" (https://stackoverflow.com/a/6007211)
		-- without having to compare error messages
		set skipPostErrNum to 9001
		-- upper bound on page/image polling: 120 polls x 0.25 s = 30 s per post
		set maxPagePolls to 120
		set photoProxyPrefix to "https://photos.micro.blog/"

		-- handle each post URL
		repeat with postRef in postList
			try
				-- work with the plain text rather than a reference into postList
				set thePost to contents of postRef

				-- skip any non-Micro.blog URLs (and blank lines) in the list
				if thePost does not start with "https://micro.blog/" then error "skip" number skipPostErrNum

				-- get the id of the Micro.blog post to construct the HTML div id to look in for the image
				set theTIDs to AppleScript's text item delimiters
				set AppleScript's text item delimiters to "/"
				try
					set thePostID to last text item of thePost as string
					set AppleScript's text item delimiters to theTIDs
				on error theErr number errNum
					set AppleScript's text item delimiters to theTIDs
					-- re-raise the real error instead of masking it as "user cancelled"
					error theErr number errNum
				end try

				-- JavaScript snippets run in the browser; img:not(.wp-smiley) skips WP-Emoji images
				set jsImgSrcCommand to "document.getElementById('post_div_" & thePostID & "').getElementsByClassName('post_content')[0].querySelector('img:not(.wp-smiley)').src;"
				set jsImgAltCommand to "document.getElementById('post_div_" & thePostID & "').getElementsByClassName('post_content')[0].querySelector('img:not(.wp-smiley)').getAttribute('alt');"
				-- if alt is empty, grab the post text/title in order to have a fallback in at least some cases; long strings truncated later
				set jsTitleCommand to "document.getElementsByTagName('title')[0].textContent.trim()"

				-- XXX https://micro.blog/bzz/3537228 fails to fetch image for some reason (but fetches title-as-caption fine)

				set theImg to missing value
				set theAlt to ""

				-- fetch image URL from Google Chrome
				if _browser is in {"Chrome", "Google Chrome"} then
					tell application "Google Chrome"
						open location thePost
						-- https://apple.stackexchange.com/q/343624
						set pollCount to 0
						repeat while (loading of active tab of front window) and (pollCount < maxPagePolls)
							delay 0.25
							set pollCount to pollCount + 1
						end repeat
						tell (active tab of front window) to set theImg to execute javascript jsImgSrcCommand
						tell (active tab of front window) to set theAlt to execute javascript jsImgAltCommand
						-- filter out bogus alt and set to blank to force fetching post content
						-- (missing value is tested first so that "starts with" only ever sees text)
						if (theAlt is missing value) or (theAlt starts with "mp-photo-alt[]=") then set theAlt to ""
						-- if there's no alt, make one up from the post content (as reflected in the page title)
						if theAlt = "" then tell (active tab of front window) to set theAlt to my truncateTitle(execute javascript jsTitleCommand)
						close active tab of front window
					end tell
				end if
				-- end Google Chrome

				-- fetch image URL from Apple's WebKit browsers
				if _browser is in {"Safari", "WebKit", "Safari Technology Preview"} then
					using terms from application "Safari"
						tell application _browser
							open location thePost
							tell current tab of front window
								-- Poll for the image src itself rather than for "document source": the
								-- DOM-specific value is what is needed anyway. The poll is bounded so that
								-- posts without images (e.g. https://micro.blog/numericcitizen/4141716)
								-- no longer hang the script forever.
								-- XXX this fails in some versions of Safari :-(
								set pollCount to 0
								repeat
									set theImg to do JavaScript jsImgSrcCommand
									if (theImg is not missing value) or (pollCount >= maxPagePolls) then exit repeat
									delay 0.25 -- allow a bit of time to breathe
									set pollCount to pollCount + 1
								end repeat
								if theImg is not missing value then
									set theAlt to do JavaScript jsImgAltCommand
									-- filter out bogus alt and set to blank to force fetching post content
									if (theAlt is missing value) or (theAlt starts with "mp-photo-alt[]=") then set theAlt to ""
									-- if there's no alt, make one up from the post content (as reflected in the page title)
									if theAlt = "" then set theAlt to my truncateTitle(do JavaScript jsTitleCommand)
								end if
							end tell
							close current tab of front window
						end tell
					end using terms from
				end if
				-- end Apple's WebKit browsers

				-- report image-less posts clearly instead of failing later on a coercion error
				if (theImg is missing value) or (theImg is "") then error "No image found in post " & thePost

				-- strip any previous photos.m.b url segment (inline thumbs),
				-- e.g. https://photos.micro.blog/50/https://example.com/a.jpg
				if theImg starts with photoProxyPrefix then
					try -- just in case; on any failure theImg is left untouched
						set remainder to text ((length of photoProxyPrefix) + 1) thru -1 of theImg
						if remainder does not start with "http" then
							-- expect a numeric size segment ("50/", "150/", ...) before the real URL
							set slashPos to offset of "/" in remainder
							set sizeSegment to text 1 thru (slashPos - 1) of remainder
							set sizeValue to sizeSegment as integer -- errors unless the segment is a number
							set remainder to text (slashPos + 1) thru -1 of remainder
						end if
						set theImg to remainder
					end try
				end if

				-- create Micro.blog photo thumbnail URL
				set newImg to "https://photos.micro.blog/150/" & theImg as string
				-- and HTML image thumbnail and link; every value is escaped because the attributes
				-- are single-quoted and the alt text/URLs come from arbitrary web pages
				set theHTML to theHTML & return & "<a href='" & my escapeHTMLAttribute(thePost) & "'><img src='" & my escapeHTMLAttribute(newImg) & "' alt='" & my escapeHTMLAttribute(theAlt) & "'></a>"
				set theCount to theCount + 1
			on error theErr number errNum
				-- errNum must be coerced first: integer & text would build a list, not a string
				if errNum is not skipPostErrNum then display dialog (errNum as text) & ": " & theErr
			end try

		end repeat

		--finish HTML document skeleton
		set theHTML to theHTML & return & theHTMLend

		--adjust thumbnail size based on number of photos (& my window width)
		--using else if, the first branch that evaluates to true will be executed
		set replSize to ""
		if theCount > 132 then
			set replSize to "/90/"
		else if theCount > 110 then
			set replSize to "/95/"
		else if theCount > 100 then
			set replSize to "/100/"
		else if theCount > 90 then
			set replSize to "/105/"
		else if theCount > 81 then -- 72+ is slightly cut-off, but useable
			set replSize to "/110/"
		else if theCount > 49 then
			set replSize to "/120/"
		end if
		if replSize is not "" then set theHTML to my replaceThumbSize(theHTML, replSize)

		-- and load it in the browser using data: URI.
		-- "%" and "#" must be percent-encoded first: a raw "#" starts the URI fragment and would
		-- cut the page off, and a raw "%" would be read as the start of an escape sequence.
		set dataBody to my replaceAllText(theHTML, "%", "%25")
		set dataBody to my replaceAllText(dataBody, "#", "%23")
		tell application _browser to open location "data:text/html;charset=utf-8," & dataBody
		-- XXX provide an option to save this to a file (where the returns will be visible)
	end run

	-- Private helper for the run handler: escape text for use inside an HTML attribute value.
	on escapeHTMLAttribute(theText)
		-- "&" has to be handled first so the entities added below are not escaped again
		set theText to my replaceAllText(theText, "&", "&amp;")
		set theText to my replaceAllText(theText, "<", "&lt;")
		set theText to my replaceAllText(theText, ">", "&gt;")
		set theText to my replaceAllText(theText, "\"", "&quot;")
		set theText to my replaceAllText(theText, "'", "&#39;")
		return theText
	end escapeHTMLAttribute

	-- Private helper for the run handler: replace every occurrence of oldText in theText with newText.
	-- Returns theText unchanged if the replacement fails; text item delimiters are always restored.
	on replaceAllText(theText, oldText, newText)
		set savedTIDs to AppleScript's text item delimiters
		try
			set AppleScript's text item delimiters to oldText
			set thePieces to text items of theText
			set AppleScript's text item delimiters to newText
			set theText to thePieces as text
		end try
		set AppleScript's text item delimiters to savedTIDs
		return theText
	end replaceAllText

	-- Returns the short name of the frontmost process, as reported by System Events.
	-- Technique from https://daringfireball.net/2009/01/applescripts_targetting_safari_or_webkit
	on GetCurrentApp()
		tell application "System Events"
			set frontProcess to first process whose frontmost is true
			return short name of frontProcess
		end tell
	end GetCurrentApp

	-- Returns the name of the default web browser (the handler for http URLs).
	-- The original Perl/Internet Config lookup is tried first. If it fails or prints nothing,
	-- a 64-bit-safe lookup asks NSWorkspace (through JavaScript for Automation) which
	-- application opens http URLs and returns that application's name without ".app".
	-- Any error from the fallback lookup is passed on to the caller.
	on GetDefaultWebBrowser()
		-- First line of legacyCmd is a workaround for Snow Leopard issues
		-- with 32-bit Mac:: Carbon modules
		set legacyCmd to "export VERSIONER_PERL_PREFER_32_BIT=yes; " & ¬
			"perl -MMac::InternetConfig -le " & ¬
			"'print +(GetICHelper \"http\")[1]'"
		try
			set browserName to do shell script legacyCmd
			if browserName is not "" then return browserName
		end try -- deliberately ignored: fall through to the NSWorkspace lookup below

		set jxaSource to "ObjC.import('AppKit'); " & ¬
			"$.NSWorkspace.sharedWorkspace.URLForApplicationToOpenURL($.NSURL.URLWithString('http://example.com'))" & ¬
			".lastPathComponent.stringByDeletingPathExtension.js"
		return do shell script "osascript -l JavaScript -e " & quoted form of jxaSource
	end GetDefaultWebBrowser

	-- Truncate a page title into something vaguely usable as alt text.
	-- theAlt: the page title text, or missing value when the title lookup produced no result.
	-- Returns the first ten words followed by "…." when the title has more than ten words,
	-- the title unchanged when it is shorter, and a generic description when it is empty.
	on truncateTitle(theAlt)
		-- a failed lookup would make "count words" error; treat it as an empty title instead
		if theAlt is missing value then set theAlt to ""
		if (count words of theAlt) > 10 then set theAlt to ((text from word 1 to word 10 of theAlt) & "….") as string
		-- if page title is empty (there was no text with the post, only a photo), add a last-ditch alt string
		if theAlt = "" then set theAlt to "Image from Micro.blog; description unavailable."
		return theAlt
	end truncateTitle

	-- Swap every "/150/" segment in theText for theSize (e.g. "/90/").
	-- Returns the rewritten text, or the text as passed in if the replacement fails.
	-- AppleScript's text item delimiters are put back to their previous value either way.
	on replaceThumbSize(theText, theSize)
		set savedDelims to AppleScript's text item delimiters
		set resizedText to theText -- handed back untouched if anything below fails
		try
			-- split on the default size segment ...
			set AppleScript's text item delimiters to "/150/"
			set thePieces to text items of theText
			-- ... and join the pieces again with the requested one
			set AppleScript's text item delimiters to theSize
			set resizedText to thePieces as text
		on error
			set resizedText to theText
		end try
		set AppleScript's text item delimiters to savedDelims
		return resizedText
	end replaceThumbSize