
@JustAnotherArchivist
Last active December 16, 2018 21:43
A small script to monitor the progress of Tumblr grab items

A small script to monitor the currently running Tumblr grab items. For each item, it prints:

  • Process ID
  • Memory usage (RSS)
  • Size of the WARC file
  • RSS divided by WARC size (a measure of the "memory intensity" of a job)
  • Time elapsed since the item started
  • Number of posts that have been retrieved and the total number of posts on the blog (according to Tumblr's API)
  • Percentage of posts retrieved
  • Number of posts whose notes have been retrieved recently

Uses a cache file to store the Tumblr API responses so it doesn't need to ask for the total post count on every run. By default, the cache file path is the script's path with ".cache" appended, but you can specify another path by passing it as the script's first argument.
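The cache file format is minimal: a comment header line, then one "blogname postcount" line appended per blog as it is fetched. A small sketch of how the script reads it back (the blog names and counts here are made up):

```shell
# Sketch of the cache file handling; blog names and counts are hypothetical.
cachefile=demo.cache
echo "# Cache file of API responses for tumblr-monitor" > "$cachefile"
echo "cadaverscorpse 5677" >> "$cachefile"
echo "lostcybertronian 81481" >> "$cachefile"
# The script's awk reader skips the comment line and builds a
# blogname -> post count table; printed here for illustration:
awk '$1 != "#" { print $1 "=" $2 }' "$cachefile"
rm -f "$cachefile"
```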

Dependencies: ps, grep, sed, awk, curl, du, cut, date. I can't tell you exactly which implementations are supported. Developed using procps-ng 3.3.9, GNU grep 2.20, GNU sed 4.2.2, mawk 1.3.3, curl 7.38.0, and GNU coreutils (du/cut/date) 8.23.

Example output:

> ./tumblr-monitor
ITEM                            PID    RSS         WARC         RSS/WARC*1000  TIME        POSTS        POSTS%  NOTEPOSTS
tumblr-blog:cadaverscorpse      17288  525127680   6214921554   84.49          0-21:07:58  5321/5677    93.73   224
tumblr-blog:lostcybertronian    17290  579108864   8792712492   65.86          0-21:07:58  9838/81481   12.07   52
tumblr-blog:shroudedexcitement  17293  409849856   2139065756   191.60         0-21:07:56  5936/6091    97.46   525
tumblr-blog:asianmansex         17304  516493312   10359033114  49.86          0-21:07:38  6842/8051    84.98   4
tumblr-blog:yorkpud             20353  1317564416  36079853345  36.52          1-15:53:33  16019/21060  76.06   4
tumblr-blog:cheshirepussy       20363  263147520   3849364523   68.36          1-15:53:30  6053/6051    100.03  76
tumblr-blog:northeastladschavs  21324  133783552   4298256072   31.13          0-19:53:36  5807/5817    99.83   779
tumblr-blog:drawbauchery        22415  577601536   2801735802   206.16         0-19:35:11  9684/12445   77.81   115
tumblr-blog:palm-wines          29153  1207320576  9878352301   122.22         1-13:13:49  13582/45985  29.54   203

If you want the table sorted, pipe the output to sort. For example, ./tumblr-monitor | sort -k 4,4n sorts by WARC size (the fourth column).
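To see how the key-based numeric sort behaves, here are two made-up rows in the script's column order (not real output):

```shell
# sort -k 4,4n orders the rows numerically by the fourth (WARC size) field:
printf '%s\n' \
  'tumblr-blog:aaa 1 100 9000 11.11 0-01:00:00 10/20 50.00 1' \
  'tumblr-blog:bbb 2 200 800 250.00 0-02:00:00 15/20 75.00 2' |
  sort -k 4,4n
# The "bbb" row (WARC size 800) is printed first.
```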

#!/bin/bash
set -e
## Usage: tumblr-monitor [CACHEFILE]
# The cache file is used for storing the API responses so we don't hammer the Tumblr API.
if [[ $# -eq 1 ]]
then
cachefile="$1"
elif [[ $# -eq 0 ]]
then
cachefile="$(readlink -f "${0}.cache")"
else
echo "Usage: $0 [CACHEFILE]" >&2
exit 1
fi
# Ensure that the cache file exists
if [[ ! -e "${cachefile}" ]]
then
echo "# Cache file of API responses for tumblr-monitor" > "${cachefile}"
fi
{
echo "ITEM PID RSS WARC RSS/WARC*1000 TIME POSTS POSTS% NOTEPOSTS";
ps -C wget-lua --format 'pid,rss,etime,cmd' --no-headers |
grep 'tumblr-blog' |
sed 's,^\s*,,; s,\./wget-lua.* -o \([^ ]\+\) .*--warc-file \([^ ]\+\) .*--warc-header tumblr-blog: \(tumblr-blog:[^ ]\+\).*$,\1 \2.warc.gz \3,' |
awk \
'
FILENAME != "-" {
# First line is a comment to make sure that awk always learns the filename; there is definitely a more elegant way than this...
# $1 = blog name, $2 = post count per API
cachefilename = FILENAME
if ($1 != "#")
postcounts[$1] = $2
}
FILENAME == "-" {
pid = $1
rss = $2 * 1024
etime = $3
logfile = $4
warcfile = $5
item = $6
blogname = substr(item, index(item, ":") + 1)
if (index(etime, "-") == 0)
etime = "0-" etime
# Get size of WARC file
cmd = "du -b " warcfile " | cut -f1"
cmd | getline warcsize; close(cmd)
# Request post count from Tumblr API if necessary
if (!(blogname in postcounts)) {
cmd = "curl -s -A \"Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html; ArchiveTeam)\" \"https://api.tumblr.com/v2/blog/" blogname ".tumblr.com/info?api_key=BUHsuO5U9DF42uJtc8QTZlOmnUaJmBJGuU1efURxeklbdiLn9L\" | grep -Po \"\\\"posts\\\":\\K\\d+\""
cmd | getline cnt; close(cmd)
postcounts[blogname] = cnt
print blogname " " cnt >> cachefilename
}
# Get retrieved post count
cmd = "grep -Po \"/post/\\d+\" " logfile " | awk \"!seen[\\$0]++{n++} END{print n}\"" # Do you even awk?
cmd | getline postsretrieved; close(cmd)
# Get how many posts' notes have been retrieved recently
cmd = "tail -n 1000 " logfile " | grep -A1000 \"^$(date -d \"-5 minutes\" \"+%Y-%m-%d %H:%M\")\" | grep -Po \"/notes/\\d+/\" | awk \"!seen[\\$0]++{n++} END{print n}\""
cmd | getline noteposts; close(cmd)
# Calculate fractions
if (warcsize > 0) rsswarc = sprintf("%.2f", 1000 * rss / warcsize); else rsswarc = "NaN"
if (postcounts[blogname] > 0) postfrac = sprintf("%.2f", 100 * postsretrieved / postcounts[blogname]); else postfrac = "NaN"
# Print; need to use %.0f for RSS and WARC because the values may be larger than both %d and %u. Double precision can accurately represent integers up to 2^53, so that is not a problem (for now...).
printf "%s %d %.0f %.0f %s %s %d/%d %s %d\n", item, pid, rss, warcsize, rsswarc, etime, postsretrieved, postcounts[blogname], postfrac, noteposts
}
' "${cachefile}" -
} | column -t
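The post-count lookup in the script boils down to a grep -Po over the API's JSON; a sketch against a canned response (the JSON below is a made-up stand-in, not a real API reply):

```shell
# Hypothetical stand-in for the Tumblr /info API response body:
json='{"response":{"blog":{"name":"example","posts":5677}}}'
# In a PCRE pattern, \K discards everything matched so far,
# so only the digits after "posts": are printed:
echo "$json" | grep -Po '"posts":\K\d+'
# prints 5677
```

Note that -P (PCRE patterns) requires GNU grep, consistent with the GNU grep 2.20 dependency listed above.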