rsvp/tagscrape.sh

## tagscrape.sh
#!/usr/bin/env bash
#              bash 4.1.5(1)     Linux Ubuntu 10.04           Date : 2012-07-11
#
# _______________|  tagscrape : echo content(s) between html tag pair.
#
#           Usage:  tagscrape  [tag] [file/URL]
#                   #  Output will exclude the tags themselves.
#                   #  Default for file is stdin, so pipe will work.
#
#        Examples:  % tagscrape pre foo.html
#                   #    Prints between <pre> and </pre>,
#                   #    contents delimited by "@_@" on newline.
#                   % tagscrape b http://bar.com/foo | keywords
#                   #    Pass bold <b> contents to find keywords.
#
#    Dependencies:  sed, awk
#                   curl


#  CHANGE LOG  LATEST version available:   https://bitbucket.org/rsvp/gists/src
#  2012-07-11  Add URL option for temporary download.
#  2012-07-10  Enforce single tag per line.
#  2012-07-09  First version.


#           _____ PREAMBLE_v2: settings, variables, and error handling.
#
LC_ALL=POSIX
#      POSIX locale means "ASCII, US English, no special rules,
#      output per ISO and RFC standards."
#      Esp. use ASCII encoding for glob and sorting characters.
#      NOTE: the variable is not exported on this line, so child
#      processes see it only if LC_ALL was already in the environment.
shopt -s   extglob
#     ^set extended glob for pattern matching.
shopt -s   failglob
#         ^a glob that matches nothing signals an error.
set -e
#   ^errors checked: immediate exit if a command has non-zero status.
set -u
#   ^unassigned variables shall be errors.
#    Example of default VARIABLE ASSIGNMENT:  arg1=${1:-'foo'}

tag=${1:-'pre'}
#         tag assumes <tag> and </tag> pairing.
file=${2:-'-'}
#         '-' stands for stdin, so the script works at the end of a pipe.


program=${0##*/}   #  similar to using basename

#  Temporary files go to /dev/shm when that directory is usable;
#  otherwise fall back to $TMPDIR, then /tmp, so that the script also
#  runs on systems without /dev/shm.
memdir=/dev/shm
if [ ! -d "$memdir" ]  ||  [ ! -w "$memdir" ] ; then
     memdir=${TMPDIR:-/tmp}
fi

#  The template is quoted: a program name containing blanks or glob
#  characters must reach mktemp as one literal word (unquoted, failglob
#  could abort the script right here).
memf=$( mktemp "${memdir}/88_${program}_tmp.XXXXXXXXXX" )
#  Should the second mktemp fail, do not leave the first file behind.
mem2=$( mktemp "${memdir}/88_${program}_tmp.XXXXXXXXXX" )  \
     ||  { rm -f -- "$memf" ; exit 1 ; }


cleanup () {
     #  Delete both temporary files, then exit with the given status.
     #  Usage: cleanup [status]
     #         status defaults to 0; the special value -1 suppresses
     #         the exit so that the caller carries on (die relies on it).
     local status=${1:-0}
     #  Quoted, so a path containing blanks or glob characters is
     #  removed as one file; -- ends option parsing.
     rm -f -- "$memf" "$mem2"
     [ "$status" = '-1' ]  ||  exit "$status"   #  thus -1 prevents exit.
} #--------------------------------------------------------------------
warn () {
     #  Write "message", labelled with the script's basename, to stderr.
     #  Backslash escapes in the text are expanded.  Usage: warn "message"
     local text="\n !!  ${program}: $1 "
     printf '%b\n' "$text"  1>&2
} #--------------------------------------------------------------------
die () {
     #  Clean up, warn on stderr, then exit -- unconditionally.
     #  Usage: command || die "message" [status]
     #         status defaults to the exit status of the most recent
     #         command, captured before anything else runs in here.
     local status=${2:-$?}
     #  The exit must not hinge on the helpers succeeding: chained with
     #  &&, a failing cleanup or a failed write to stderr would skip the
     #  exit, and die would merely return instead of ending the script
     #  with the requested status.
     cleanup -1  ||  true
     warn "$1"   ||  true
     exit "$status"
} #--------------------------------------------------------------------
#    On SIGHUP, SIGINT, SIGQUIT, or SIGTERM: hand over to die with
#    status 114 and the message below.
trap 'die "SIG disruption, but cleanup finished." 114'  HUP INT QUIT TERM
#
# _______________     ::  BEGIN  Script ::::::::::::::::::::::::::::::::::::::::


#  Given a URL as argument, download it into a temporary file
#  (assuming that it is html source code) and scrape that file instead.
#  Only a real http:// or https:// prefix counts as a URL, so a local
#  file whose name merely begins with "http" is still read as a file.
case "$file" in
     http://*|https://*)
          #  Under set -e a failing curl would end the script on the
          #  spot and leave both temporary files behind; die removes
          #  them first and passes on curl's exit status.
          curl -s -L "$file" > "$mem2"  \
               ||  die "curl could not download: $file"
          file=$mem2
          ;;
esac


#  First sed enforces a single tag per line, because tags may sit
#       in the midst of other tags on a long line.
#       A @@@ marker is placed at the end of the content between tags.
#       This procedure may introduce blank lines in the content
#       if in fact the tags are already one per line. [*side-effect]
#  Second sed prints the lines from each <tag> through its </tag>.
#       Option -n suppresses the automatic print, so lines appear once.
#       ALL qualified tag pairings are printed.
#  Third sed deletes the tag lines to leave only the content.
#  NOTE: $tag is spliced into the sed expressions as-is, hence it is
#        interpreted as part of a regular expression.

sed -e "s/<$tag>/\n<$tag>\n/g"                            \
    -e "s/<\/$tag>/@@@\n<\/$tag>\n/g"           "$file"   \
     |  sed -n -e "/<$tag>/,/<\/$tag>/p"                  \
     |  sed    -e "/<$tag>/d" -e "/<\/$tag>/d" > "$memf"


#  An empty result means no <tag>...</tag> pair was found.
[ -s "$memf" ]  ||  die "got nothing. Try capitalizing tag: $tag" 113


#  awk will correct *side-effect noted above...
#      @@@ followed by newline is the record separator RS,
#      \n  is the field separator FS, so each line is one field,
#      and NF is the number of fields (lines) in the record.
#  Within a record, an empty first or last line is dropped; every
#  other line is printed, and each record is closed by a @_@ line.
#  The program is passed in single quotes, so its $k needs no
#  escaping from the shell.
#
awk '
     BEGIN { RS = "@@@\n" ; FS = "\n" }
     {
          for (k = 1; k <= NF; k++) {
               edge = (k == 1 || k == NF)
               if ( !(edge && $k == "") )
                    print $k
          }
          print "@_@"
     }
' "$memf" > "$mem2"
#             Thus @_@ becomes the new RECORD SEPARATOR between contents.

#   awk's logic looks over-engineered, but it's very readable
#               compared to an equivalent operation in sed     :-)


#  Send the reformatted contents to stdout (path quoted against
#  word splitting).
cat "$mem2"
#  #  #   WE ARE DONE -- but we shall include the slippery code below as
#  #  #   ===========    REFERENCE to make any FURTHER TRANSFORMATIONS:
#  #
#  { awk -f - $mem2 <<EOHereDoc
#       BEGIN { FS = "\n" ; RS = "@_@\n" }
#       #       Not i<=NF -- this is the tricky part.
#       { for (i=1; i<NF; i++)
#            { if ( 0 == 0 )
#                 print \$i
#            }
#            { print "__@__"  }
#       }
#  EOHereDoc
#  }
#  #             Thus __@__ becomes the new RECORD SEPARATOR between contents.


#  Normal completion: cleanup called without a status removes the
#  temporary files and exits 0.
cleanup
# _______________ EOS ::  END of Script ::::::::::::::::::::::::::::::::::::::::

#  vim: set fileencoding=utf-8 ff=unix tw=78 ai syn=sh :
	#!/usr/bin/env bash
	# bash 4.1.5(1) Linux Ubuntu 10.04 Date : 2012-07-11
	#
	# _______________| tagscrape : echo content(s) between html tag pair.
	#
	# Usage: tagscrape [tag] [file/URL]
	# # Output will exclude the tags themselves.
	# # Default for file is stdin, so pipe will work.
	#
	# Examples: % tagscrape pre foo.html
	# # Prints between <pre> and </pre>,
	# # contents delimited by "@_@" on newline.
	# % tagscrape b http://bar.com/foo | keywords
	# # Pass bold <b> contents to find keywords.
	#
	# Dependencies: sed, awk
	# curl


	# CHANGE LOG LATEST version available: https://bitbucket.org/rsvp/gists/src
	# 2012-07-11 Add URL option for temporary download.
	# 2012-07-10 Enforce single tag per line.
	# 2012-07-09 First version.


	# _____ PREAMBLE_v2: settings, variables, and error handling.
	#
	LC_ALL=POSIX
	# locale means "ASCII, US English, no special rules,
	# output per ISO and RFC standards."
	# Esp. use ASCII encoding for glob and sorting characters.
	shopt -s extglob
	# ^set extended glob for pattern matching.
	shopt -s failglob
	# ^failed pattern matching signals error.
	set -e
	# ^errors checked: immediate exit if a command has non-zero status.
	set -u
	# ^unassigned variables shall be errors.
	# Example of default VARIABLE ASSIGNMENT: arg1=${1:-'foo'}

	tag=${1:-'pre'}
	# tag assumes <tag> and </tag> pairing.
	file=${2:-'-'}


	program=${0##*/} # similar to using basename
	memf=$( mktemp /dev/shm/88_${program}_tmp.XXXXXXXXXX )
	mem2=$( mktemp /dev/shm/88_${program}_tmp.XXXXXXXXXX )


	cleanup () {
	# Delete temporary files, then optionally exit given status.
	local status=${1:-'0'}
	rm -f $memf $mem2
	[ $status = '-1' ] || exit $status # thus -1 prevents exit.
	} #--------------------------------------------------------------------
	warn () {
	# Message with basename to stderr. Usage: warn "message"
	echo -e "\n !! ${program}: $1 " >&2
	} #--------------------------------------------------------------------
	die () {
	# Exit with status of most recent command or custom status, after
	# cleanup and warn. Usage: command || die "message" [status]
	local status=${2:-"$?"}
	cleanup -1 && warn "$1" && exit $status
	} #--------------------------------------------------------------------
	trap "die 'SIG disruption, but cleanup finished.' 114" 1 2 3 15
	# Cleanup after INTERRUPT: 1=SIGHUP, 2=SIGINT, 3=SIGQUIT, 15=SIGTERM
	#
	# _______________ :: BEGIN Script ::::::::::::::::::::::::::::::::::::::::


	# Given an URL as argument, download it as a temporary file
	# (assuming that it is html source code):
	if [ "${file:0:4}" = 'http' ] ; then
	curl -s -L "$file" > $mem2
	file=$mem2
	fi


	# First sed enforces single tag per line, because they may be
	# in the midst of other tags on a long line.
	# A @@@ marker is placed at the end of content between tag.
	# This procedure may introduce blank lines in the content
	# if in fact the tags are already one per line. [*side-effect]
	# Second sed prints the content with tag pairing.
	# Option -n prevents duplicate lines from p print operation.
	# ALL qualified tag pairings are printed.
	# Third sed deletes the tags to show only the content.

	sed -e "s/<$tag>/\n<$tag>\n/g" \
	-e "s/<\/$tag>/@@@\n<\/$tag>\n/g" "$file" \
	| sed -n -e "/<$tag>/,/<\/$tag>/p" \
	| sed -e "/<$tag>/d" -e "/<\/$tag>/d" > $memf


	[ -s $memf ] || die "got nothing. Try capitalizing tag: $tag" 113


	# awk will correct *side-effect noted above...
	# @@@ is useful as the record separator RS,
	# \n will be the field separator FS,
	# and NF is the number of fields.
	#
	{ awk -f - $memf <<EOHereDoc
	BEGIN { FS = "\n" ; RS = "@@@\n" }
	{ for (i=1; i<=NF; i++)
	{ if ( ! ((i == 1 && \$i == "") || (i == NF && \$i == "")) )
	print \$i
	}
	{ print "@_@" }
	}
	EOHereDoc
	} > $mem2
	# Thus @_@ becomes the new RECORD SEPARATOR between contents.

	# awk's logic looks over-engineered, but it's very readable
	# compared to an equivalent operation in sed :-)



	cat $mem2
	# # # WE ARE DONE -- but we shall include the slippery code below as
	# # # =========== REFERENCE to make any FURTHER TRANSFORMATIONS:
	# #
	# { awk -f - $mem2 <<EOHereDoc
	# BEGIN { FS = "\n" ; RS = "@_@\n" }
	# # Not i<=NF -- this is the tricky part.
	# { for (i=1; i<NF; i++)
	# { if ( 0 == 0 )
	# print \$i
	# }
	# { print "__@__" }
	# }
	# EOHereDoc
	# }
	# # Thus __@__ becomes the new RECORD SEPARATOR between contents.



	cleanup
	# _______________ EOS :: END of Script ::::::::::::::::::::::::::::::::::::::::

	# vim: set fileencoding=utf-8 ff=unix tw=78 ai syn=sh :