Instantly share code, notes, and snippets.
Created
July 12, 2012 05:12
-
Star
(1)
1
You must be signed in to star a gist -
Fork
(0)
0
You must be signed in to fork a gist
-
Save rsvp/3095975 to your computer and use it in GitHub Desktop.
tagscrape.sh : scrape content between given html tag. | Linux bash script.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash | |
# bash 4.1.5(1) Linux Ubuntu 10.04 Date : 2012-07-11 | |
# | |
# _______________| tagscrape : echo content(s) between html tag pair. | |
# | |
# Usage: tagscrape [tag] [file/URL] | |
# # Output will exclude the tags themselves. | |
# # Default for file is stdin, so pipe will work. | |
# | |
# Examples: % tagscrape pre foo.html | |
# # Prints between <pre> and </pre>, | |
# # contents delimited by "@_@" on newline. | |
# % tagscrape b http://bar.com/foo | keywords | |
# # Pass bold <b> contents to find keywords. | |
# | |
# Dependencies: sed, awk | |
# curl | |
# CHANGE LOG LATEST version available: https://bitbucket.org/rsvp/gists/src | |
# 2012-07-11 Add URL option for temporary download. | |
# 2012-07-10 Enforce single tag per line. | |
# 2012-07-09 First version. | |
# _____ PREAMBLE_v2: settings, variables, and error handling. | |
# | |
LC_ALL=POSIX
#      ^locale means "ASCII, US English, no special rules,
#       output per ISO and RFC standards."
#       Esp. use ASCII encoding for glob and sorting characters.
#       NOTE: this is a plain assignment; child processes inherit it
#       only if LC_ALL was already exported by the caller.
shopt -s extglob
#     ^set extended glob for pattern matching.
shopt -s failglob
#     ^failed pattern matching signals error.
set -e
#   ^errors checked: immediate exit if a command has non-zero status.
set -u
#   ^unassigned variables shall be errors.

#      Example of default VARIABLE ASSIGNMENT:  arg1=${1:-'foo'}
tag=${1:-'pre'}
#   ^tag assumes <tag> and </tag> pairing.
file=${2:-'-'}
#    ^second argument is a file name or URL; default '-' reads stdin.
program=${0##*/}   #  similar to using basename

#  Temporary files go to /dev/shm when that directory is usable;
#  otherwise fall back to $TMPDIR (or /tmp) so the script still runs
#  on systems that lack /dev/shm.
if [ -d /dev/shm ] && [ -w /dev/shm ] ; then
     tmproot=/dev/shm
else
     tmproot=${TMPDIR:-/tmp}
fi
#  The template is quoted because $program may contain spaces or glob
#  characters, and failglob (set above) makes an unmatched glob an error.
memf=$( mktemp "${tmproot}/88_${program}_tmp.XXXXXXXXXX" )
mem2=$( mktemp "${tmproot}/88_${program}_tmp.XXXXXXXXXX" )
cleanup () {
     #  Delete temporary files, then optionally exit given status.
     #  Usage:  cleanup [status]
     #          status defaults to 0; the special value -1 means
     #          "delete only, do not exit" (this is how die calls it).
     #  Globals read:  memf, mem2  (paths of the temporary files).
     local status=${1:-'0'}
     #  Paths are quoted: they embed the script name, which may contain
     #  spaces or glob characters.
     rm -f -- "$memf" "$mem2"
     [ "$status" = '-1' ] || exit "$status"   #  thus -1 prevents exit.
} #--------------------------------------------------------------------
warn () {
     #  Message with basename to stderr.          Usage:  warn "message"
     #  Globals read:  program  (script basename used as prefix).
     #  printf with %s prints the message literally; echo -e would expand
     #  backslash sequences contained in it (e.g. from a user-given tag).
     printf '\n !! %s: %s \n' "$program" "${1:-}" >&2
} #--------------------------------------------------------------------
die () {
     #  Exit with status of most recent command or custom status, after
     #  cleanup and warn.      Usage:  command || die "message" [status]
     #  $1  message passed to warn.
     #  $2  optional exit status; defaults to $? as it was on entry.
     local status=${2:-$?}
     #  The steps are deliberately not chained with &&: a failure while
     #  cleaning up or printing (e.g. stderr closed) must never prevent
     #  the exit below, nor replace the requested status.
     cleanup -1    || true
     warn "${1:-}" || true
     exit "$status"
} #--------------------------------------------------------------------
trap "die 'SIG disruption, but cleanup finished.' 114" 1 2 3 15
#    Cleanup after INTERRUPT: 1=SIGHUP, 2=SIGINT, 3=SIGQUIT, 15=SIGTERM
trap 'cleanup -1' EXIT
#    Also delete the temporary files on every other exit path, e.g. when
#    set -e or set -u aborts the script. The -1 argument makes cleanup
#    delete without calling exit, so the original exit status is kept.
# | |
# _______________ :: BEGIN Script :::::::::::::::::::::::::::::::::::::::: | |
#  Given a URL as argument, download it as a temporary file
#  (assuming that it is html source code).
#  Only an explicit http:// or https:// prefix counts as a URL, so a
#  local file whose name merely begins with "http" is read from disk.
case "$file" in
     http://*|https://*)
          #  -sS: no progress meter, but errors are still reported.
          #  On failure die prints a message and exits with curl's status.
          curl -sS -L "$file" > "$mem2"  ||  die "cannot download: $file"
          file=$mem2
          ;;
esac

#  First sed enforces single tag per line, because they may be
#        in the midst of other tags on a long line.
#        A @@@ marker is placed at the end of content between tag.
#        This procedure may introduce blank lines in the content
#        if in fact the tags are already one per line. [*side-effect]
#  Second sed prints the content with tag pairing.
#        Option -n prevents duplicate lines from p print operation.
#        ALL qualified tag pairings are printed.
#  Third sed deletes the tags to show only the content.
#  NOTE: $tag is interpolated into the sed expressions as-is, so it is
#        treated as a regular expression, not as a literal string.
sed -e "s/<$tag>/\n<$tag>\n/g" \
    -e "s/<\/$tag>/@@@\n<\/$tag>\n/g"  "$file" \
     | sed -n -e "/<$tag>/,/<\/$tag>/p" \
     | sed    -e "/<$tag>/d" -e "/<\/$tag>/d"  >  "$memf"

#  An empty result means no <tag>...</tag> pair was found.
[ -s "$memf" ]  ||  die "got nothing. Try capitalizing tag: $tag" 113
#  awk will correct *side-effect noted above...
#       @@@ is useful as the record separator RS,
#       \n  will be the field separator FS,
#       and NF is the number of fields.
#  Within each record an empty first or last field is skipped; all
#  other fields are printed one per line, followed by a line "@_@".
#  The awk program is single-quoted, so $i needs no backslash escape.
awk '
     BEGIN { FS = "\n" ; RS = "@@@\n" }
     {
          for (i = 1; i <= NF; i++)
               if ( ! ((i == 1 || i == NF) && $i == "") )
                    print $i
          print "@_@"
     }
' "$memf"  >  "$mem2"
#  Thus @_@ becomes the new RECORD SEPARATOR between contents.
#  awk's logic looks over-engineered, but it's very readable
#  compared to an equivalent operation in sed :-)

cat "$mem2"

# # # WE ARE DONE -- but we shall include the slippery code below as
# # # =========== REFERENCE to make any FURTHER TRANSFORMATIONS:
# #
# { awk -f - $mem2 <<EOHereDoc
# BEGIN { FS = "\n" ; RS = "@_@\n" }
# # Not i<=NF -- this is the tricky part.
# { for (i=1; i<NF; i++)
# { if ( 0 == 0 )
# print \$i
# }
# { print "__@__" }
# }
# EOHereDoc
# }
# # Thus __@__ becomes the new RECORD SEPARATOR between contents.

#  Delete the temporary files and exit with status 0.
cleanup
# _______________ EOS :: END of Script :::::::::::::::::::::::::::::::::::::::: | |
# vim: set fileencoding=utf-8 ff=unix tw=78 ai syn=sh : |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
There is a spelling error on line 13, which should be tagscrape instead of tagscape.
However, an example script. Thank you for your work!