@doubleirish
Created February 1, 2017 17:57
Bash script that automatically logs in to a protected target page, extracts all relative URLs from that page, crawls each child page, scrapes an HTML table from it, and aggregates the results into a single summary file.
export OS_USERNAME="yourUser"
export OS_PASSWORD="yourPass"
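# A hedged alternative (sketch): rather than exporting the password inline, the
# credentials could be sourced from a permission-restricted file so they never
# appear in shell history. The file name ~/.confluence_creds is hypothetical.
# [ -f ~/.confluence_creds ] && . ~/.confluence_creds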
# Log in to the protected page (your form-data parameter names may differ),
# find relative URLs, convert them to absolute URLs, and store them in a local file.
wget -O- --post-data "os_username=${OS_USERNAME}&os_password=${OS_PASSWORD}" \
     "https://www.example.com/display/IT/Release+Management" \
  | grep href \
  | sed -n 's/.*href="\([^"]*\)".*/https:\/\/www.example.com\1/p' \
  | sort -u > urls.txt
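# For reference, a curl equivalent of the login request above (a sketch; it
# assumes the same Confluence-style os_username/os_password form fields):
# curl -s --data "os_username=${OS_USERNAME}&os_password=${OS_PASSWORD}" \
#      "https://www.example.com/display/IT/Release+Management"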
# Iterate through the file of URLs and aggregate screen scrapes of the selected XPath elements on each page into a single summary file.
echo "<html><head><title>summary</title></head><body>" >summary.html;
while read -r url; do
echo "parsing $url ...";
echo "<h2> $url </h2> <table border='1'> " >>summary.html;
# Download each URL, auto-logging into the protected source.
wget -O- --post-data "os_username=${OS_USERNAME}&os_password=${OS_PASSWORD}" "$url" > page.html ;
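# A hedged guard (sketch): skip pages that came back empty rather than feeding
# them to xmllint; uncomment to enable.
# [ -s page.html ] || { echo "empty download, skipping: $url" >&2; continue; }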
# Use xmllint's XPath support to extract a target element, e.g. the contents of the table following a specific h2 heading.
xmllint --html --htmlout --recover --nowarning --format --xpath "//h2[text()='Production Issues Encountered']/following::table/node()" page.html 2>/dev/null >>summary.html;
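# Illustrative sketch of the XPath in isolation (the sample HTML here is made
# up). Note that following::table matches every table after the h2; appending
# [1], i.e. following::table[1], would restrict the match to the first table only.
# echo "<h2>Production Issues Encountered</h2><table><tr><td>sev1 outage</td></tr></table>" \
#   | xmllint --html --xpath "//h2[text()='Production Issues Encountered']/following::table/node()" - 2>/dev/null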
echo "</table> " >>summary.html;
done < urls.txt
echo "</body></html>" >>summary.html;
sed -i 's/[^[:print:]]//g' summary.html; # strip non-printable characters left over from scraping (note: this also removes tabs)
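# Optional sanity check (a sketch): confirm the aggregated output still parses
# as HTML; any structural errors are reported on stderr.
# xmllint --html --noout summary.html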