-
-
Save benlancaster/f563a0a1cfa64c86476a to your computer and use it in GitHub Desktop.
Scrape and sanitise allaboutcircuits.com
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
#
# Scrape allaboutcircuits.com, sanitise the downloaded HTML and convert each
# of the six volumes into an AZW3 e-book.
#
# Pipeline:
#   1. wget mirrors the site into ./www.allaboutcircuits.com/ (and any other
#      allowed sub-domain directories).
#   2. Every *.html file below the current directory is cleaned in place:
#      rogue pseudo-tags are rewritten with sed, the main <article> is
#      extracted with xidel, the result is normalised to XHTML with tidy, and
#      ads / scripts / breadcrumbs are deleted with XmlStarlet.
#   3. calibre's ebook-convert turns vol_1 .. vol_6 into vol1.azw3 .. vol6.azw3.
#
# Required tools: wget, GNU sed (as `gsed` or `sed`), xidel, tidy,
# XmlStarlet (`xml`) and calibre's `ebook-convert`.
#
# Environment:
#   EBOOK_CONVERT  path to ebook-convert (defaults to the macOS calibre.app
#                  location).
#
# `set -e` is deliberately not used: wget exits non-zero when any single URL
# fails, and tidy exits 1 when it merely emits warnings. Failures of the steps
# that would otherwise destroy a file are checked explicitly below.

# Prefer Homebrew's GNU sed (`gsed`); fall back to plain `sed` where that is
# already GNU sed. `-i` without a suffix argument requires GNU sed.
sed_bin=$(command -v gsed || command -v sed) || {
  printf 'error: neither gsed nor sed found in PATH\n' >&2
  exit 1
}

ebook_convert=${EBOOK_CONVERT:-/Applications/calibre.app/Contents/MacOS/ebook-convert}

wget \
  --reject=ogv,mp4,pdf \
  --exclude-domains forum.allaboutcircuits.com \
  --domains=sub.allaboutcircuits.com,allaboutcircuits.com \
  --recursive \
  --span-hosts \
  --level=0 \
  --convert-links \
  --page-requisites \
  --execute=robots=off \
  --no-verbose \
  --no-use-server-timestamps \
  --exclude-directories=videos,worksheets \
  --no-remove-listing \
  http://www.allaboutcircuits.com/

find . -type f -name '*.html' -print0 |
  while IFS= read -r -d '' file; do
    # Clean up rogue markup. All expressions run in order within a single sed
    # process, which is equivalent to (and much cheaper than) one sed run per
    # expression because every substitution is line-local.
    #
    # NOTE(review): '<plusminus)>' and '<Onega>' look like typos, but they may
    # match the rogue markup verbatim - confirm against the scraped pages
    # before changing them.
    # NOTE(review): the final '<points>' rule appears to be a no-op under GNU
    # sed (the replacement '\<points\>' presumably yields '<points>' again);
    # it is kept unchanged because the intended output is not known.
    "$sed_bin" -i \
      -e 's/<The/The/g' \
      -e 's/<DELTA>/\Δ/g' \
      -e 's/<SIGMA>/\Σ/g' \
      -e 's/<PI>/\Π/g' \
      -e 's/<sp>//g' \
      -e 's/<\/sp>//g' \
      -e 's/<plusminus)>/±/g' \
      -e 's/<superscript>/<sup>/g' \
      -e 's/<\/superscript>/<\/sup>/g' \
      -e 's/<italic>/<i>/g' \
      -e 's/<\/italic>/<\/i>/g' \
      -e 's/<Onega>/\Ω/g' \
      -e 's/<phi-2>/\φ/g' \
      -e 's/<hypertarget>diodeparameter<\/hypertarget>//g' \
      -e 's/<pageref>03442.png<\/pageref>//g' \
      -e 's/<points>/\<points\>/g' \
      "$file"

    # Keep only the main article. The output is captured first so the file is
    # never truncated while xidel is still reading it, and it is only
    # overwritten when xidel succeeded.
    if ! output=$(xidel "$file" --extract "//article[@class='articlemain']" \
      --input-format=html --output-format=xml); then
      printf 'warning: xidel failed on %s; file left unchanged\n' "$file" >&2
      continue
    fi
    # printf with a quoted expansion preserves whitespace and prevents the
    # shell from glob-expanding characters such as '*' in the page content.
    printf '%s\n' "$output" > "$file"

    # Normalise to well-formed XHTML in place so XmlStarlet can parse it.
    tidy -q \
      --show-warnings false \
      --drop-proprietary-attributes true \
      --numeric-entities true \
      --add-xml-decl false \
      --hide-comments true \
      --doctype omit \
      -asxml \
      -modify \
      -indent \
      "$file"

    # Delete ad containers, scripts and breadcrumb navigation.
    if ! output=$(xml ed -N x=http://www.w3.org/1999/xhtml \
      -d "//x:div[@align='google-ads']|//x:script|//x:div[@id='google-ads']|//x:ul[contains(@class,'breadcrumb')]" \
      "$file"); then
      printf 'warning: xml ed failed on %s; file left unchanged\n' "$file" >&2
      continue
    fi
    printf '%s\n' "$output" > "$file"
  done

# Convert each of the six volumes, starting from its index page.
for i in {1..6}; do
  "$ebook_convert" "www.allaboutcircuits.com/vol_${i}/index.html" "vol${i}.azw3"
done
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Tested on Mac OS X 10.9.

The `gsed` lines are necessary to clean up some rogue markup in the source files.

Notes:
- `gsed` is Homebrew-installed GNU sed; *nix users should just use `sed`.
- `tidy` is the W3C HTML5 fork of the Tidy library (installed with `brew install --HEAD tidy`).
- `xml` is XmlStarlet (`brew install xmlstarlet`).
- `xidel` and `wget` are also from Homebrew (`brew install xidel` and `brew install wget` respectively).