@ideamonk
Created September 30, 2010 09:48
import urllib2
from BeautifulSoup import BeautifulSoup


def safename(s):
    # Keep only letters and digits so the result is a safe file/directory name.
    return "".join([x for x in s if x.isalpha() or x.isdigit()])


print "Doing..."

mothers = [
    # these pages contain links to pages to be saved
    "http://www.foobarbeep.com/urlinks/foo1.html",
    "http://www.foobarbeep.com/urlinks/foo2.html",
    "http://www.foobarbeep.com/urlinks/foo3.html",
]

for mother in mothers:
    # Write one shell script per index ("mother") page.
    scriptname = safename(mother.split('/')[-1])
    shellscript = open("scrape_%s.sh" % scriptname, "w")

    # Fetch the index page and collect every anchor it contains.
    soup = BeautifulSoup(urllib2.urlopen(mother).read())
    links = soup.findAll('a')

    for link in links:
        dirname = safename(link['href'].split('/')[-1])
        # Mirror the linked page and its requisites into its own directory.
        shellscript.write("wget --directory-prefix ./%s --no-parent --timestamping --convert-links --page-requisites --no-directories --no-host-directories -erobots=off %s\n\n" % (dirname, link['href']))
        # Rename saved .asp files to .html; run in a subshell so the cd does
        # not leak into the commands written for later links.
        shellscript.write("(cd %s && for f in `find . | grep -e '\\.asp$'`; do mv $f $f.html; done)\n" % (dirname))

    shellscript.close()

print "Done."