Skip to content

Instantly share code, notes, and snippets.

@nawb
Created September 17, 2016 16:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nawb/1ba0f8b9e11f02d3a2dc05e7bbd64db7 to your computer and use it in GitHub Desktop.
Save nawb/1ba0f8b9e11f02d3a2dc05e7bbd64db7 to your computer and use it in GitHub Desktop.
Downloads all course content from a URL.
'''
Give a website URL as the first command-line argument.
Downloads the files it links to (pdfs/pptx) into the files/ folder.
For the future:
- download files as-is, keeping the directory structure
- create our own directory structure based on filenames (lec01 goes to /lec, q1/p1 to /p, etc.)
'''
# Prefix for every downloaded file's path. It is concatenated directly with
# the file name when the file is opened, so the trailing slash is required.
SAVETODIR="files/"
import sys, os
from urllib2 import *
from re import search, match, findall
from pprint import pprint
from time import sleep
# The page to scan is the first command-line argument.
try:
    baseurl = sys.argv[1]
except IndexError:
    # Only a missing argument is expected here; anything else should surface
    # instead of being hidden behind the usage message.
    print("Give a url")
    sys.exit(-1)
# Regex fragments that are concatenated to match one anchor tag:
#   href="<url>">  <link text>  </a>
# Raw strings keep \s and \S as regex escapes rather than (invalid) string
# escapes; the resulting pattern text is unchanged.
# NOTE: the pattern requires `">` right after the URL, so anchors that carry
# further attributes after href (e.g. class="...") are not matched.
hreftag = r'href\s*=\s*"\s*(?P<url>\S+)\s*">'
linktext = r'(?P<text>.+?)'
tagend = r'</a\s*>'
# Fetch the index page and collect every (href, link text) pair found on it.
# allfiles is bound up front so the download loop further down still has
# something to iterate over (nothing) when the request fails.
allfiles = []
try:
    page = urlopen(baseurl)
    try:
        allfiles = findall(hreftag + linktext + tagend, page.read())
    finally:
        # Release the connection whether or not reading/parsing succeeded.
        page.close()
    if not allfiles:
        print("Found nothing")
except HTTPError as e:
    # HTTPError must be caught before URLError, which is its base class.
    print("HTTP Error: %s %s" % (e.code, baseurl))
except URLError as e:
    print("URL Error: %s %s" % (e.reason, baseurl))
# Blank line separating the scan output from the per-file download output.
print("")
def getFileName(filepath):
    """Return the file name component of a link target.

    Args:
        filepath: An href value such as "slides/lec01.pdf" or "lec01.pdf".

    Returns:
        The text after the last "/" of the path. Any "#fragment" or "?query"
        suffix is removed first so it does not end up in the saved file name,
        and trailing slashes are ignored ("slides/" gives "slides"). Returns
        "" when the path contains no name at all.
    """
    # Cut the fragment before the query: a "?" inside a fragment is not a query.
    path = filepath.split("#", 1)[0].split("?", 1)[0]
    return path.rstrip("/").rsplit("/", 1)[-1]
def downloadFile(url, filename):
    """Download *url* to SAVETODIR + filename, printing progress as it goes.

    Args:
        url: Address handed to urlopen().
        filename: Name of the file to create under SAVETODIR.

    Raises:
        Whatever urlopen() raises for a failed request, and IOError/OSError
        when the target file or its folder cannot be created or written.
    """
    target = SAVETODIR + filename
    # open() does not create missing folders, so make sure the target's
    # folder exists before the first write.
    target_dir = os.path.dirname(target)
    if target_dir and not os.path.isdir(target_dir):
        os.makedirs(target_dir)

    response = urlopen(url)
    try:
        # Content-Length is optional (e.g. chunked responses); a missing or
        # malformed header must not abort the download.
        try:
            file_size = int(response.info().get("Content-Length"))
        except (TypeError, ValueError):
            file_size = None
        print("Downloading: %s Bytes: %s"
              % (filename, "unknown" if file_size is None else file_size))

        file_size_dl = 0
        block_sz = 8192
        # "with" closes the file even if a read or write fails midway.
        with open(target, 'wb') as out:
            while True:
                chunk = response.read(block_sz)
                if not chunk:
                    break
                file_size_dl += len(chunk)
                out.write(chunk)
                if file_size:
                    status = r"%10d [%3.2f%%]" % (
                        file_size_dl, file_size_dl * 100. / file_size)
                else:
                    # No usable total size: report the byte count only.
                    status = "%10d" % file_size_dl
                # Trailing backspaces move the cursor back so the next
                # status line overwrites this one.
                status = status + chr(8) * (len(status) + 1)
                sys.stdout.write(status)
                sys.stdout.flush()
    finally:
        response.close()
# Download every file that the scanned page links to relatively.
# urljoin lives in urlparse on Python 2 and in urllib.parse on Python 3.
try:
    from urlparse import urljoin
except ImportError:
    from urllib.parse import urljoin

for thisfile in allfiles:
    filepath = thisfile[0]  # each entry is (href, link text); only the href is used

    # Skip anything that is not a file relative to the scanned page: absolute
    # links (http://, https://, ftp://, ...), in-page anchors and mailto: /
    # javascript: pseudo-links.
    if "://" in filepath or filepath.startswith(("#", "mailto:", "javascript:")):
        continue

    filename = getFileName(filepath)
    if not filename:
        # The href has no file name component, so there is nothing to save it as.
        continue

    # Resolve the href against the page URL. Plain concatenation only works
    # when baseurl ends in "/" and the href has no leading "/".
    fileurl = urljoin(baseurl, filepath)
    try:
        downloadFile(fileurl, filename)
    except HTTPError as e:
        # Report the failure and carry on with the remaining files.
        print("HTTP Error: %s %s" % (e.code, fileurl))
    except URLError as e:
        print("URL Error: %s %s" % (e.reason, fileurl))
    except (IOError, OSError) as e:
        print("Could not save %s: %s" % (filename, e))
    # Pause between requests so the server is not hammered.
    sleep(1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment