Created
September 17, 2016 16:23
-
-
Save nawb/1ba0f8b9e11f02d3a2dc05e7bbd64db7 to your computer and use it in GitHub Desktop.
Downloads all course content from a url
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
Usage: pass the URL of a web page as the first command-line argument.
Downloads the files it links to (e.g. pdfs/pptx) into the files/ folder.
For the future:
- download files as is, keeping directory structure
- create own directory structure based on filenames (lec01 goes to /lec, q1/p1 to /p, etc)
'''
SAVETODIR="files/" | |
import sys, os | |
from urllib2 import * | |
from re import search, match, findall | |
from pprint import pprint | |
from time import sleep | |
# The page to scrape is given as the first command-line argument.
try:
    baseurl = sys.argv[1]
except IndexError:
    # Only a missing argument is expected here; any other error should
    # surface instead of being reported as a usage problem.
    print("Give a url")
    sys.exit(-1)
# Regex fragments that together match an anchor of the form
#   href="<url>">link text</a>
# NOTE: the closing quote must be followed directly by '>', so only anchors
# whose href is the last attribute of the tag are matched.
hreftag = r'href\s*=\s*"\s*(?P<url>\S+)\s*">'
linktext = r'(?P<text>.+?)'
tagend = r'</a\s*>'

# Start with "no links" so code that iterates over allfiles later still works
# when the page could not be fetched.
allfiles = []
try:
    page = urlopen(baseurl)
    try:
        # The pattern has two groups, so findall yields (url, text) tuples.
        allfiles = findall(hreftag + linktext + tagend, page.read())
    finally:
        page.close()
    if not allfiles:
        print("Found nothing")
except HTTPError as e:
    # Report the URL that was actually requested (baseurl).
    print("HTTP Error: %s %s" % (e.code, baseurl))
except URLError as e:
    print("URL Error: %s %s" % (e.reason, baseurl))
def getFileName(filepath):
    """Return the part of filepath after its last "/".

    A path containing no "/" is returned unchanged; a path that ends in "/"
    yields an empty string.
    """
    # rpartition gives ('', '', filepath) when there is no separator, so the
    # last element is the right answer in both cases.
    _directory, _slash, basename = filepath.rpartition("/")
    return basename
def downloadFile(url, filename):
    """Download url into SAVETODIR/filename, printing progress to stdout.

    Args:
        url: URL of the resource to fetch (passed to urlopen).
        filename: Name of the file to create inside SAVETODIR.

    Raises:
        Errors from urlopen() or from opening/writing the target file are
        not caught here; the caller decides whether a failure is fatal.
    """
    # open() does not create missing directories.
    if SAVETODIR and not os.path.isdir(SAVETODIR):
        os.makedirs(SAVETODIR)

    u = urlopen(url)
    try:
        # Content-Length is optional, so the total size may be unknown.
        try:
            file_size = int(u.info().getheader("Content-Length"))
        except (TypeError, ValueError):
            file_size = None
        print("Downloading: %s Bytes: %s"
              % (filename, file_size if file_size is not None else "unknown"))

        file_size_dl = 0
        block_sz = 8192
        with open(os.path.join(SAVETODIR, filename), 'wb') as f:
            while True:
                chunk = u.read(block_sz)
                if not chunk:
                    break
                file_size_dl += len(chunk)
                f.write(chunk)
                if file_size:
                    status = r"%10d [%3.2f%%]" % (
                        file_size_dl, file_size_dl * 100. / file_size)
                else:
                    # Unknown or zero total: a percentage cannot be computed.
                    status = "%10d" % file_size_dl
                # Carriage return redraws the progress on the same line.
                sys.stdout.write("\r" + status)
                sys.stdout.flush()
        sys.stdout.write("\n")
    finally:
        u.close()
# Python 2 standard library; resolves an href against the page URL the way a
# browser would (handles "/abs/path", "../x" and pages without trailing "/").
from urlparse import urljoin

# Download every scraped link that points at a file relative to the page.
for filepath, _linktext in allfiles:
    # Links with a scheme (http:, https:, mailto:, ...), protocol-relative
    # links and in-page anchors are not files relative to the page.
    if (match(r'[A-Za-z][A-Za-z0-9+.-]*:', filepath)
            or filepath.startswith(("#", "//"))):
        continue
    filename = getFileName(filepath)
    if not filename:
        # Link to a directory (e.g. "notes/"): there is no name to save as.
        continue
    try:
        downloadFile(urljoin(baseurl, filepath), filename)
    except (HTTPError, URLError, IOError) as e:
        # One broken link should not abort the remaining downloads.
        print("Failed to download %s: %s" % (filepath, e))
    # Pause between requests to avoid hammering the server.
    sleep(1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment