nawb/getall.py

## getall.py
'''
 Given a website URL as the first command-line argument,
 downloads the files linked from that page (e.g. pdfs/pptx) into the files/ folder.

 For the future:
  - download files as is, keeping directory structure
  - create own directory structure based on filenames (lec01 goes to /lec, q1/p1 to /p, etc)
'''

# Relative directory prefix that every downloaded file is written under.
SAVETODIR = "files/"

import sys, os
from urllib2 import *
from re import search, match, findall
from pprint import pprint
from time import sleep
# The page to scan is taken from the first command-line argument.
try:
    baseurl = sys.argv[1]
except IndexError:
    # No argument supplied: tell the user what is expected and stop with a
    # failure status.  sys.exit is used because the bare exit() helper is
    # only installed by the site module.
    print("Give a url")
    sys.exit(-1)
# Regex fragments that, concatenated, match one anchor of the form
#   href="<url>" ...>link text</a>
# Raw strings keep the backslashes intact for the regex engine.
#
# The URL is everything up to whitespace or the closing quote; any further
# attributes (class, target, ...) before the tag's closing '>' are tolerated
# instead of making the whole link fail to match.
hreftag = r'href\s*=\s*"\s*(?P<url>[^"\s]+)\s*"[^>]*>'
# Link text: the shortest run of characters (newlines excluded) that is
# followed by the closing tag.
linktext = r'(?P<text>.+?)'
# Closing anchor tag, allowing whitespace before the '>'.
tagend = r'</a\s*>'

# Fetch the index page and collect every (url, link text) pair found in it.
# Start from an empty list so the download loop further down still has
# something to iterate over when the request fails.
allfiles = []
try:
    page = urlopen(baseurl)
    try:
        allfiles = findall(hreftag + linktext + tagend, page.read())
    finally:
        # Release the connection whether or not reading/parsing succeeded.
        page.close()
    if not allfiles:
        print("Found nothing")
except HTTPError as e:
    # HTTPError is handled first because it is the more specific error.
    print("HTTP Error: %s %s" % (e.code, baseurl))
except URLError as e:
    print("URL Error: %s %s" % (e.reason, baseurl))

# Blank line between the messages above and the download progress output.
print("")

def getFileName(filepath):
    """Return the last "/"-separated component of *filepath*.

    A path with no "/" is returned unchanged; a path that ends in "/"
    yields an empty string.
    """
    # rpartition returns (head, separator, tail); when there is no separator
    # the tail is the whole input string.
    return filepath.rpartition("/")[2]

def downloadFile(url, filename):
    """Download *url* into SAVETODIR + *filename*, printing progress.

    The response body is streamed to disk in fixed-size blocks.  A
    percentage is shown only when the server sends a usable Content-Length
    header; otherwise just the running byte count is printed.

    SAVETODIR is created if it does not exist yet.

    Raises whatever urlopen() or the file operations raise; the response
    and the output file are closed before the exception propagates.
    """
    u = urlopen(url)
    try:
        # Content-Length is optional (e.g. chunked responses) and could be
        # malformed, so anything unusable is treated as "size unknown".
        try:
            file_size = int(u.info().get("Content-Length"))
        except (TypeError, ValueError):
            file_size = None
        print("Downloading: %s Bytes: %s"
              % (filename, file_size if file_size is not None else "unknown"))

        # Make sure the target directory exists before opening the file.
        if SAVETODIR and not os.path.isdir(SAVETODIR):
            os.makedirs(SAVETODIR)

        f = open(SAVETODIR + filename, 'wb')
        try:
            file_size_dl = 0
            block_sz = 8192
            while True:
                chunk = u.read(block_sz)
                if not chunk:
                    break

                file_size_dl += len(chunk)
                f.write(chunk)
                if file_size and file_size > 0:
                    status = "%10d  [%3.2f%%]" % (
                        file_size_dl, file_size_dl * 100. / file_size)
                else:
                    status = "%10d" % file_size_dl
                # Trailing backspaces move the cursor back to the start of
                # the status text so the next update overwrites it in place.
                sys.stdout.write(status + chr(8) * len(status))
                sys.stdout.flush()
        finally:
            f.close()
    finally:
        u.close()

# Local import: urljoin lives in the Python 2 urlparse module, which the
# top-of-file imports do not bring into scope.
from urlparse import urljoin

# Download every link that is given relative to the base page.
for thisfile in allfiles:
    # Each entry is a (url, link text) tuple; only the url is needed here.
    filepath = thisfile[0]

    # Absolute links (http://, https://, ...) are treated as web pages, not
    # files to fetch; mailto:/javascript:/fragment links cannot be
    # downloaded at all.
    if "://" in filepath or filepath.startswith(("#", "mailto:", "javascript:")):
        continue

    filename = getFileName(filepath)
    if not filename:
        # The link ends in "/", so there is no file name to save under.
        continue

    try:
        # urljoin resolves the link against the page URL, which also works
        # when baseurl lacks a trailing "/" or the link starts with "/".
        downloadFile(urljoin(baseurl, filepath), filename)
    except (URLError, IOError) as e:
        # One broken link should not abort the remaining downloads.
        print("Failed to download %s: %s" % (filepath, e))
        continue

    # Pause between requests so the server is not hammered.
    sleep(1)