Scraper to get the materials from zhongwenred.com

from lxml import html
import requests
import sys
import urlparse
import os
import errno
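# NOTE: this script targets Python 2 (print statements, and the urlparse
# module, which Python 3 renamed to urllib.parse).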

# Gives fancy display string for links
def linkf(link):
    return '"%s" (%s)' % (link.text, link.attrib['href'])

# Thanks to Heikki Toivonen
# http://stackoverflow.com/questions/273192/in-python-check-if-a-directory-exists-and-create-it-if-necessary
def make_sure_path_exists(path):
    try:
        os.makedirs(path)
    except OSError as exception:
        if exception.errno != errno.EEXIST:
            raise
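# (Swallows only the "directory already exists" error; any other OSError,
# e.g. a permissions problem, is re-raised. On Python 3.2+ the same effect
# is available via os.makedirs(path, exist_ok=True).)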

# Thanks to Roman Podlinov
# http://stackoverflow.com/questions/16694907/how-to-download-large-file-in-python-with-requests-py
def download_file(url, target_path):
    r = requests.get(url, stream=True)
    with open(target_path, 'wb') as outfile:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                outfile.write(chunk)
                outfile.flush()
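# (stream=True makes requests fetch the response body lazily, so large mp3s
# are written to disk in 1 KB chunks rather than being held in memory.)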

if __name__ == '__main__':
    home_url = 'http://www.zhongwenred.com/index.htm'
    try:
        home_page = requests.get(home_url)
    except Exception as e:
        print "Unable to get page %s:" % (home_url,)
        print e
        sys.exit()

    # Scrape links to collection pages (e.g., "Lessons 51-60")
    home_tree = html.fromstring(home_page.text)
    collection_page_links = home_tree.xpath("//a[contains(., 'Lessons ')]")
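    # (This XPath matches every <a> whose text content contains the string
    # "Lessons ", i.e. the links to the collection pages.)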

    to_download = []
    for clink in collection_page_links:
        target = urlparse.urljoin(home_url, clink.attrib['href'])
        print "Clicking link %s" % (linkf(clink),)
        try:
            page = requests.get(target)
        except Exception as e:
            print "Download failed, skipping."
            continue
        tree = html.fromstring(page.text)
        lesson_links = tree.xpath('//a[@class="lessonlinks"]')
        print 'Contains %d sublinks' % (len(lesson_links),)

        # Scrape the links to each individual lesson on this page (e.g.,
        # "Lesson 57: How's the weather")
        for llink in lesson_links:
            ltarget = urlparse.urljoin(home_url, llink.attrib['href'])
            print " Clicking sublink %s..." % (linkf(llink),)
            try:
                lpage = requests.get(ltarget)
            except Exception as e:
                print " Download failed, skipping."
                continue
            ltree = html.fromstring(lpage.text)

            # Scrape content links from this page (pdfs and mp3s)
            pdflinks = ltree.xpath('//a[contains(@href, "pdf")]')
            mp3links = ltree.xpath('//a[contains(@href, "mp3")]')

            # Where to store the files on this page:
            dirname = llink.text.replace('/', '')
            for link in pdflinks + mp3links:
                # Make up a local filename
                url = urlparse.urljoin(lpage.url, link.attrib['href'])
                pbname = os.path.basename(urlparse.urlparse(url).path)
                # Target path (including filename and directory)
                target_path = os.path.join(dirname, pbname)
                print "  Adding link %s <- %s" % (target_path, linkf(link))
                to_download.append((url, target_path))

    # The right thing to do here would probably be to save to_download to a
    # file, to simplify things if the script gets interrupted. But I'll not
    # bother with that since I'm only doing this once and can handle the risk
    # of starting over.
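    # A minimal sketch of that idea (hypothetical, not wired in below): dump
    # the queue to JSON before downloading, so an interrupted run could
    # resume from the saved list instead of re-scraping everything.
    #
    #     import json
    #     with open('to_download.json', 'w') as f:
    #         json.dump(to_download, f)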

    # Download all the files we said we'd download
    for (url, target_path) in to_download:
        parent, basename = os.path.split(target_path)
        make_sure_path_exists(parent)
        print "Downloading file %s <-- %s" % (target_path, url)
        download_file(url, target_path)