Scraper (Python 2) to get the lesson materials, PDFs and MP3s, from zhongwenred.com
from lxml import html
import requests
import sys
import urlparse
import os
import errno
# Gives fancy display string for links
def linkf(link):
    return '"%s" (%s)' % (link.text, link.attrib['href'])

# Thanks to Heikki Toivonen
# http://stackoverflow.com/questions/273192/in-python-check-if-a-directory-exists-and-create-it-if-necessary
def make_sure_path_exists(path):
    try:
        os.makedirs(path)
    except OSError as exception:
        if exception.errno != errno.EEXIST:
            raise
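# (Note: on Python 3.2+, os.makedirs(path, exist_ok=True) would do the same;
# the helper is kept as-is since this script targets Python 2.)
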
# Thanks to Roman Podlinov
# http://stackoverflow.com/questions/16694907/how-to-download-large-file-in-python-with-requests-py
def download_file(url, target_path):
    r = requests.get(url, stream=True)
    with open(target_path, 'wb') as outfile:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                outfile.write(chunk)
                outfile.flush()
if __name__ == '__main__':
    home_url = 'http://www.zhongwenred.com/index.htm'
    try:
        home_page = requests.get(home_url)
    except Exception as e:
        print "Unable to get page %s:" % (home_url,)
        print e
        sys.exit()

    # Scrape links to collection pages (e.g., "Lessons 51-60")
    home_tree = html.fromstring(home_page.text)
    collection_page_links = home_tree.xpath("//a[contains(., 'Lessons ')]")

    to_download = []
    for clink in collection_page_links:
        target = urlparse.urljoin(home_url, clink.attrib['href'])
        print "Clicking link %s" % (linkf(clink),)
        try:
            page = requests.get(target)
        except Exception as e:
            print "Download failed, skipping."
            continue
        tree = html.fromstring(page.text)
        lesson_links = tree.xpath('//a[@class="lessonlinks"]')
        print 'Contains %d sublinks' % (len(lesson_links),)

        # Scrape the links to each individual lesson on this page (e.g.,
        # "Lesson 57: How's the weather")
        for llink in lesson_links:
            ltarget = urlparse.urljoin(home_url, llink.attrib['href'])
            print " Clicking sublink %s..." % (linkf(llink),)
            try:
                lpage = requests.get(ltarget)
            except Exception as e:
                print " Download failed, skipping."
                continue
            ltree = html.fromstring(lpage.text)

            # Scrape content links from this page (pdfs and mp3s)
            pdflinks = ltree.xpath('//a[contains(@href, "pdf")]')
            mp3links = ltree.xpath('//a[contains(@href, "mp3")]')

            # Where to store the files on this page:
            dirname = llink.text.replace('/', '')

            for link in pdflinks + mp3links:
                # Make up a local filename
                url = urlparse.urljoin(lpage.url, link.attrib['href'])
                pbname = os.path.basename(urlparse.urlparse(url).path)
                # Target path (including filename and directory)
                target_path = os.path.join(dirname, pbname)
                print " Adding link %s <- %s" % (target_path, linkf(link))
                to_download.append((url, target_path))

    # The right thing to do here would probably be to save to_download to a
    # file, to simplify things if the script gets interrupted. But I'll not
    # bother with that since I'm only doing this once and can handle the risk
    # of starting over.
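    # (A minimal sketch of that checkpointing, for reference only; the file
    # name "to_download.json" is just an illustrative choice.
    #
    #     import json
    #     with open('to_download.json', 'w') as f:
    #         json.dump(to_download, f)
    #
    #     # ...and on a later run, load it back instead of re-scraping:
    #     # with open('to_download.json') as f:
    #     #     to_download = [tuple(x) for x in json.load(f)]
    # )
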
    # Download all the files we said we'd download
    for (url, target_path) in to_download:
        parent, basename = os.path.split(target_path)
        make_sure_path_exists(parent)
        print "Downloading file %s <-- %s" % (target_path, url)
        download_file(url, target_path)
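To run it, install requests and lxml for Python 2 and invoke the script directly; the PDFs and MP3s are written under one directory per lesson (named from the lesson link text) in the current working directory.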