Scraper to get the materials from zhongwenred.com

from lxml import html
import requests
import sys
import urlparse
import os
import errno
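# NOTE: this script targets Python 2 (print statements, and the urlparse
# module, which Python 3 renamed to urllib.parse).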

# Gives fancy display string for links
def linkf(link):
    return '"%s" (%s)' % (link.text, link.attrib['href'])

# Thanks to Heikki Toivonen
# http://stackoverflow.com/questions/273192/in-python-check-if-a-directory-exists-and-create-it-if-necessary
def make_sure_path_exists(path):
    try:
        os.makedirs(path)
    except OSError as exception:
        if exception.errno != errno.EEXIST:
            raise
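# (Swallows only the "directory already exists" error; any other OSError,
# e.g. a permissions problem, is re-raised. On Python 3.2+ the same effect
# is available via os.makedirs(path, exist_ok=True).)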

# Thanks to Roman Podlinov
# http://stackoverflow.com/questions/16694907/how-to-download-large-file-in-python-with-requests-py
def download_file(url, target_path):
    r = requests.get(url, stream=True)
    with open(target_path, 'wb') as outfile:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                outfile.write(chunk)
                outfile.flush()
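# (stream=True makes requests fetch the response body lazily, so large mp3s
# are written to disk in 1 KB chunks rather than being held in memory.)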

if __name__ == '__main__':
    home_url = 'http://www.zhongwenred.com/index.htm'
    try:
        home_page = requests.get(home_url)
    except Exception as e:
        print "Unable to get page %s:" % (home_url,)
        print e
        sys.exit()

    # Scrape links to collection pages (e.g., "Lessons 51-60")
    home_tree = html.fromstring(home_page.text)
    collection_page_links = home_tree.xpath("//a[contains(., 'Lessons ')]")
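    # (This XPath matches every <a> whose text content contains the string
    # "Lessons ", i.e. the links to the collection pages.)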

    to_download = []
    for clink in collection_page_links:
        target = urlparse.urljoin(home_url, clink.attrib['href'])
        print "Clicking link %s" % (linkf(clink),)
        try:
            page = requests.get(target)
        except Exception as e:
            print "Download failed, skipping."
            continue
        tree = html.fromstring(page.text)
        lesson_links = tree.xpath('//a[@class="lessonlinks"]')
        print 'Contains %d sublinks' % (len(lesson_links),)

        # Scrape the links to each individual lesson on this page (e.g.,
        # "Lesson 57: How's the weather")
        for llink in lesson_links:
            ltarget = urlparse.urljoin(home_url, llink.attrib['href'])
            print " Clicking sublink %s..." % (linkf(llink),)
            try:
                lpage = requests.get(ltarget)
            except Exception as e:
                print " Download failed, skipping."
                continue
            ltree = html.fromstring(lpage.text)

            # Scrape content links from this page (pdfs and mp3s)
            pdflinks = ltree.xpath('//a[contains(@href, "pdf")]')
            mp3links = ltree.xpath('//a[contains(@href, "mp3")]')

            # Where to store the files on this page:
            dirname = llink.text.replace('/', '')
            for link in pdflinks + mp3links:
                # Make up a local filename
                url = urlparse.urljoin(lpage.url, link.attrib['href'])
                pbname = os.path.basename(urlparse.urlparse(url).path)
                # Target path (including filename and directory)
                target_path = os.path.join(dirname, pbname)
                print "  Adding link %s <- %s" % (target_path, linkf(link))
                to_download.append((url, target_path))

    # The right thing to do here would probably be to save to_download to a
    # file, to simplify things if the script gets interrupted. But I'll not
    # bother with that since I'm only doing this once and can handle the risk
    # of starting over.
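    # A minimal sketch of that idea (hypothetical, not wired in below): dump
    # the queue to JSON before downloading, so an interrupted run could
    # resume from the saved list instead of re-scraping everything.
    #
    #     import json
    #     with open('to_download.json', 'w') as f:
    #         json.dump(to_download, f)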

    # Download all the files we said we'd download
    for (url, target_path) in to_download:
        parent, basename = os.path.split(target_path)
        make_sure_path_exists(parent)
        print "Downloading file %s <-- %s" % (target_path, url)
        download_file(url, target_path)