Created
June 9, 2012 04:13
-
-
Save KartikTalwar/2899403 to your computer and use it in GitHub Desktop.
Desire 2 Download Fix
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## This change fixes the double (sometimes triple) fetching of courses in
## https://github.com/sholiday/desire2download/blob/master/desire2download.py#L79
# Multiple links are detected because course announcements on the homepage
# also link to the course page.
""" | |
Special Cases : | |
- PHYS 10 - Spring 2012 | |
- PHYS 260A - Spring 2012 | |
- AMATH 251/PMATH 332 - Spring 2012 | |
""" | |
def get_course_links(self):
    """Return the unique course links found on the current browser page.

    A link counts as a course link when its text starts with a course
    title such as ``PHYS 10 - Spring 2012``, ``PHYS 260A - Spring 2012``
    or ``AMATH 251/PMATH 332 - Spring 2012``.  Course announcements on the
    homepage link to the same course page, so the same title can appear
    several times; only the first link for each distinct text is kept.

    Returns:
        list: Link objects from ``self.br.links()``, in page order, with
            duplicate link texts removed.
    """
    print('Finding courses...')
    # Raw string so ``\s`` reaches the regex engine untouched.  The leading
    # class is ``[A-Z]``: the earlier ``[[A-Z]`` also accepted a literal
    # ``[`` and is reported as a possible nested set by newer ``re``.
    course_title = re.compile(
        r'[A-Z]+ [0-9A-Z/\s]{2,15} - [A-Z][a-z]+ 20[0-9]{2}')
    links = []
    seen = set()  # link texts already collected, for O(1) duplicate checks
    for link in self.br.links():
        text = link.text
        # Skip links without any text instead of failing inside the regex.
        if not text or text in seen:
            continue
        if course_title.match(text) is None:
            continue
        seen.add(text)
        links.append(link)
    return links
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
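# Fixes two failures: the referenced content is an external URL rather than a
# file hosted on learn.uwaterloo.ca (the host prefix was wrongly prepended),
# and/or the referenced file does not exist.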
""" | |
# Bug: | |
- https://learn.uwaterloo.cahttp://cte.uwaterloo.ca/teaching_resources/tips/online_discussions_tips_for_students.html | |
- https://learn.uwaterloo.cahttp://phet.colorado.edu/new/simulations/sims.php | |
- https://learn.uwaterloo.cahttp://paws.kettering.edu/~drussell/Demos/wave-x-t/wave-x-t.html | |
- URL to a file that does not exist | |
# After Fix: | |
- http://cte.uwaterloo.ca/teaching_resources/tips/online_discussions_tips_for_students.html | |
- 'X File does not exist: Aberration_correction_2010.pdf' | |
""" | |
import urllib2 | |
def download_file(self, title, url, path):
    """Downloads a file to the specified directory.

    Opens the preview page at ``url``, takes the ``src`` of its first
    ``iframe`` as the real file location and saves that file under
    ``path``.  Nothing is downloaded when the content is web-only, when
    the file name matches one of ``self.ignore_re``, when the file is
    already on disk, or when the server reports an HTTP error.

    Args:
        title (str): Name of the file.
        url (str): Address to the file preview page.
        path (str): Relative path of file to make.

    Raises:
        OSError: If ``path`` cannot be created for a reason other than
            it already existing.
    """
    import errno  # local import: only needed for the EEXIST check below

    try:
        os.makedirs(path)
    except OSError as e:
        # An already-existing directory is fine; anything else is an error.
        if e.errno != errno.EEXIST:
            raise

    page = self.br.open(url).read()
    soup = BeautifulSoup.BeautifulSoup(page)
    iframe = soup.find('iframe')
    url = iframe.get('src') if iframe is not None else None
    if not url:
        # No iframe (or no src) means there is no file location to follow.
        print(" X No embedded file found for %s" % title)
        return

    ## TODO: How should this be handled. These seem to be custom pages
    ##       with content loaded via javascript, at least this one
    ##       url = https://learn.uwaterloo.ca/d2l/lor/viewer/view.d2l?ou=16733&loId=0&loIdentId=245
    if '/d2l/common/dialogs/' in url or \
            'https://learn.uwaterloo.ca/d2l/lor/viewer' in url:
        print(" X Unable to download web-only content %s" % title)
        return

    url_path = url.split('?')[0]
    # Absolute URLs point at external hosts and must not get the
    # learn.uwaterloo.ca prefix; only site-relative paths do.
    if url_path.startswith('http'):
        clean_url = url_path
    else:
        clean_url = 'https://learn.uwaterloo.ca%s' % url_path
    clean_url = clean_url.replace(' ', '%20')

    file_name = os.path.split(url_path)[1]
    for r in self.ignore_re:
        if r.match(file_name) is not None:
            print('Skipping %s because it matches ignore regex "%s"' % (file_name, r.pattern))
            return

    path_and_filename = '%s/%s' % (path, file_name.strip('/'))
    if os.path.isfile(path_and_filename):  ## TODO Can we make this smarter?
        print(' - %s (Already Saved)' % path_and_filename)
        return

    try:
        content = self.br.open_novisit(clean_url).read()
    except urllib2.HTTPError as e:
        if e.code == 404:
            print(" X File does not exist: %s" % file_name.strip('/'))
        else:
            # Report other HTTP failures instead of dropping them silently.
            print(" X Unable to download %s (HTTP %s)" % (file_name.strip('/'), e.code))
    else:
        print(' + %s (%s)' % (path_and_filename, self.convert_bytes(len(content))))
        # Binary mode: the payload is raw bytes (PDFs, slides, ...) and must
        # not go through newline translation.
        with open(path_and_filename, 'wb') as f:
            f.write(content)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment