KartikTalwar/Desire 2 Download - Download Fix.py

## d2dfix.py
## This line should fix the double (sometimes triple) fetching of the courses
## https://github.com/sholiday/desire2download/blob/master/desire2download.py#L79

# Multiple links are detected because the course announcements on the
# homepage also link to the course page.

"""
Special Cases :

- PHYS 10 - Spring 2012
- PHYS 260A - Spring 2012
- AMATH 251/PMATH 332 - Spring 2012
"""


    def get_course_links(self):
        """Collect the unique course links from ``self.br.links()``.

        A link counts as a course link when its text starts with a course
        title such as ``PHYS 10 - Spring 2012``, ``PHYS 260A - Spring 2012``
        or ``AMATH 251/PMATH 332 - Spring 2012``.  The same title can show up
        on several links (course announcements on the homepage link to the
        course page as well), so only the first link per distinct text is
        kept.

        Returns:
            list: The matching link objects, in the order they were yielded,
                with no two entries sharing the same ``text``.
        """
        print('Finding courses...')
        # Raw string so ``\s`` reaches the regex engine untouched.  The
        # subject code is ``[A-Z]+``; the earlier ``[[A-Z]+`` spelling also
        # accepted a literal '[' in that position.
        course_title = re.compile(
            r'[A-Z]+ [0-9A-Z/\s]{2,15} - [A-Z][a-z]+ 20[0-9]{2}')
        links = []
        seen = set()  # link texts already collected (O(1) membership test)
        for link in self.br.links():
            text = link.text
            # NOTE(review): presumably some links carry no text at all;
            # skipping them avoids a TypeError from the regex -- confirm.
            if not text:
                continue
            if course_title.match(text) is None or text in seen:
                continue
            seen.add(text)
            links.append(link)
        return links


## Desire 2 Download - Download Fix.py
# Fixes the issue where the referenced content is not a downloadable file and/or does not exist

"""
# Bug:
- https://learn.uwaterloo.cahttp://cte.uwaterloo.ca/teaching_resources/tips/online_discussions_tips_for_students.html
- https://learn.uwaterloo.cahttp://phet.colorado.edu/new/simulations/sims.php
- https://learn.uwaterloo.cahttp://paws.kettering.edu/~drussell/Demos/wave-x-t/wave-x-t.html
- URL to a file that does not exist

# After Fix:
- http://cte.uwaterloo.ca/teaching_resources/tips/online_discussions_tips_for_students.html
- 'X File does not exist: Aberration_correction_2010.pdf'
"""


import urllib2

    def download_file(self, title, url, path):
        """Downloads a file to the specified directory.

        The preview page at ``url`` is fetched and the ``src`` of its first
        ``<iframe>`` is used as the location of the actual file.  Absolute
        locations (starting with ``http``) are used as they are; anything
        else is treated as a path on ``https://learn.uwaterloo.ca``.  Files
        whose name matches one of ``self.ignore_re`` and files that already
        exist under ``path`` are skipped.  HTTP errors while fetching the
        file are reported on stdout and the file is skipped.

        Args:
            title (str): Name of the file.
            url (str): Address to the file preview page.
            path (str): Relative path of file to make.

        Raises:
            OSError: If ``path`` cannot be created for a reason other than
                already existing.
        """
        # Local import: only part of the file is visible from this block.
        import errno

        try:
            os.makedirs(path)
        except OSError as e:
            # An already existing directory is fine; re-raise anything else
            # with its original traceback.
            if e.errno != errno.EEXIST:
                raise

        page = self.br.open(url).read()
        soup = BeautifulSoup.BeautifulSoup(page)
        iframe = soup.find('iframe')
        if iframe is None:
            # No iframe means there is no file location to follow; report it
            # instead of failing on ``None['src']``.
            print(" X No embedded content found for %s" % title)
            return
        url = iframe['src']

        ## TODO: How should this be handled. These seem to be custom pages
        ## with content loaded via javascript, at least this one
        ## url = https://learn.uwaterloo.ca/d2l/lor/viewer/view.d2l?ou=16733&loId=0&loIdentId=245
        if ('/d2l/common/dialogs/' in url
                or 'https://learn.uwaterloo.ca/d2l/lor/viewer' in url):
            print(" X Unable to download web-only content %s" % title)
            return

        url_path = url.split('?')[0]
        # Absolute links must not get the site prefix glued in front of them
        # (that produced "https://learn.uwaterloo.cahttp://...").
        if url_path.startswith('http'):
            clean_url = url_path
        else:
            clean_url = 'https://learn.uwaterloo.ca%s' % url_path
        clean_url = clean_url.replace(' ', '%20')
        file_name = os.path.split(url_path)[1]
        for r in self.ignore_re:
            if r.match(file_name) is not None:
                print('Skipping %s because it matches ignore regex "%s"'
                      % (file_name, r.pattern))
                return

        path_and_filename = '%s/%s' % (path, file_name.strip('/'))
        if os.path.isfile(path_and_filename):  ## TODO Can we make this smarter?
            print(' - %s (Already Saved)' % path_and_filename)
            return

        try:
            content = self.br.open_novisit(clean_url).read()
        except urllib2.HTTPError as e:
            if e.code == 404:
                print(" X File does not exist: %s" % file_name.strip('/'))
            else:
                # Previously every non-404 error was dropped without a trace.
                print(" X Unable to download %s (HTTP %s)"
                      % (file_name.strip('/'), e.code))
            return

        print(' + %s (%s)'
              % (path_and_filename, self.convert_bytes(len(content))))
        # Binary mode: the payload is raw bytes (PDFs, archives, ...) and must
        # not go through newline translation.
        with open(path_and_filename, 'wb') as f:
            f.write(content)
	## This line should fix the double (sometimes triple) fetching of the courses
	## https://github.com/sholiday/desire2download/blob/master/desire2download.py#L79

	# Multiple links are detected because the course announcements on the
	# homepage also link to the course page.

	"""
	Special Cases :

	- PHYS 10 - Spring 2012
	- PHYS 260A - Spring 2012
	- AMATH 251/PMATH 332 - Spring 2012
	"""


	def get_course_links(self):
		"""Collect the unique course links from ``self.br.links()``.

		A link counts as a course link when its text starts with a course
		title such as ``PHYS 10 - Spring 2012``, ``PHYS 260A - Spring 2012``
		or ``AMATH 251/PMATH 332 - Spring 2012``.  The same title can show up
		on several links (course announcements on the homepage link to the
		course page as well), so only the first link per distinct text is
		kept.

		Returns:
			list: The matching link objects, in the order they were yielded,
				with no two entries sharing the same ``text``.
		"""
		print('Finding courses...')
		# Raw string so ``\s`` reaches the regex engine untouched.  The
		# subject code is ``[A-Z]+``; the earlier ``[[A-Z]+`` spelling also
		# accepted a literal '[' in that position.
		course_title = re.compile(
			r'[A-Z]+ [0-9A-Z/\s]{2,15} - [A-Z][a-z]+ 20[0-9]{2}')
		links = []
		seen = set()  # link texts already collected (O(1) membership test)
		for link in self.br.links():
			text = link.text
			# NOTE(review): presumably some links carry no text at all;
			# skipping them avoids a TypeError from the regex -- confirm.
			if not text:
				continue
			if course_title.match(text) is None or text in seen:
				continue
			seen.add(text)
			links.append(link)
		return links
	# Fixes the issue where the referenced content is not a downloadable file and/or does not exist

	"""
	# Bug:
	- https://learn.uwaterloo.cahttp://cte.uwaterloo.ca/teaching_resources/tips/online_discussions_tips_for_students.html
	- https://learn.uwaterloo.cahttp://phet.colorado.edu/new/simulations/sims.php
	- https://learn.uwaterloo.cahttp://paws.kettering.edu/~drussell/Demos/wave-x-t/wave-x-t.html
	- URL to a file that does not exist

	# After Fix:
	- http://cte.uwaterloo.ca/teaching_resources/tips/online_discussions_tips_for_students.html
	- 'X File does not exist: Aberration_correction_2010.pdf'
	"""


	import urllib2

	def download_file(self, title, url, path):
		"""Downloads a file to the specified directory.

		The preview page at ``url`` is fetched and the ``src`` of its first
		``<iframe>`` is used as the location of the actual file.  Absolute
		locations (starting with ``http``) are used as they are; anything
		else is treated as a path on ``https://learn.uwaterloo.ca``.  Files
		whose name matches one of ``self.ignore_re`` and files that already
		exist under ``path`` are skipped.  HTTP errors while fetching the
		file are reported on stdout and the file is skipped.

		Args:
			title (str): Name of the file.
			url (str): Address to the file preview page.
			path (str): Relative path of file to make.

		Raises:
			OSError: If ``path`` cannot be created for a reason other than
				already existing.
		"""
		# Local import: only part of the file is visible from this block.
		import errno

		try:
			os.makedirs(path)
		except OSError as e:
			# An already existing directory is fine; re-raise anything else
			# with its original traceback.
			if e.errno != errno.EEXIST:
				raise

		page = self.br.open(url).read()
		soup = BeautifulSoup.BeautifulSoup(page)
		iframe = soup.find('iframe')
		if iframe is None:
			# No iframe means there is no file location to follow; report it
			# instead of failing on ``None['src']``.
			print(" X No embedded content found for %s" % title)
			return
		url = iframe['src']

		## TODO: How should this be handled. These seem to be custom pages
		## with content loaded via javascript, at least this one
		## url = https://learn.uwaterloo.ca/d2l/lor/viewer/view.d2l?ou=16733&loId=0&loIdentId=245
		if ('/d2l/common/dialogs/' in url
				or 'https://learn.uwaterloo.ca/d2l/lor/viewer' in url):
			print(" X Unable to download web-only content %s" % title)
			return

		url_path = url.split('?')[0]
		# Absolute links must not get the site prefix glued in front of them
		# (that produced "https://learn.uwaterloo.cahttp://...").
		if url_path.startswith('http'):
			clean_url = url_path
		else:
			clean_url = 'https://learn.uwaterloo.ca%s' % url_path
		clean_url = clean_url.replace(' ', '%20')
		file_name = os.path.split(url_path)[1]
		for r in self.ignore_re:
			if r.match(file_name) is not None:
				print('Skipping %s because it matches ignore regex "%s"'
					% (file_name, r.pattern))
				return

		path_and_filename = '%s/%s' % (path, file_name.strip('/'))
		if os.path.isfile(path_and_filename):  ## TODO Can we make this smarter?
			print(' - %s (Already Saved)' % path_and_filename)
			return

		try:
			content = self.br.open_novisit(clean_url).read()
		except urllib2.HTTPError as e:
			if e.code == 404:
				print(" X File does not exist: %s" % file_name.strip('/'))
			else:
				# Previously every non-404 error was dropped without a trace.
				print(" X Unable to download %s (HTTP %s)"
					% (file_name.strip('/'), e.code))
			return

		print(' + %s (%s)'
			% (path_and_filename, self.convert_bytes(len(content))))
		# Binary mode: the payload is raw bytes (PDFs, archives, ...) and must
		# not go through newline translation.
		with open(path_and_filename, 'wb') as f:
			f.write(content)