Created
April 13, 2019 12:56
-
-
Save Koenvh1/6386f8703766c432eb4dfa19acdb0244 to your computer and use it in GitHub Desktop.
Scrape your Canvas website and download all content to a folder. Tested on 2019-04-13.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse
import os
import re
import sys

from pathvalidate import sanitize_filename
from canvasapi import Canvas
from canvasapi.course import Course
from canvasapi.exceptions import Unauthorized, ResourceDoesNotExist
from canvasapi.file import File
from canvasapi.module import Module, ModuleItem
def extract_files(text):
    """Return the distinct file ids referenced by ``/files/<digits>`` in *text*.

    The match is case-insensitive and only the digit run after ``/files/``
    is captured, so any surrounding URL parts are ignored.

    Args:
        text: The string to scan (the callers pass page/assignment HTML).

    Returns:
        A set of id strings (digits only); empty when nothing matches.
    """
    return set(re.findall(r"/files/(\d+)", text, re.IGNORECASE))
def _download_by_id(source, file_id, directory, files_downloaded):
    """Fetch one file through ``source.get_file`` and save it into *directory*.

    Args:
        source: Object exposing ``get_file(id)`` (the ``Canvas`` client or a
            ``Course``).
        file_id: File id as an int or a string of digits.
        directory: Existing folder the file is written to.
        files_downloaded: Set of int ids; the id is added once the file
            lookup succeeds, before the download starts.

    A file that does not exist or may not be accessed is reported on stderr
    and skipped, so one bad link does not abort the whole scrape.
    """
    # Ids captured by a regex are strings; normalising to int keeps the
    # de-duplication set consistent with the ids checked after the module loop.
    # NOTE(review): assumes the ids on Canvas API objects are numeric - confirm.
    file_id = int(file_id)
    try:
        file = source.get_file(file_id)
        files_downloaded.add(file_id)
        target = os.path.join(directory, sanitize_filename(file.attributes["filename"]))
        file.download(target)
    except (ResourceDoesNotExist, Unauthorized) as error:
        print(f"  skipped file {file_id}: {error}", file=sys.stderr)


def _save_html_item(course, directory, title, html, files_downloaded):
    """Write *html* to ``<title>.html`` and download the files it links to.

    Args:
        course: Course used to look up linked files.
        directory: Existing folder for the HTML file and its attachments.
        title: Item title; sanitised before being used as the file name.
        html: Body/description markup; ``None`` is treated as empty.
        files_downloaded: Set of int file ids already fetched for this course.
    """
    html = html or ""
    html_path = os.path.join(directory, sanitize_filename(title) + ".html")
    with open(html_path, "w", encoding="utf-8") as f:
        f.write(html)

    # Sorted only to make the download order reproducible between runs.
    for file_id in sorted(extract_files(html), key=int):
        if int(file_id) in files_downloaded:
            continue
        _download_by_id(course, file_id, directory, files_downloaded)


def main():
    """Download module items and remaining files of every listed course.

    Files are written to ``<output>/<course>/<module>/`` for module items and
    to ``<output>/<course>/`` for course files not already fetched through a
    module item.
    """
    parser = argparse.ArgumentParser(description="Download all content from Canvas")
    parser.add_argument("url", help="URL to the Canvas website, e.g. https://canvas.utwente.nl")
    parser.add_argument("token", help="Token generated in the settings page on Canvas")
    parser.add_argument("output", help="Path to the output folder, e.g. output/")
    args = parser.parse_args()

    canvas = Canvas(args.url, args.token)

    for course in canvas.get_courses():
        course_name = course.attributes.get("name")
        if not course_name:
            # Without a name there is no folder to write to; previously this
            # surfaced as a KeyError that stopped the run.
            print("skipped a course without a name", file=sys.stderr)
            continue

        course_dir = os.path.join(args.output, sanitize_filename(course_name))
        files_downloaded = set()  # int file ids already saved for this course

        for module in course.get_modules():
            module_name = module.attributes["name"]
            module_dir = os.path.join(course_dir, sanitize_filename(module_name))

            for item in module.get_module_items():
                # Created per item so that empty modules leave no folder behind.
                os.makedirs(module_dir, exist_ok=True)

                item_type = item.attributes["type"]
                title = item.attributes["title"]
                print(f"{course_name} - {module_name} - {title} ({item_type})")

                if item_type == "File":
                    # Module file items are always written into their module
                    # folder, even if the same file was fetched earlier.
                    _download_by_id(canvas, item.attributes["content_id"],
                                    module_dir, files_downloaded)
                elif item_type == "Page":
                    page = course.get_page(item.attributes["page_url"])
                    _save_html_item(course, module_dir, title,
                                    page.attributes["body"], files_downloaded)
                elif item_type == "ExternalUrl":
                    shortcut_path = os.path.join(module_dir, sanitize_filename(title) + ".url")
                    with open(shortcut_path, "w", encoding="utf-8") as f:
                        f.write("[InternetShortcut]\n")
                        f.write("URL=" + item.attributes["external_url"])
                elif item_type == "Assignment":
                    assignment = course.get_assignment(item.attributes["content_id"])
                    _save_html_item(course, module_dir, title,
                                    assignment.attributes["description"], files_downloaded)
                # Any other item type is only listed, not saved.

        # Pick up course files that no module item or page referenced.
        try:
            for file in course.get_files():
                if int(file.attributes["id"]) in files_downloaded:
                    continue
                filename = file.attributes["filename"]
                print(f"{course_name} - {filename}")
                # The course folder does not exist yet when the course had no
                # module items.
                os.makedirs(course_dir, exist_ok=True)
                file.download(os.path.join(course_dir, sanitize_filename(filename)))
        except Unauthorized:
            # Best effort: the rest of this course's file listing is skipped.
            print(f"{course_name} - course files not accessible, skipped", file=sys.stderr)


if __name__ == "__main__":
    main()
This is a great script! Worked for me and thank you to both Koenvh1 and john-hix :)
I made a few edits, one of which is not working: scraping only modules[20:25]. I receive an error for no Course with attributes.
Any idea why it would only work for the entire set of modules, instead of a select few?
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Awesome stuff, thank you! I forked it over at GH. Added an argument to specify which courses to scrape.