Skip to content

Instantly share code, notes, and snippets.

@Koenvh1
Created April 13, 2019 12:55
Show Gist options
  • Save Koenvh1/80a5181c62d1d7c156c835263df58c12 to your computer and use it in GitHub Desktop.
Save Koenvh1/80a5181c62d1d7c156c835263df58c12 to your computer and use it in GitHub Desktop.
Scrape your Blackboard website and download all content to a folder. Tested with Blackboard Learn 3100.0.6.
import argparse
import os.path
import shutil
from urllib.parse import unquote
import pathvalidate
import requests_html
class Scraper:
    """Log in to a Blackboard Learn site and mirror course content to disk.

    The constructor opens a ``requests_html`` session and submits the login
    form. ``get_course`` then crawls the content-listing pages of one course,
    saving each page as ``index.html`` and downloading every linked
    ``bbcswebdav`` file into the same folder.
    """

    def __init__(self, base_url: str, username: str, password: str, output_folder: str):
        """Store the settings and submit the login form.

        Args:
            base_url: Root URL of the Blackboard site; trailing slashes are dropped.
            username: Value sent as ``user_id`` in the login form.
            password: Value sent as ``password`` in the login form.
            output_folder: Directory that receives one sub-folder per course.
                An empty string means the current directory.

        Raises:
            requests.HTTPError: If the login request returns an HTTP error status.
        """
        self.BASE_URL = base_url.rstrip("/")
        self.USERNAME = username
        self.PASSWORD = password
        trimmed = output_folder.rstrip("/")
        if not trimmed and not output_folder:
            # An empty argument must not silently resolve to the filesystem root.
            trimmed = "."
        self.OUTPUT_FOLDER = trimmed + "/"
        self.s = requests_html.HTMLSession()
        response = self.s.post(self.BASE_URL + "/webapps/login/", {
            "user_id": self.USERNAME,
            "password": self.PASSWORD,
            "login": "Login",
            "action": "login",
            "new_loc": "",
        })
        # NOTE(review): a successful HTTP status does not prove the credentials
        # were accepted; this only catches transport-level/server failures.
        response.raise_for_status()

    def get_course(self, course_id):
        """Crawl every content-listing page of a course and save what it links to.

        Each visited page is written to
        ``<output>/<course_id>/<sanitized page title>/index.html`` and every
        link containing ``bbcswebdav`` is downloaded into that same folder.
        Pages that share a title share a folder, so a later ``index.html``
        overwrites an earlier one.

        Args:
            course_id: Blackboard course identifier as it appears in page URLs.
        """
        course_root = self.OUTPUT_FOLDER + course_id + "/"
        listing_marker = "listContent.jsp?course_id=" + course_id
        downloaded = set()
        pending = {self.BASE_URL + "/webapps/blackboard/content/" + listing_marker}
        visited = set()

        while pending:
            url = pending.pop()
            visited.add(url)
            print(f"GET {url} (to visit {len(pending)}, visited {len(visited)})")
            page = self.s.get(url)
            if not page.ok:
                # Do not store an error page as if it were course content.
                print(f"SKIP {url} (HTTP {page.status_code})")
                continue

            title_element = page.html.find("title", first=True)
            # Not a -, but a – (different character)
            title = title_element.text.split("–")[0] if title_element is not None else ""
            # Fall back to a fixed name so an empty title cannot collapse the
            # page folder into the course root.
            folder_name = pathvalidate.sanitize_filename(title) or "untitled"
            page_dir = course_root + folder_name + "/"
            os.makedirs(page_dir, exist_ok=True)
            with open(page_dir + "index.html", "w", encoding="utf-8") as f:
                f.write(page.text)

            for link in page.html.links:
                # Fragments are never sent to the server, so links that differ
                # only by "#..." point at the same resource.
                target = self.expand_link(link).split("#", 1)[0]
                if not target.startswith(("http://", "https://")):
                    # Relative, javascript:, mailto: ... links cannot be fetched
                    # by the session and would raise instead of being ignored.
                    continue
                if target in visited:
                    continue
                if listing_marker in target:
                    pending.add(target)
                if "bbcswebdav" in target and target not in downloaded:
                    print("DOWNLOAD " + target)
                    self.download_file(target, page_dir)
                    downloaded.add(target)
        print("Done.")

    def download_file(self, link, path):
        """Download ``link`` into the directory ``path``.

        The file name is taken from the final URL after redirects, with query
        string, fragment and any directory components removed, and is then
        sanitized. Responses with an HTTP error status are skipped with a
        message instead of being saved.

        Args:
            link: Absolute URL, or a site-relative link starting with ``/``.
            path: Destination directory; created if missing.
        """
        os.makedirs(path, exist_ok=True)
        with self.s.get(self.expand_link(link), allow_redirects=True, stream=True) as r:
            if not r.ok:
                print(f"SKIP {link} (HTTP {r.status_code})")
                return
            local_filename = self._filename_from_url(r.url)
            local_filename = pathvalidate.sanitize_filename(local_filename) or "download"
            # Without this, gzip/deflate-encoded bodies are written to disk
            # still compressed when copying from the raw stream.
            r.raw.decode_content = True
            with open(os.path.join(path, local_filename), "wb") as f:
                shutil.copyfileobj(r.raw, f)

    @staticmethod
    def _filename_from_url(url, default="download"):
        """Return the decoded last path segment of ``url`` as a bare file name.

        Query string and fragment are dropped, percent-escapes are decoded, and
        any directory part introduced by decoding (``%2F``, ``\\``) is removed
        so the result can never point outside the target directory. ``default``
        is returned when no usable name remains.
        """
        without_extras = url.split("#", 1)[0].split("?", 1)[0]
        name = unquote(without_extras.rsplit("/", 1)[-1])
        name = name.replace("\\", "/").rsplit("/", 1)[-1]
        if name in ("", ".", ".."):
            return default
        return name

    def expand_link(self, link):
        """Turn a site-relative link into an absolute URL.

        Links starting with ``//`` inherit the scheme of ``BASE_URL``, links
        starting with a single ``/`` are prefixed with ``BASE_URL``, and
        everything else is returned unchanged.
        """
        if link.startswith("//"):
            # Protocol-relative URL: it names its own host, so only the scheme
            # comes from the base URL.
            if "://" in self.BASE_URL:
                return self.BASE_URL.split("//", 1)[0] + link
            return link
        if link.startswith("/"):
            return self.BASE_URL + link
        return link
if __name__ == "__main__":
    # Command-line entry point: parse the positional arguments, log in once,
    # then mirror each requested course in the order given.
    cli = argparse.ArgumentParser(description="Download all content from Blackboard")
    for arg_name, arg_help in (
        ("url", "URL to the Blackboard website, e.g. https://blackboard.utwente.nl"),
        ("username", "Username used to log in to Blackboard"),
        ("password", "Password used to log in to Blackboard"),
        ("output", "Path to the output folder, e.g. output/"),
    ):
        cli.add_argument(arg_name, help=arg_help)
    cli.add_argument(
        "course_id",
        nargs="+",
        help="ID for the course to parse, can be found in the page URL, and looks like _xxxxx_1",
    )
    options = cli.parse_args()

    scraper = Scraper(options.url, options.username, options.password, options.output)
    for course in options.course_id:
        scraper.get_course(course)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment