# Koenvh1/blackboard-scraper.py

## blackboard-scraper.py
import argparse
import os.path
import shutil
from urllib.parse import unquote

import pathvalidate
import requests_html


class Scraper:
    """Mirror the content pages and attached files of Blackboard courses to disk.

    Constructing a Scraper opens an HTTP session and immediately posts the
    credentials to ``<base_url>/webapps/login/``; that session is then reused
    for every page fetch and file download.
    """

    def __init__(self, base_url: str, username: str, password: str, output_folder: str):
        """Remember the settings and submit the login form.

        Args:
            base_url: Root URL of the Blackboard site; trailing slashes are dropped.
            username: Value sent as the ``user_id`` form field.
            password: Value sent as the ``password`` form field.
            output_folder: Folder that receives one sub-folder per course. An
                empty string means the current working directory.
        """
        self.BASE_URL = base_url.rstrip("/")
        self.USERNAME = username
        self.PASSWORD = password
        # Appending "/" to an empty folder name would silently redirect all
        # output to the filesystem root, so an empty value stays empty.
        self.OUTPUT_FOLDER = output_folder.rstrip("/") + "/" if output_folder else ""

        self.s = requests_html.HTMLSession()
        # The response is not inspected, so a rejected login is not detected here.
        self.s.post(self.BASE_URL + "/webapps/login/", {
            "user_id": self.USERNAME,
            "password": self.PASSWORD,
            "login": "Login",
            "action": "login",
            "new_loc": ""
        })

    def get_course(self, course_id: str) -> None:
        """Crawl the content listing pages of one course and save what they contain.

        Starting from the course's ``listContent.jsp`` page, every linked
        listing page of the same course is visited once. Each page is written
        to ``<output>/<course_id>/<page title>/index.html`` and every link on
        it that contains ``bbcswebdav`` is downloaded into the same folder.

        Args:
            course_id: Course identifier as it appears in the page URL.
        """
        course_path = self.OUTPUT_FOLDER + course_id + "/"
        listing_marker = "listContent.jsp?course_id=" + course_id

        downloaded_files = set()
        to_visit_links = {self.BASE_URL + "/webapps/blackboard/content/" + listing_marker}
        visited_links = set()

        while to_visit_links:
            new_link = self.expand_link(to_visit_links.pop())
            visited_links.add(new_link)
            print(f"GET {new_link} (to visit {len(to_visit_links)}, visited {len(visited_links)})")
            page = self.s.get(new_link)

            # A page without a <title> element must not abort the whole crawl.
            title_element = page.html.find("title", first=True)
            if title_element is None:
                title = "untitled"
            else:
                title = title_element.text.split("–")[0]  # Not a -, but a – (different character)

            local_path = course_path + pathvalidate.sanitize_filename(title) + "/"
            os.makedirs(local_path, exist_ok=True)
            with open(local_path + "index.html", "w", encoding="utf-8") as f:
                f.write(page.text)

            for link in page.html.links:
                expanded_link = self.expand_link(link)
                if expanded_link in visited_links:
                    continue
                if listing_marker in link:
                    to_visit_links.add(expanded_link)
                # De-duplicate on the expanded form so a file that is linked both
                # relatively and absolutely is only fetched once.
                if "bbcswebdav" in link and expanded_link not in downloaded_files:
                    print("DOWNLOAD " + link)
                    self.download_file(link, local_path)
                    downloaded_files.add(expanded_link)

        print("Done.")

    def download_file(self, link, path):
        """Download ``link`` into the folder ``path``.

        The file name is derived from the final URL after redirects. The HTTP
        status of the response is not checked.

        Args:
            link: Absolute URL, or a site-relative one starting with ``/``.
            path: Target folder; it is created when missing.
        """
        os.makedirs(path, exist_ok=True)

        r = self.s.get(self.expand_link(link), allow_redirects=True, stream=True)
        try:
            local_filename = self._filename_from_url(r.url)
            with open(os.path.join(path, local_filename), "wb") as f:
                # iter_content undoes gzip/deflate transfer encoding, which a
                # plain copy of r.raw would write to disk still compressed.
                for chunk in r.iter_content(chunk_size=64 * 1024):
                    f.write(chunk)
        finally:
            # A streamed response keeps its connection until explicitly released.
            r.close()

    @staticmethod
    def _filename_from_url(url):
        """Return a safe local file name for the last path segment of ``url``."""
        url_path = url.split("#", 1)[0].split("?", 1)[0]
        name = unquote(url_path.rsplit("/", 1)[-1])
        # After unquoting, "%2F" or "%5C" may have become separators; keep only
        # the final component so the file cannot land outside the target folder.
        name = os.path.basename(name.replace("\\", "/"))
        if name in ("", ".", ".."):
            name = "download"
        return name

    def expand_link(self, link):
        """Return ``link`` as an absolute URL, prefixing the base URL to site-relative links."""
        if link.startswith("/"):
            link = self.BASE_URL + link
        return link


if __name__ == "__main__":
    # Command-line entry point: log in once, then mirror each requested course in turn.
    cli = argparse.ArgumentParser(description="Download all content from Blackboard")
    for arg_name, arg_help in (
        ("url", "URL to the Blackboard website, e.g. https://blackboard.utwente.nl"),
        ("username", "Username used to log in to Blackboard"),
        ("password", "Password used to log in to Blackboard"),
        ("output", "Path to the output folder, e.g. output/"),
    ):
        cli.add_argument(arg_name, help=arg_help)
    # One or more course ids may be given; each is scraped with the same session.
    cli.add_argument(
        "course_id",
        nargs="+",
        help="ID for the course to parse, can be found in the page URL, and looks like _xxxxx_1",
    )

    options = cli.parse_args()
    scraper = Scraper(options.url, options.username, options.password, options.output)
    for course in options.course_id:
        scraper.get_course(course)
	import argparse
	import os.path
	import shutil
	from urllib.parse import unquote

	import pathvalidate
	import requests_html


	class Scraper:
	    """Mirror the content pages and attached files of Blackboard courses to disk.

	    Constructing a Scraper opens an HTTP session and immediately posts the
	    credentials to ``<base_url>/webapps/login/``; that session is then reused
	    for every page fetch and file download.
	    """

	    def __init__(self, base_url: str, username: str, password: str, output_folder: str):
	        """Remember the settings and submit the login form.

	        Args:
	            base_url: Root URL of the Blackboard site; trailing slashes are dropped.
	            username: Value sent as the ``user_id`` form field.
	            password: Value sent as the ``password`` form field.
	            output_folder: Folder that receives one sub-folder per course. An
	                empty string means the current working directory.
	        """
	        self.BASE_URL = base_url.rstrip("/")
	        self.USERNAME = username
	        self.PASSWORD = password
	        # Appending "/" to an empty folder name would silently redirect all
	        # output to the filesystem root, so an empty value stays empty.
	        self.OUTPUT_FOLDER = output_folder.rstrip("/") + "/" if output_folder else ""

	        self.s = requests_html.HTMLSession()
	        # The response is not inspected, so a rejected login is not detected here.
	        self.s.post(self.BASE_URL + "/webapps/login/", {
	            "user_id": self.USERNAME,
	            "password": self.PASSWORD,
	            "login": "Login",
	            "action": "login",
	            "new_loc": ""
	        })

	    def get_course(self, course_id: str) -> None:
	        """Crawl the content listing pages of one course and save what they contain.

	        Starting from the course's ``listContent.jsp`` page, every linked
	        listing page of the same course is visited once. Each page is written
	        to ``<output>/<course_id>/<page title>/index.html`` and every link on
	        it that contains ``bbcswebdav`` is downloaded into the same folder.

	        Args:
	            course_id: Course identifier as it appears in the page URL.
	        """
	        course_path = self.OUTPUT_FOLDER + course_id + "/"
	        listing_marker = "listContent.jsp?course_id=" + course_id

	        downloaded_files = set()
	        to_visit_links = {self.BASE_URL + "/webapps/blackboard/content/" + listing_marker}
	        visited_links = set()

	        while to_visit_links:
	            new_link = self.expand_link(to_visit_links.pop())
	            visited_links.add(new_link)
	            print(f"GET {new_link} (to visit {len(to_visit_links)}, visited {len(visited_links)})")
	            page = self.s.get(new_link)

	            # A page without a <title> element must not abort the whole crawl.
	            title_element = page.html.find("title", first=True)
	            if title_element is None:
	                title = "untitled"
	            else:
	                title = title_element.text.split("–")[0]  # Not a -, but a – (different character)

	            local_path = course_path + pathvalidate.sanitize_filename(title) + "/"
	            os.makedirs(local_path, exist_ok=True)
	            with open(local_path + "index.html", "w", encoding="utf-8") as f:
	                f.write(page.text)

	            for link in page.html.links:
	                expanded_link = self.expand_link(link)
	                if expanded_link in visited_links:
	                    continue
	                if listing_marker in link:
	                    to_visit_links.add(expanded_link)
	                # De-duplicate on the expanded form so a file that is linked both
	                # relatively and absolutely is only fetched once.
	                if "bbcswebdav" in link and expanded_link not in downloaded_files:
	                    print("DOWNLOAD " + link)
	                    self.download_file(link, local_path)
	                    downloaded_files.add(expanded_link)

	        print("Done.")

	    def download_file(self, link, path):
	        """Download ``link`` into the folder ``path``.

	        The file name is derived from the final URL after redirects. The HTTP
	        status of the response is not checked.

	        Args:
	            link: Absolute URL, or a site-relative one starting with ``/``.
	            path: Target folder; it is created when missing.
	        """
	        os.makedirs(path, exist_ok=True)

	        r = self.s.get(self.expand_link(link), allow_redirects=True, stream=True)
	        try:
	            local_filename = self._filename_from_url(r.url)
	            with open(os.path.join(path, local_filename), "wb") as f:
	                # iter_content undoes gzip/deflate transfer encoding, which a
	                # plain copy of r.raw would write to disk still compressed.
	                for chunk in r.iter_content(chunk_size=64 * 1024):
	                    f.write(chunk)
	        finally:
	            # A streamed response keeps its connection until explicitly released.
	            r.close()

	    @staticmethod
	    def _filename_from_url(url):
	        """Return a safe local file name for the last path segment of ``url``."""
	        url_path = url.split("#", 1)[0].split("?", 1)[0]
	        name = unquote(url_path.rsplit("/", 1)[-1])
	        # After unquoting, "%2F" or "%5C" may have become separators; keep only
	        # the final component so the file cannot land outside the target folder.
	        name = os.path.basename(name.replace("\\", "/"))
	        if name in ("", ".", ".."):
	            name = "download"
	        return name

	    def expand_link(self, link):
	        """Return ``link`` as an absolute URL, prefixing the base URL to site-relative links."""
	        if link.startswith("/"):
	            link = self.BASE_URL + link
	        return link


	if __name__ == "__main__":
	    # Command-line entry point: log in once, then mirror each requested course in turn.
	    parser = argparse.ArgumentParser(description="Download all content from Blackboard")
	    parser.add_argument("url", help="URL to the Blackboard website, e.g. https://blackboard.utwente.nl")
	    parser.add_argument("username", help="Username used to log in to Blackboard")
	    parser.add_argument("password", help="Password used to log in to Blackboard")
	    parser.add_argument("output", help="Path to the output folder, e.g. output/")
	    # One or more course ids may be given; each is scraped with the same session.
	    parser.add_argument("course_id", nargs="+",
	                        help="ID for the course to parse, can be found in the page URL, and looks like _xxxxx_1")

	    args = parser.parse_args()
	    s = Scraper(args.url, args.username, args.password, args.output)
	    for i in args.course_id:
	        s.get_course(i)