ivanistheone/excerpt from ck-12 sraper.py

## excerpt from ck-12 sraper.py
def retrieve_flex_book_lesson(item):
    """Build an HTML5 app node for one flexbook lesson, or return None if it is skipped.

    Downloads the lesson's HTML rendering into a fresh temporary directory
    (running it through ``process_flex_book``), copies in the shared
    stylesheet and MathJax, zips the directory and wraps the zip in an
    ``HTML5AppNode``.

    Args:
        item: lesson record. ``item["handle"]`` and
            ``item["domain"]["encodedID"]`` are read to build the URL; the
            whole record is also passed to ``shared_node_attributes``.

    Returns:
        The ``HTML5AppNode`` for the lesson, or ``None`` when the download
        raised ``TooManyLinksToBotherIncluding`` or ``ContainsNonfreeImage``.
    """
    html_url = "/flx/get/perma/modality/lesson/{lesson_id}/{domain_id}?format=html".format(
        lesson_id=item["handle"],
        domain_id=item["domain"]["encodedID"],
    )

    dst = tempfile.mkdtemp()

    try:
        download_ck12_file(
            html_url,
            dst,
            filename="index.html",
            middleware_callbacks=[process_flex_book],
        )
    except (TooManyLinksToBotherIncluding, ContainsNonfreeImage):
        # The lesson is being skipped, so nothing will read the scratch
        # directory again; remove it instead of leaking one per skipped lesson.
        shutil.rmtree(dst, ignore_errors=True)
        return None

    shutil.copy("resources/flex-styles.css", dst)
    download_mathjax(dst)

    # preview_in_browser(dst)

    zippath = create_predictable_zip(dst)
    return HTML5AppNode(files=[HTMLZipFile(zippath)], **shared_node_attributes(item, title_suffix=" (Flexbook)"))


def _nearby_license_text(img, max_steps=8):
    """Return the lower-cased license named by an ``@@license`` marker near *img*.

    Walks up to *max_steps* parse-tree elements backwards from the image and
    then up to *max_steps* forwards, looking for a text node that contains
    ``@@license``. The license is the text between the first pair of double
    quotes in that node. A marker found after the image overrides one found
    before it. Returns ``""`` when no usable marker is found.
    """
    found = ""
    for step_attr in ("previous_element", "next_element"):
        neighbor = getattr(img, step_attr)
        for _ in range(max_steps):
            if neighbor is None:
                # ran off the start/end of the document
                break
            # only text nodes (strings, comments) can carry the marker text
            if isinstance(neighbor, str) and "@@license" in neighbor:
                pieces = neighbor.split('"')
                if len(pieces) > 1:
                    found = pieces[1].lower()
                break
            neighbor = getattr(neighbor, step_attr)
    return found


def process_flex_book(content, baseurl, destpath, **kwargs):
    """Clean up a downloaded flexbook lesson page and make its images local.

    In order: rejects link-heavy pages; for each image, rejects the page if a
    nearby ``@@license`` marker names a non-open license, otherwise downloads
    the image and rewrites its ``src``; removes iframes, ``id`` attributes and
    empty attribute-less ``<p>`` tags; unwraps every link; removes the
    "Review (Answers)" and "Resources" sections; inserts the shared head and
    body blocks.

    Args:
        content: HTML markup of the lesson page.
        baseurl: not used in this function.
        destpath: directory handed to ``download_ck12_file`` for the images.
        **kwargs: ignored.

    Returns:
        The result of ``html.prettify(encoding="ascii")``.

    Raises:
        TooManyLinksToBotherIncluding: link text is more than 10% of the
            page text.
        ContainsNonfreeImage: an image's license marker mentions one of the
            non-open license keywords.
    """
    html = BeautifulSoup(content, "html.parser")

    # some pages are basically just a list of links; since we strip links out, no use keeping these
    link_text_len = len(" ".join([a.text for a in html.find_all("a")]))
    page_text_len = len(html.text)
    # a page with no text at all would otherwise divide by zero
    ratio = link_text_len / page_text_len if page_text_len else 0.0
    if ratio > 0.1:
        raise TooManyLinksToBotherIncluding()

    nonfree_markers = ("shutterstock", "under license", "permission", "getty", "all rights reserved")

    # download all the images in the page, and replace the img src's to make them local
    for img in html.find_all("img"):
        flx_url = img.get("data-flx-url")
        if flx_url:
            filename = urlparse(flx_url).path.split("/")[-1]
            del img["data-flx-url"]
        else:
            filename = hashlib.md5(img["src"].encode()).hexdigest() + ".png"

        # check whether the image is marked as under a non-open license, and abort if it is
        license_text = _nearby_license_text(img)
        if any(marker in license_text for marker in nonfree_markers):
            # TODO: determine when an image is critical to the flow of the text, and don't skip the entire flexbook if possible
            raise ContainsNonfreeImage()

        img["src"], _ = download_ck12_file(img["src"], destpath, filename=filename)

    for iframe in html.find_all("iframe"):
        # print("Skipping iframe: " + iframe.get("src"))
        iframe.extract()

    # remove all (non-empty) id attributes
    for node in html.find_all():
        if node.get("id"):
            del node["id"]

    # remove empty paragraph tags
    for p in html.find_all("p"):
        # if it has attributes, it might be needed for something
        if p.attrs:
            continue
        # if it has non-empty text nodes, don't remove
        if any(node.strip() for node in p.children if isinstance(node, str)):
            continue
        # if it has non-text nodes, don't remove
        if any(not isinstance(node, str) for node in p.children):
            continue
        # if we got this far, it's just a boring old empty p tag
        p.extract()

    # remove links, keeping their contents
    # NOTE: linked files (e.g. PDFs) are not downloaded; every link is unwrapped.
    for a in html.find_all("a"):
        a.unwrap()

    # remove extra sections at end that we don't want
    for h3 in html.find_all("h3"):
        if h3.text in ["Review (Answers)", "Resources"]:
            node = h3
            while True:
                # grab the following sibling *before* detaching the current node
                node, _ = node.next_sibling, node.extract()
                # stop at the end of the siblings, the next section heading, or the end marker
                if not node or node.name == "h3" or str(node).strip().startswith("End inserted XHTML"):
                    break

    # insert the shared head and body resources into the html
    insert_codeblock_into_soup_node("resources/flex-headblock.html", html.head)
    insert_codeblock_into_soup_node("resources/flex-bodyblock.html", html.body)

    return html.prettify(encoding="ascii")


def insert_codeblock_into_soup_node(filepath, soupnode, position=-1):
    """Parse the HTML snippet stored at *filepath* and insert its top-level nodes into *soupnode*.

    Args:
        filepath: path of the HTML snippet file to read.
        soupnode: the soup node that receives the parsed nodes.
        position: index passed to ``soupnode.insert`` for every node.
    """
    with open(filepath) as snippet_file:
        markup = snippet_file.read()

    fragment = BeautifulSoup(markup, "html.parser")

    # iterate over a snapshot of the children rather than the live iterator,
    # since each node is handed over to another tree inside the loop
    top_level_nodes = list(fragment.children)
    for child in top_level_nodes:
        soupnode.insert(position, child)


def perform_mockjax_api_downloads(filepath, domain_id, domain_handle, question_id, artifact_id):
    """Download a fixed set of API endpoints and record each one as a mockjax mock in *filepath*.

    The first three requests (test info, question instance, test start) are
    made with the ``asmt-plix-trial`` cookie set; the test id used by the
    third is read from the response to the first.

    Args:
        filepath: path of the mockjax script that collects the mocks.
        domain_id: concatenated into the question-browse and taxonomy URLs
            (so it must be a ``str``).
        domain_handle: handle substituted into the modalities URL.
        question_id: concatenated into the test-info and question-instance
            URLs (so it must be a ``str``).
        artifact_id: id substituted into the artifact-info URL.
    """
    cookies = {"asmt-plix-trial": "1"}
    resp = download_and_mock_api_endpoint_with_mockjax("/assessment/api/get/info/test/plix%20practice/plixID/" + question_id, filepath, cookies=cookies)
    test_id = json.loads(resp)["response"]["test"]["_id"]
    download_and_mock_api_endpoint_with_mockjax("/assessment/api/render/questionInstance?evalData=True&includeConcepts=True&ans=True&qID=" + question_id, filepath, cookies=cookies)
    download_and_mock_api_endpoint_with_mockjax("/assessment/api/start/tests/{test_id}?instanceBundle=true&evalData=true&includePLIX=true".format(test_id=test_id), filepath, cookies=cookies)

    download_and_mock_api_endpoint_with_mockjax("/flx/get/minimal/modalities/{handle}?ownedBy=ck12&modalities=lesson".format(handle=domain_handle), filepath)
    download_and_mock_api_endpoint_with_mockjax("/flx/get/info/artifact/{a_id}".format(a_id=artifact_id), filepath)
    download_and_mock_api_endpoint_with_mockjax("/assessment/api/browse/info/questions/geometry-interactive?pageNum=1&pageSize=6&filters=encodedIDs," + domain_id, filepath)
    download_and_mock_api_endpoint_with_mockjax("/taxonomy/get/info/concept/" + domain_id, filepath)
    # raw string: "\." is a regex escape, not a Python one (same value as before, without the invalid-escape warning)
    download_and_mock_api_endpoint_with_mockjax("/assessment/tools/geometry-tool/challengeMeTemp.html", filepath, mocked_url_pattern=r".*challengeMeTemp\.html", wrap_in_string=True)


def download_and_mock_api_endpoint_with_mockjax(url, xhr_mock_path, mocked_url_pattern=None, middleware_callbacks=None, middleware_kwargs=None, cookies=None, wrap_in_string=False):
    """Fetch *url* and add a ``$.mockjax`` entry replaying its response to the script at *xhr_mock_path*.

    Args:
        url: URL to download (passed through ``make_fully_qualified_url``).
        xhr_mock_path: path of the mockjax script. It is created from a
            template if missing; otherwise the new entry is inserted at its
            ``// insertion point for more lines`` marker.
        mocked_url_pattern: regex source matched against request URLs to
            decide when to mock. Defaults to "anything containing the path
            of *url*".
        middleware_callbacks: a callable or list of callables, each called as
            ``callback(content, **middleware_kwargs)`` to transform the
            downloaded text.
        middleware_kwargs: keyword arguments for the callbacks; ``None``
            means no extra arguments.
        cookies: cookies passed to ``make_request``.
        wrap_in_string: if true, the content is wrapped with ``repr()`` so it
            is emitted as a quoted string rather than inline.

    Returns:
        The content after middleware and optional ``repr()`` wrapping.
    """
    # mocked_url_pattern is matched against request URLs to decide when to mock; if not set, use actual url's path
    # ("/" is backslash-escaped because the pattern is emitted inside a /.../ regex literal)
    mocked_url_pattern = mocked_url_pattern or (".*" + urlparse(url).path.replace("/", "\\/") + ".*")

    # make the request to the URL
    content = make_request(make_fully_qualified_url(url), cookies=cookies).content.decode()

    # if there are any middleware callbacks, apply them to the content
    if middleware_callbacks:
        if not isinstance(middleware_callbacks, list):
            middleware_callbacks = [middleware_callbacks]
        # middleware_kwargs defaults to None, which cannot be **-expanded
        callback_kwargs = middleware_kwargs or {}
        for callback in middleware_callbacks:
            content = callback(content, **callback_kwargs)

    divider = "// insertion point for more lines"

    if os.path.exists(xhr_mock_path):
        with open(xhr_mock_path, "r") as f:
            xhr_mock_text = f.read()
    else:
        xhr_mock_text = """
            define(["mockjax"], function(mockjax) {{
                {divider}
            }});
        """.format(divider=divider)

    if wrap_in_string:
        # NOTE(review): repr() produces a Python string literal that is used
        # here as a JS one; json.dumps would be stricter, but the wrapped
        # value is also returned to the caller -- confirm before changing.
        content = repr(content)

    # newlines are flattened so the entry stays on a single line
    mock_line = """
            $.mockjax({{url: /{url}/i, responseText: {content}}});
            """.format(url=mocked_url_pattern, content=content.replace("\n", " ").replace("\r", " "))

    # if an existing file lacks the divider, this replace is a no-op and the file is rewritten unchanged
    xhr_mock_text = xhr_mock_text.replace(divider, mock_line + divider)

    # build the full text before opening for write, so a failure above cannot truncate an existing file
    with open(xhr_mock_path, "w") as f:
        f.write(xhr_mock_text)

    return content


def add_xhr_mocking_to_index(content, destpath, **kwargs):
    """Add the XHR mocking scripts to a page and return the modified markup.

    Copies the static mock scripts from ``resources`` into *destpath*,
    generates ``xhr.mock.apidownloads.js`` there via
    ``download_and_mock_api_endpoint_with_xhr``, and appends one ``<script>``
    tag per script to the page's ``<head>``.

    Args:
        content: HTML markup of the page.
        destpath: directory that receives the script files.
        **kwargs: ignored.

    Returns:
        The modified document as a ``str``.
    """
    soup = BeautifulSoup(content, "html.parser")

    # static resources shipped with the scraper
    static_scripts = ["xhr.mock.js", "plix-xhr.mock.js"]
    for script_name in static_scripts:
        shutil.copy(os.path.join("resources", script_name), destpath)

    # script generated from downloaded responses
    generated_script = "xhr.mock.apidownloads.js"
    download_and_mock_api_endpoint_with_xhr(
        "http://google.com/",
        destpath,
        generated_script,
        mocked_url="./build/././modalityAssign/js/templates/modal.info.html",
    )

    # reference every script from the page head, relative to the page itself
    for script_name in static_scripts + [generated_script]:
        script_tag = soup.new_tag("script", src="./" + script_name)
        soup.head.append(script_tag)

    return str(soup)


def download_and_mock_api_endpoint_with_xhr(url, destpath, filename, mocked_url=None, middleware_callbacks=None, middleware_kwargs=None):
    """Fetch *url* and append an ``xhr_mock.mock_conditionally`` entry replaying it to ``destpath/filename``.

    Args:
        url: URL to download (passed through ``make_fully_qualified_url``).
        destpath: directory containing the mock script.
        filename: name of the mock script; it is opened in append mode.
        mocked_url: the URL to match against requests to decide when to
            mock. Defaults to *url*.
        middleware_callbacks: a callable or list of callables, each called as
            ``callback(content, **middleware_kwargs)`` to transform the
            downloaded text.
        middleware_kwargs: keyword arguments for the callbacks; ``None``
            means no extra arguments.

    Returns:
        The downloaded content after any middleware has been applied.
    """
    # mocked_url is the url to match against requests to decide when to mock; if not set, use actual url
    mocked_url = mocked_url or url

    # make the request to the URL
    content = make_request(make_fully_qualified_url(url)).content.decode()

    # if there are any middleware callbacks, apply them to the content
    if middleware_callbacks:
        if not isinstance(middleware_callbacks, list):
            middleware_callbacks = [middleware_callbacks]
        # middleware_kwargs defaults to None, which cannot be **-expanded
        callback_kwargs = middleware_kwargs or {}
        for callback in middleware_callbacks:
            content = callback(content, **callback_kwargs)

    xhr_mock_path = os.path.join(destpath, filename)

    # the content is JSON-encoded so it is emitted as a quoted string
    mock_line = """
        xhr_mock.mock_conditionally({{url: "{url}"}},  {content});
        """.format(url=mocked_url, content=json.dumps(content))

    with open(xhr_mock_path, "a") as f:
        f.write(mock_line)

    return content
	def retrieve_flex_book_lesson(item):
	    """Build an HTML5 app node for one flexbook lesson, or return None if it is skipped.

	    Downloads the lesson's HTML rendering into a fresh temporary directory
	    (running it through ``process_flex_book``), copies in the shared
	    stylesheet and MathJax, zips the directory and wraps the zip in an
	    ``HTML5AppNode``.

	    Args:
	        item: lesson record. ``item["handle"]`` and
	            ``item["domain"]["encodedID"]`` are read to build the URL; the
	            whole record is also passed to ``shared_node_attributes``.

	    Returns:
	        The ``HTML5AppNode`` for the lesson, or ``None`` when the download
	        raised ``TooManyLinksToBotherIncluding`` or ``ContainsNonfreeImage``.
	    """
	    html_url = "/flx/get/perma/modality/lesson/{lesson_id}/{domain_id}?format=html".format(
	        lesson_id=item["handle"],
	        domain_id=item["domain"]["encodedID"],
	    )

	    dst = tempfile.mkdtemp()

	    try:
	        download_ck12_file(
	            html_url,
	            dst,
	            filename="index.html",
	            middleware_callbacks=[process_flex_book],
	        )
	    except (TooManyLinksToBotherIncluding, ContainsNonfreeImage):
	        # The lesson is being skipped, so nothing will read the scratch
	        # directory again; remove it instead of leaking one per skipped lesson.
	        shutil.rmtree(dst, ignore_errors=True)
	        return None

	    shutil.copy("resources/flex-styles.css", dst)
	    download_mathjax(dst)

	    # preview_in_browser(dst)

	    zippath = create_predictable_zip(dst)
	    return HTML5AppNode(files=[HTMLZipFile(zippath)], **shared_node_attributes(item, title_suffix=" (Flexbook)"))


	def _nearby_license_text(img, max_steps=8):
	    """Return the lower-cased license named by an ``@@license`` marker near *img*.

	    Walks up to *max_steps* parse-tree elements backwards from the image and
	    then up to *max_steps* forwards, looking for a text node that contains
	    ``@@license``. The license is the text between the first pair of double
	    quotes in that node. A marker found after the image overrides one found
	    before it. Returns ``""`` when no usable marker is found.
	    """
	    found = ""
	    for step_attr in ("previous_element", "next_element"):
	        neighbor = getattr(img, step_attr)
	        for _ in range(max_steps):
	            if neighbor is None:
	                # ran off the start/end of the document
	                break
	            # only text nodes (strings, comments) can carry the marker text
	            if isinstance(neighbor, str) and "@@license" in neighbor:
	                pieces = neighbor.split('"')
	                if len(pieces) > 1:
	                    found = pieces[1].lower()
	                break
	            neighbor = getattr(neighbor, step_attr)
	    return found


	def process_flex_book(content, baseurl, destpath, **kwargs):
	    """Clean up a downloaded flexbook lesson page and make its images local.

	    In order: rejects link-heavy pages; for each image, rejects the page if a
	    nearby ``@@license`` marker names a non-open license, otherwise downloads
	    the image and rewrites its ``src``; removes iframes, ``id`` attributes and
	    empty attribute-less ``<p>`` tags; unwraps every link; removes the
	    "Review (Answers)" and "Resources" sections; inserts the shared head and
	    body blocks.

	    Args:
	        content: HTML markup of the lesson page.
	        baseurl: not used in this function.
	        destpath: directory handed to ``download_ck12_file`` for the images.
	        **kwargs: ignored.

	    Returns:
	        The result of ``html.prettify(encoding="ascii")``.

	    Raises:
	        TooManyLinksToBotherIncluding: link text is more than 10% of the
	            page text.
	        ContainsNonfreeImage: an image's license marker mentions one of the
	            non-open license keywords.
	    """
	    html = BeautifulSoup(content, "html.parser")

	    # some pages are basically just a list of links; since we strip links out, no use keeping these
	    link_text_len = len(" ".join([a.text for a in html.find_all("a")]))
	    page_text_len = len(html.text)
	    # a page with no text at all would otherwise divide by zero
	    ratio = link_text_len / page_text_len if page_text_len else 0.0
	    if ratio > 0.1:
	        raise TooManyLinksToBotherIncluding()

	    nonfree_markers = ("shutterstock", "under license", "permission", "getty", "all rights reserved")

	    # download all the images in the page, and replace the img src's to make them local
	    for img in html.find_all("img"):
	        flx_url = img.get("data-flx-url")
	        if flx_url:
	            filename = urlparse(flx_url).path.split("/")[-1]
	            del img["data-flx-url"]
	        else:
	            filename = hashlib.md5(img["src"].encode()).hexdigest() + ".png"

	        # check whether the image is marked as under a non-open license, and abort if it is
	        license_text = _nearby_license_text(img)
	        if any(marker in license_text for marker in nonfree_markers):
	            # TODO: determine when an image is critical to the flow of the text, and don't skip the entire flexbook if possible
	            raise ContainsNonfreeImage()

	        img["src"], _ = download_ck12_file(img["src"], destpath, filename=filename)

	    for iframe in html.find_all("iframe"):
	        # print("Skipping iframe: " + iframe.get("src"))
	        iframe.extract()

	    # remove all (non-empty) id attributes
	    for node in html.find_all():
	        if node.get("id"):
	            del node["id"]

	    # remove empty paragraph tags
	    for p in html.find_all("p"):
	        # if it has attributes, it might be needed for something
	        if p.attrs:
	            continue
	        # if it has non-empty text nodes, don't remove
	        if any(node.strip() for node in p.children if isinstance(node, str)):
	            continue
	        # if it has non-text nodes, don't remove
	        if any(not isinstance(node, str) for node in p.children):
	            continue
	        # if we got this far, it's just a boring old empty p tag
	        p.extract()

	    # remove links, keeping their contents
	    # NOTE: linked files (e.g. PDFs) are not downloaded; every link is unwrapped.
	    for a in html.find_all("a"):
	        a.unwrap()

	    # remove extra sections at end that we don't want
	    for h3 in html.find_all("h3"):
	        if h3.text in ["Review (Answers)", "Resources"]:
	            node = h3
	            while True:
	                # grab the following sibling *before* detaching the current node
	                node, _ = node.next_sibling, node.extract()
	                # stop at the end of the siblings, the next section heading, or the end marker
	                if not node or node.name == "h3" or str(node).strip().startswith("End inserted XHTML"):
	                    break

	    # insert the shared head and body resources into the html
	    insert_codeblock_into_soup_node("resources/flex-headblock.html", html.head)
	    insert_codeblock_into_soup_node("resources/flex-bodyblock.html", html.body)

	    return html.prettify(encoding="ascii")


	def insert_codeblock_into_soup_node(filepath, soupnode, position=-1):
	    """Parse the HTML snippet stored at *filepath* and insert its top-level nodes into *soupnode*.

	    Args:
	        filepath: path of the HTML snippet file to read.
	        soupnode: the soup node that receives the parsed nodes.
	        position: index passed to ``soupnode.insert`` for every node.
	    """
	    with open(filepath) as f:
	        newsoup = BeautifulSoup(f.read(), "html.parser")

	    # iterate over a snapshot of the children rather than the live iterator,
	    # since each node is handed over to another tree inside the loop
	    for node in list(newsoup.children):
	        soupnode.insert(position, node)


	def perform_mockjax_api_downloads(filepath, domain_id, domain_handle, question_id, artifact_id):
	    """Download a fixed set of API endpoints and record each one as a mockjax mock in *filepath*.

	    The first three requests (test info, question instance, test start) are
	    made with the ``asmt-plix-trial`` cookie set; the test id used by the
	    third is read from the response to the first.

	    Args:
	        filepath: path of the mockjax script that collects the mocks.
	        domain_id: concatenated into the question-browse and taxonomy URLs
	            (so it must be a ``str``).
	        domain_handle: handle substituted into the modalities URL.
	        question_id: concatenated into the test-info and question-instance
	            URLs (so it must be a ``str``).
	        artifact_id: id substituted into the artifact-info URL.
	    """
	    cookies = {"asmt-plix-trial": "1"}
	    resp = download_and_mock_api_endpoint_with_mockjax("/assessment/api/get/info/test/plix%20practice/plixID/" + question_id, filepath, cookies=cookies)
	    test_id = json.loads(resp)["response"]["test"]["_id"]
	    download_and_mock_api_endpoint_with_mockjax("/assessment/api/render/questionInstance?evalData=True&includeConcepts=True&ans=True&qID=" + question_id, filepath, cookies=cookies)
	    download_and_mock_api_endpoint_with_mockjax("/assessment/api/start/tests/{test_id}?instanceBundle=true&evalData=true&includePLIX=true".format(test_id=test_id), filepath, cookies=cookies)

	    download_and_mock_api_endpoint_with_mockjax("/flx/get/minimal/modalities/{handle}?ownedBy=ck12&modalities=lesson".format(handle=domain_handle), filepath)
	    download_and_mock_api_endpoint_with_mockjax("/flx/get/info/artifact/{a_id}".format(a_id=artifact_id), filepath)
	    download_and_mock_api_endpoint_with_mockjax("/assessment/api/browse/info/questions/geometry-interactive?pageNum=1&pageSize=6&filters=encodedIDs," + domain_id, filepath)
	    download_and_mock_api_endpoint_with_mockjax("/taxonomy/get/info/concept/" + domain_id, filepath)
	    # raw string: "\." is a regex escape, not a Python one (same value as before, without the invalid-escape warning)
	    download_and_mock_api_endpoint_with_mockjax("/assessment/tools/geometry-tool/challengeMeTemp.html", filepath, mocked_url_pattern=r".*challengeMeTemp\.html", wrap_in_string=True)


	def download_and_mock_api_endpoint_with_mockjax(url, xhr_mock_path, mocked_url_pattern=None, middleware_callbacks=None, middleware_kwargs=None, cookies=None, wrap_in_string=False):
	    """Fetch *url* and add a ``$.mockjax`` entry replaying its response to the script at *xhr_mock_path*.

	    Args:
	        url: URL to download (passed through ``make_fully_qualified_url``).
	        xhr_mock_path: path of the mockjax script. It is created from a
	            template if missing; otherwise the new entry is inserted at its
	            ``// insertion point for more lines`` marker.
	        mocked_url_pattern: regex source matched against request URLs to
	            decide when to mock. Defaults to "anything containing the path
	            of *url*".
	        middleware_callbacks: a callable or list of callables, each called as
	            ``callback(content, **middleware_kwargs)`` to transform the
	            downloaded text.
	        middleware_kwargs: keyword arguments for the callbacks; ``None``
	            means no extra arguments.
	        cookies: cookies passed to ``make_request``.
	        wrap_in_string: if true, the content is wrapped with ``repr()`` so it
	            is emitted as a quoted string rather than inline.

	    Returns:
	        The content after middleware and optional ``repr()`` wrapping.
	    """
	    # mocked_url_pattern is matched against request URLs to decide when to mock; if not set, use actual url's path
	    # (".*" on both sides so the path may appear anywhere in the request URL, including at its very end;
	    # "/" is backslash-escaped because the pattern is emitted inside a /.../ regex literal)
	    mocked_url_pattern = mocked_url_pattern or (".*" + urlparse(url).path.replace("/", "\\/") + ".*")

	    # make the request to the URL
	    content = make_request(make_fully_qualified_url(url), cookies=cookies).content.decode()

	    # if there are any middleware callbacks, apply them to the content
	    if middleware_callbacks:
	        if not isinstance(middleware_callbacks, list):
	            middleware_callbacks = [middleware_callbacks]
	        # middleware_kwargs defaults to None, which cannot be **-expanded
	        callback_kwargs = middleware_kwargs or {}
	        for callback in middleware_callbacks:
	            content = callback(content, **callback_kwargs)

	    divider = "// insertion point for more lines"

	    if os.path.exists(xhr_mock_path):
	        with open(xhr_mock_path, "r") as f:
	            xhr_mock_text = f.read()
	    else:
	        xhr_mock_text = """
	define(["mockjax"], function(mockjax) {{
	{divider}
	}});
	""".format(divider=divider)

	    if wrap_in_string:
	        # NOTE(review): repr() produces a Python string literal that is used
	        # here as a JS one; json.dumps would be stricter, but the wrapped
	        # value is also returned to the caller -- confirm before changing.
	        content = repr(content)

	    # newlines are flattened so the entry stays on a single line
	    mock_line = """
	$.mockjax({{url: /{url}/i, responseText: {content}}});
	""".format(url=mocked_url_pattern, content=content.replace("\n", " ").replace("\r", " "))

	    # if an existing file lacks the divider, this replace is a no-op and the file is rewritten unchanged
	    xhr_mock_text = xhr_mock_text.replace(divider, mock_line + divider)

	    # build the full text before opening for write, so a failure above cannot truncate an existing file
	    with open(xhr_mock_path, "w") as f:
	        f.write(xhr_mock_text)

	    return content


	def add_xhr_mocking_to_index(content, destpath, **kwargs):
	    """Add the XHR mocking scripts to a page and return the modified markup.

	    Copies the static mock scripts from ``resources`` into *destpath*,
	    generates ``xhr.mock.apidownloads.js`` there via
	    ``download_and_mock_api_endpoint_with_xhr``, and appends one ``<script>``
	    tag per script to the page's ``<head>``.

	    Args:
	        content: HTML markup of the page.
	        destpath: directory that receives the script files.
	        **kwargs: ignored.

	    Returns:
	        The modified document as a ``str``.
	    """
	    html = BeautifulSoup(content, "html.parser")

	    # copy over the static resources that are needed
	    xhr_deps = ["xhr.mock.js", "plix-xhr.mock.js"]
	    for dep in xhr_deps:
	        shutil.copy(os.path.join("resources", dep), destpath)

	    # download necessary API endpoints
	    downloaded_mock_filename = "xhr.mock.apidownloads.js"
	    download_and_mock_api_endpoint_with_xhr("http://google.com/", destpath, downloaded_mock_filename, mocked_url="./build/././modalityAssign/js/templates/modal.info.html")

	    # insert script tags into page, referenced relative to the page itself
	    scripts_to_insert = ["./" + filename for filename in xhr_deps + [downloaded_mock_filename]]
	    for script in scripts_to_insert:
	        html.head.append(html.new_tag("script", src=script))

	    return str(html)



	def download_and_mock_api_endpoint_with_xhr(url, destpath, filename, mocked_url=None, middleware_callbacks=None, middleware_kwargs=None):
	    """Fetch *url* and append an ``xhr_mock.mock_conditionally`` entry replaying it to ``destpath/filename``.

	    Args:
	        url: URL to download (passed through ``make_fully_qualified_url``).
	        destpath: directory containing the mock script.
	        filename: name of the mock script; it is opened in append mode.
	        mocked_url: the URL to match against requests to decide when to
	            mock. Defaults to *url*.
	        middleware_callbacks: a callable or list of callables, each called as
	            ``callback(content, **middleware_kwargs)`` to transform the
	            downloaded text.
	        middleware_kwargs: keyword arguments for the callbacks; ``None``
	            means no extra arguments.

	    Returns:
	        The downloaded content after any middleware has been applied.
	    """
	    # mocked_url is the url to match against requests to decide when to mock; if not set, use actual url
	    mocked_url = mocked_url or url

	    # make the request to the URL
	    content = make_request(make_fully_qualified_url(url)).content.decode()

	    # if there are any middleware callbacks, apply them to the content
	    if middleware_callbacks:
	        if not isinstance(middleware_callbacks, list):
	            middleware_callbacks = [middleware_callbacks]
	        # middleware_kwargs defaults to None, which cannot be **-expanded
	        callback_kwargs = middleware_kwargs or {}
	        for callback in middleware_callbacks:
	            content = callback(content, **callback_kwargs)

	    xhr_mock_path = os.path.join(destpath, filename)

	    # the content is JSON-encoded so it is emitted as a quoted string
	    mock_line = """
	xhr_mock.mock_conditionally({{url: "{url}"}}, {content});
	""".format(url=mocked_url, content=json.dumps(content))

	    with open(xhr_mock_path, "a") as f:
	        f.write(mock_line)

	    return content