Skip to content

Instantly share code, notes, and snippets.

@ivanistheone
Created August 20, 2020 10:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ivanistheone/ed654d3ccff7ef9f52891e056beaaf4e to your computer and use it in GitHub Desktop.
Save ivanistheone/ed654d3ccff7ef9f52891e056beaaf4e to your computer and use it in GitHub Desktop.
def retrieve_flex_book_lesson(item):
    """Build an HTML5 app node for the Flexbook lesson described by ``item``.

    The lesson HTML is downloaded into a fresh temporary directory with
    ``process_flex_book`` registered as a middleware callback, then the shared
    stylesheet and MathJax are added and the directory is zipped.

    Args:
        item: Mapping providing ``item["handle"]`` and
            ``item["domain"]["encodedID"]``; it is also forwarded to
            ``shared_node_attributes``.

    Returns:
        An ``HTML5AppNode`` wrapping the zipped lesson, or ``None`` when the
        download raised ``TooManyLinksToBotherIncluding`` or
        ``ContainsNonfreeImage``.
    """
    lesson_url = "/flx/get/perma/modality/lesson/{lesson_id}/{domain_id}?format=html".format(
        lesson_id=item["handle"],
        domain_id=item["domain"]["encodedID"],
    )
    workdir = tempfile.mkdtemp()

    try:
        download_ck12_file(
            lesson_url,
            workdir,
            filename="index.html",
            middleware_callbacks=[process_flex_book],
        )
    except (TooManyLinksToBotherIncluding, ContainsNonfreeImage):
        # Either condition means this lesson is not worth (or not allowed to be) packaged.
        return None

    shutil.copy("resources/flex-styles.css", workdir)
    download_mathjax(workdir)
    # Debugging aid: preview_in_browser(workdir)

    zip_file = HTMLZipFile(create_predictable_zip(workdir))
    node_attributes = shared_node_attributes(item, title_suffix=" (Flexbook)")
    return HTML5AppNode(files=[zip_file], **node_attributes)
# Substrings that mark an image license as non-open.
_NONFREE_LICENSE_MARKERS = (
    "shutterstock",
    "under license",
    "permission",
    "getty",
    "all rights reserved",
)


def _license_from_neighbors(img, max_steps=8):
    """Return the lower-cased license text from an ``@@license`` marker near ``img``.

    Up to ``max_steps`` parse-tree elements are inspected before the image and
    then up to ``max_steps`` after it. A marker found after the image takes
    precedence over one found before it. Returns ``""`` when no marker is found.
    """
    found = ""
    for direction in ("previous_element", "next_element"):
        element = getattr(img, direction)
        for _ in range(max_steps):
            # The walk runs off the document when the image sits near its start or end.
            if element is None:
                break
            # Only string nodes (text and comments) can carry the marker; on a Tag,
            # ``in`` would test child membership rather than a substring.
            if isinstance(element, str) and "@@license" in element:
                pieces = element.split('"')
                # The license is normally the first quoted value; if the marker has no
                # quotes, fall back to the whole text so the keyword check still applies.
                found = (pieces[1] if len(pieces) > 1 else element).lower()
                break
            element = getattr(element, direction)
    return found


def process_flex_book(content, baseurl, destpath, **kwargs):
    """Clean up a Flexbook lesson page and localize its images.

    Steps, in order: reject link-heavy pages, download every image and point
    its ``src`` at the local copy (aborting on a non-open license), drop
    iframes, ``id`` attributes and empty ``<p>`` tags, unwrap links, remove the
    "Review (Answers)" and "Resources" sections, and insert the shared head and
    body resource blocks.

    Args:
        content: HTML markup of the lesson page.
        baseurl: Unused here; accepted as part of the middleware call signature.
        destpath: Directory passed to ``download_ck12_file`` for image downloads.
        **kwargs: Ignored.

    Returns:
        The prettified document encoded as ASCII (``prettify(encoding="ascii")``).

    Raises:
        TooManyLinksToBotherIncluding: If link text makes up more than 10% of
            the page text.
        ContainsNonfreeImage: If an image's nearby license marker names a
            non-open license.
    """
    html = BeautifulSoup(content, "html.parser")

    # some pages are basically just a list of links; since we strip links out, no use keeping these
    link_text_length = len(" ".join(a.text for a in html.find_all("a")))
    total_text_length = len(html.text)
    # A page without any text cannot be "mostly links"; this also avoids dividing by zero.
    if total_text_length and link_text_length / total_text_length > 0.1:
        raise TooManyLinksToBotherIncluding()

    # download all the images in the page, and replace the img src's to make them local
    for img in html.find_all("img"):
        flx_url = img.get("data-flx-url")
        if flx_url:
            filename = urlparse(flx_url).path.split("/")[-1]
            del img["data-flx-url"]
        else:
            filename = hashlib.md5(img["src"].encode()).hexdigest() + ".png"

        # check whether the image is marked as under a non-open license, and abort if it is
        license_text = _license_from_neighbors(img)
        if any(marker in license_text for marker in _NONFREE_LICENSE_MARKERS):
            # TODO: determine when an image is critical to the flow of the text, and don't skip the entire flexbook if possible
            raise ContainsNonfreeImage()

        img["src"], _ = download_ck12_file(img["src"], destpath, filename=filename)

    # iframes are removed from the page entirely
    for iframe in html.find_all("iframe"):
        iframe.extract()

    # remove all (non-empty) id attributes
    for node in html.find_all():
        if node.get("id"):
            del node["id"]

    # remove empty paragraph tags: no attributes (they might be needed for something),
    # no child tags, and only whitespace text
    for p in html.find_all("p"):
        if p.attrs:
            continue
        if all(isinstance(child, str) and not child.strip() for child in p.children):
            p.extract()

    # links are stripped, keeping only their contents
    for a in html.find_all("a"):
        a.unwrap()

    # remove extra sections at end that we don't want
    for h3 in html.find_all("h3"):
        if h3.text in ("Review (Answers)", "Resources"):
            node = h3
            while True:
                # grab the sibling before extracting, since extraction detaches the node
                following = node.next_sibling
                node.extract()
                node = following
                if not node or node.name == "h3" or str(node).strip().startswith("End inserted XHTML"):
                    break

    # insert the shared head and body resources into the html
    insert_codeblock_into_soup_node("resources/flex-headblock.html", html.head)
    insert_codeblock_into_soup_node("resources/flex-bodyblock.html", html.body)

    return html.prettify(encoding="ascii")
def insert_codeblock_into_soup_node(filepath, soupnode, position=-1):
    """Insert the top-level nodes of the HTML snippet at ``filepath`` into ``soupnode``.

    Args:
        filepath: Path of the HTML snippet file to read.
        soupnode: Node that receives the parsed snippet nodes.
        position: Index passed to ``soupnode.insert`` for every node.
    """
    with open(filepath) as snippet_file:
        snippet_markup = snippet_file.read()

    fragment = BeautifulSoup(snippet_markup, "html.parser")
    # Snapshot the children first so the loop is unaffected as nodes are moved elsewhere.
    top_level_nodes = list(fragment.children)
    for child in top_level_nodes:
        soupnode.insert(position, child)
def perform_mockjax_api_downloads(filepath, domain_id, domain_handle, question_id, artifact_id):
    """Download the API responses a PLIX needs and register them as mockjax mocks.

    Every response is added to the mock file at ``filepath`` through
    ``download_and_mock_api_endpoint_with_mockjax``. The first response is
    parsed as JSON to obtain the test id used by a later request.

    Args:
        filepath: Path of the mockjax mock file to create or extend.
        domain_id: Value appended to the question-browse and taxonomy URLs.
        domain_handle: Value substituted into the modalities URL.
        question_id: String appended to the PLIX test-info and question URLs.
        artifact_id: Value substituted into the artifact-info URL.
    """
    cookies = {"asmt-plix-trial": "1"}

    resp = download_and_mock_api_endpoint_with_mockjax(
        "/assessment/api/get/info/test/plix%20practice/plixID/" + question_id,
        filepath,
        cookies=cookies,
    )
    test_id = json.loads(resp)["response"]["test"]["_id"]

    download_and_mock_api_endpoint_with_mockjax(
        "/assessment/api/render/questionInstance?evalData=True&includeConcepts=True&ans=True&qID=" + question_id,
        filepath,
        cookies=cookies,
    )
    download_and_mock_api_endpoint_with_mockjax(
        "/assessment/api/start/tests/{test_id}?instanceBundle=true&evalData=true&includePLIX=true".format(test_id=test_id),
        filepath,
        cookies=cookies,
    )
    download_and_mock_api_endpoint_with_mockjax(
        "/flx/get/minimal/modalities/{handle}?ownedBy=ck12&modalities=lesson".format(handle=domain_handle),
        filepath,
    )
    download_and_mock_api_endpoint_with_mockjax(
        "/flx/get/info/artifact/{a_id}".format(a_id=artifact_id),
        filepath,
    )
    download_and_mock_api_endpoint_with_mockjax(
        "/assessment/api/browse/info/questions/geometry-interactive?pageNum=1&pageSize=6&filters=encodedIDs," + domain_id,
        filepath,
    )
    download_and_mock_api_endpoint_with_mockjax(
        "/taxonomy/get/info/concept/" + domain_id,
        filepath,
    )
    # Raw string: "\." is a regex escape for the generated JS pattern, not a Python escape.
    download_and_mock_api_endpoint_with_mockjax(
        "/assessment/tools/geometry-tool/challengeMeTemp.html",
        filepath,
        mocked_url_pattern=r".*challengeMeTemp\.html",
        wrap_in_string=True,
    )
def download_and_mock_api_endpoint_with_mockjax(url, xhr_mock_path, mocked_url_pattern=None, middleware_callbacks=None, middleware_kwargs=None, cookies=None, wrap_in_string=False):
    """Fetch ``url`` and add a ``$.mockjax`` rule serving its content to the mock file.

    Args:
        url: URL to request (made fully qualified before the request).
        xhr_mock_path: Path of the mock file; created from a template when it
            does not exist yet, otherwise extended at its insertion marker.
        mocked_url_pattern: Body of the JavaScript regex matched against request
            URLs to decide when to mock. Defaults to the path of ``url`` with
            slashes escaped, surrounded by ``.*``.
        middleware_callbacks: A callable or list of callables applied in order
            as ``content = callback(content, **middleware_kwargs)``.
        middleware_kwargs: Keyword arguments for the callbacks; ``None`` means
            no extra arguments.
        cookies: Cookies forwarded to ``make_request``.
        wrap_in_string: When true, the content is replaced by its ``repr`` so it
            is emitted as a quoted string literal.

    Returns:
        The content after middleware processing, and after ``repr`` wrapping
        when ``wrap_in_string`` is set.
    """
    # mocked_url_pattern is matched against request URLs to decide when to mock; if not set, use actual url's path
    if not mocked_url_pattern:
        # "/" must be escaped because the pattern is emitted inside a /.../ regex literal
        mocked_url_pattern = ".*" + urlparse(url).path.replace("/", "\\/") + ".*"

    # make the request to the URL
    content = make_request(make_fully_qualified_url(url), cookies=cookies).content.decode()

    # if there are any middleware callbacks, apply them to the content
    if middleware_callbacks:
        if not isinstance(middleware_callbacks, list):
            middleware_callbacks = [middleware_callbacks]
        # ``**None`` raises TypeError, so treat a missing kwargs dict as empty
        callback_kwargs = middleware_kwargs or {}
        for callback in middleware_callbacks:
            content = callback(content, **callback_kwargs)

    divider = "// insertion point for more lines"

    if os.path.exists(xhr_mock_path):
        with open(xhr_mock_path, "r") as f:
            xhr_mock_text = f.read()
    else:
        xhr_mock_text = (
            "\n"
            'define(["mockjax"], function(mockjax) {{\n'
            "{divider}\n"
            "}});\n"
        ).format(divider=divider)

    if wrap_in_string:
        content = repr(content)

    # Build the complete new text before opening the file for writing, so a failure
    # here cannot leave the existing mock file truncated.
    mock_line = (
        "\n"
        "$.mockjax({{url: /{url}/i, responseText: {content}}});\n"
    ).format(url=mocked_url_pattern, content=content.replace("\n", " ").replace("\r", " "))
    # New rules go just before the divider, so the divider stays available for later calls.
    xhr_mock_text = xhr_mock_text.replace(divider, mock_line + divider)

    with open(xhr_mock_path, "w") as f:
        f.write(xhr_mock_text)

    return content
def add_xhr_mocking_to_index(content, destpath, **kwargs):
    """Add the XHR-mocking scripts to a page and copy or generate the script files.

    Args:
        content: HTML markup of the page.
        destpath: Directory that receives the static mock scripts and the
            generated API mock file.
        **kwargs: Ignored.

    Returns:
        The modified document as a string, with one ``<script>`` tag per mock
        script appended to ``<head>``.
    """
    soup = BeautifulSoup(content, "html.parser")

    # static resources copied as-is from the resources directory
    static_mock_scripts = ["xhr.mock.js", "plix-xhr.mock.js"]
    for script_name in static_mock_scripts:
        shutil.copy(os.path.join("resources", script_name), destpath)

    # mock file generated from downloaded API endpoints
    api_mock_script = "xhr.mock.apidownloads.js"
    download_and_mock_api_endpoint_with_xhr(
        "http://google.com/",
        destpath,
        api_mock_script,
        mocked_url="./build/././modalityAssign/js/templates/modal.info.html",
    )

    # reference every script relative to the page
    for script_name in static_mock_scripts + [api_mock_script]:
        soup.head.append(soup.new_tag("script", src="./" + script_name))

    return str(soup)
def download_and_mock_api_endpoint_with_xhr(url, destpath, filename, mocked_url=None, middleware_callbacks=None, middleware_kwargs=None):
    """Fetch ``url`` and append an ``xhr_mock.mock_conditionally`` rule for it.

    Args:
        url: URL to request (made fully qualified before the request).
        destpath: Directory containing the mock file.
        filename: Name of the mock file inside ``destpath``; it is opened in
            append mode.
        mocked_url: URL to match against requests to decide when to mock.
            Defaults to ``url``.
        middleware_callbacks: A callable or list of callables applied in order
            as ``content = callback(content, **middleware_kwargs)``.
        middleware_kwargs: Keyword arguments for the callbacks; ``None`` means
            no extra arguments.

    Returns:
        The downloaded content after middleware processing.
    """
    # mocked_url is the url to match against requests to decide when to mock; if not set, use actual url
    mocked_url = mocked_url or url

    # make the request to the URL
    content = make_request(make_fully_qualified_url(url)).content.decode()

    # if there are any middleware callbacks, apply them to the content
    if middleware_callbacks:
        if not isinstance(middleware_callbacks, list):
            middleware_callbacks = [middleware_callbacks]
        # ``**None`` raises TypeError, so treat a missing kwargs dict as empty
        callback_kwargs = middleware_kwargs or {}
        for callback in middleware_callbacks:
            content = callback(content, **callback_kwargs)

    xhr_mock_path = os.path.join(destpath, filename)
    # json.dumps emits the content as a quoted, escaped string literal
    mock_line = (
        "\n"
        'xhr_mock.mock_conditionally({{url: "{url}"}}, {content});\n'
    ).format(url=mocked_url, content=json.dumps(content))

    with open(xhr_mock_path, "a") as f:
        f.write(mock_line)

    return content
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment