This script simplifies extracting the URLs of recorded video sessions from any of the CareerERA tracks.
"""CareerERA Video Extractor | |
This script will simplify the extracting of urls for recorded video sessions from any | |
of the CareerERA tracks. | |
How to use: | |
1. Save this script to your computer. | |
2. Go to your dashboard and navigate to the "Live Virtual Class" section. | |
3. Click on the "View" link to the Day of the session that you want to get. | |
4. Under the Vides section, click on the link to the pre-recorded session. | |
5. From the popup frame, right-click in the middle and select "View frame source". | |
6. Ctrl+A to select all. | |
7. Ctrl+C to copy the source code. | |
8. Close the source tab. | |
9. Close the video frame by clicking on the X on the top right. | |
10. Scroll back up and click on the "Go back!" button. | |
11. In the same directory where you saved this script, create a new file and Ctrl+V | |
to paste the code into it. | |
12. Ctrl+S to save the file, name it temp.html. | |
13. Open up a shell or command prompt and navigate to where you have these files. | |
14. Run the script: python cera_vid_extract.py | |
15. Use a download manager to download the video. | |
Repeat for all of the sessions that you want to save. | |
My favorite method to download these is with wget and it would look something like this: | |
wget -c https://url-to-the-video.mp4 -O 'Day 1 - 20211219.m4' | |
Explanation: | |
wget: is the program I'm using. It's available on Unix/Linux and Mac's by default. Windows | |
users will have to install it - http://gnuwin32.sourceforge.net/packages/wget.htm | |
-c: Continuation flag. If it gets disconnected it will attempt to continue where it left | |
off. | |
-O: Output as. Allows you to save the file with the specified filename. | |
""" | |
import sys
from pathlib import Path
from typing import Optional

from bs4 import BeautifulSoup  # type: ignore


def soupify(source: Path) -> BeautifulSoup:
    """Parses the contents of source into a BeautifulSoup object

    Args:
        source (Path): Path object to the system file containing the HTML to parse

    Returns:
        BeautifulSoup: Data structure representing a parsed HTML document
    """
    if source.exists():  # exists is a method; without the call it was always truthy
        print(f"[ ] Parsing {source}...")
        return BeautifulSoup(source.read_text(), "html.parser")
    print(f"{source} does not exist!")
    sys.exit(1)

def extract_script_tags(soup: BeautifulSoup) -> str:
    """Extracts script tags from the BeautifulSoup object

    Args:
        soup (BeautifulSoup): Data structure representing a parsed HTML document

    Returns:
        str: The code from the script tag containing the urls of the videos
    """
    print("[ ] Extracting all script tags...")
    script_tags = soup.find_all("script")
    # The third script tag holds the player configuration with the video urls
    return script_tags[2].text

def extract_code_vars(tags: str) -> list:
    """Extracts the variables from the script tag code

    Args:
        tags (str): The code from the script tags

    Returns:
        list: A list of the key/value string representations of the variables
    """
    print("[ ] Extracting key/value pairs from code...")
    code_lines = tags.split("; ")
    return code_lines[0].split(",")

def extract_url(size: int, code: list) -> Optional[str]:
    """Extracts the url from the list of code variables

    Args:
        size (int): The width of the video that we want the url for
        code (list): List containing the variables from the script tag

    Returns:
        Optional[str]: The url for the specified video, or None if it was not found
    """
    print(f"[ ] Searching for video url with {size} width...")
    pattern = f'"width":{size}'
    url = None
    hirez = False
    for item in code:
        if hirez:
            if "url" in item:
                # The entry following the matching width holds the url
                url = item.split('":"')[1].replace('"', "")
                hirez = False
        elif pattern in item:
            hirez = True
    return url

def main():
    """Main entry point for the script"""
    file = Path("temp.html")
    soup = soupify(file)
    tags = extract_script_tags(soup)
    code = extract_code_vars(tags)
    url = extract_url(1280, code)
    # high resolution video not found, get a lower one
    if url is None:
        url = extract_url(640, code)
    print(f"[ ] Found: {url}")


if __name__ == "__main__":
    main()
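For reference, the parser assumes the player's script tag embeds JSON-like settings in which each rendition's "width" entry is followed shortly by its "url" entry. The snippet below is a minimal sketch with made-up data: the sample string and the example.com URLs are hypothetical, not actual CareerERA output. It only illustrates how the splitting in extract_code_vars and the width/url pairing in extract_url fit together, and it assumes the script above is importable from the same directory.

# Quick illustration of the parsing logic above, using made-up player data.
# The config layout and example.com urls are assumptions, not real CareerERA output.
from cera_vid_extract import extract_code_vars, extract_url

sample = (
    'var config = {"progressive":['
    '{"width":640,"url":"https://cdn.example.com/low.mp4","mime":"video/mp4"},'
    '{"width":1280,"url":"https://cdn.example.com/high.mp4","mime":"video/mp4"}]}; '
    'player.setup(config)'
)

code = extract_code_vars(sample)   # splits on "; " and then on ","
print(extract_url(1280, code))     # -> https://cdn.example.com/high.mp4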
On day 48 CareerERA did not provide a link for a high-resolution video, so my script failed. I've added an if statement to download a lower-resolution one in case that happens again.
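If other widths ever show up (or the 640 fallback also goes missing), the same idea generalizes to a loop over candidate widths. This is just a sketch of that pattern; the width list is an assumption, not something CareerERA documents.

# Sketch of a more general fallback, assuming cera_vid_extract.py is importable.
from typing import Optional

from cera_vid_extract import extract_url


def find_url(code: list, widths=(1280, 640)) -> Optional[str]:
    """Return the first url found, trying each candidate width in order."""
    for width in widths:  # widths other than 1280/640 are assumed, not confirmed
        url = extract_url(width, code)
        if url is not None:
            return url
    return None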