This script simplifies extracting the URLs of recorded video sessions from any of the CareerERA tracks.
"""CareerERA Video Extractor | |
This script will simplify the extracting of urls for recorded video sessions from any | |
of the CareerERA tracks. | |
How to use: | |
1. Save this script to your computer. | |
2. Go to your dashboard and navigate to the "Live Virtual Class" section. | |
3. Click on the "View" link to the Day of the session that you want to get. | |
4. Under the Vides section, click on the link to the pre-recorded session. | |
5. From the popup frame, right-click in the middle and select "View frame source". | |
6. Ctrl+A to select all. | |
7. Ctrl+C to copy the source code. | |
8. Close the source tab. | |
9. Close the video frame by clicking on the X on the top right. | |
10. Scroll back up and click on the "Go back!" button. | |
11. In the same directory where you saved this script, create a new file and Ctrl+V | |
to paste the code into it. | |
12. Ctrl+S to save the file, name it temp.html. | |
13. Open up a shell or command prompt and navigate to where you have these files. | |
14. Run the script: python cera_vid_extract.py | |
15. Use a download manager to download the video. | |
Repeat for all of the sessions that you want to save. | |
My favorite method to download these is with wget and it would look something like this: | |
wget -c https://url-to-the-video.mp4 -O 'Day 1 - 20211219.m4' | |
Explanation: | |
wget: is the program I'm using. It's available on Unix/Linux and Mac's by default. Windows | |
users will have to install it - http://gnuwin32.sourceforge.net/packages/wget.htm | |
-c: Continuation flag. If it gets disconnected it will attempt to continue where it left | |
off. | |
-O: Output as. Allows you to save the file with the specified filename. | |
""" | |
import sys
from pathlib import Path
from typing import Optional

from bs4 import BeautifulSoup  # type: ignore


def soupify(source: Path) -> BeautifulSoup:
    """Parses the contents of source into a BeautifulSoup object

    Args:
        source (Path): Path object to the system file containing the HTML to parse

    Returns:
        BeautifulSoup: Data structure representing a parsed HTML document
    """
    if source.exists():  # exists is a method; without the call it was always truthy
        print(f"[ ] Parsing {source}...")
        return BeautifulSoup(source.read_text(), "html.parser")
    print(f"{source} does not exist!")
    sys.exit(1)

def extract_script_tags(soup: BeautifulSoup) -> str:
    """Extracts script tags from the BeautifulSoup object

    Args:
        soup (BeautifulSoup): Data structure representing a parsed HTML document

    Returns:
        str: The code from the script tag containing the urls of the videos
    """
    print("[ ] Extracting all script tags...")
    script_tags = soup.find_all("script")
    # The third script tag holds the player configuration with the video urls
    return script_tags[2].text

def extract_code_vars(tags: str) -> list:
    """Extracts the variables from the script tag code

    Args:
        tags (str): The code from the script tags

    Returns:
        list: A list of the key/value string representations of the variables
    """
    print("[ ] Extracting key/value pairs from code...")
    code_lines = tags.split("; ")
    return code_lines[0].split(",")

def extract_url(size: int, code: list) -> Optional[str]:
    """Extracts the url from the list of code variables

    Args:
        size (int): The width of the video that we want the url for
        code (list): List containing the variables from the script tag

    Returns:
        Optional[str]: The url for the specified video, or None if it was not found
    """
    print(f"[ ] Searching for video url with {size} width...")
    pattern = f'"width":{size}'
    url = None
    hirez = False
    for item in code:
        if hirez:
            if "url" in item:
                # The entry following the matching width holds the url
                url = item.split('":"')[1].replace('"', "")
                hirez = False
        elif pattern in item:
            hirez = True
    return url

def main():
    """Main entry point for the script"""
    file = Path("temp.html")
    soup = soupify(file)
    tags = extract_script_tags(soup)
    code = extract_code_vars(tags)
    url = extract_url(1280, code)
    # high resolution video not found, get a lower one
    if url is None:
        url = extract_url(640, code)
    print(f"[ ] Found: {url}")


if __name__ == "__main__":
    main()
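For reference, the parser assumes the player's script tag embeds JSON-like settings in which each rendition's "width" entry is followed shortly by its "url" entry. The snippet below is a minimal sketch with made-up data: the sample string and the example.com URLs are hypothetical, not actual CareerERA output. It only illustrates how the splitting in extract_code_vars and the width/url pairing in extract_url fit together, and it assumes the script above is importable from the same directory.

# Quick illustration of the parsing logic above, using made-up player data.
# The config layout and example.com urls are assumptions, not real CareerERA output.
from cera_vid_extract import extract_code_vars, extract_url

sample = (
    'var config = {"progressive":['
    '{"width":640,"url":"https://cdn.example.com/low.mp4","mime":"video/mp4"},'
    '{"width":1280,"url":"https://cdn.example.com/high.mp4","mime":"video/mp4"}]}; '
    'player.setup(config)'
)

code = extract_code_vars(sample)   # splits on "; " and then on ","
print(extract_url(1280, code))     # -> https://cdn.example.com/high.mp4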
On day 48 CareerERA did not provide a link for a high-resolution video, so my script failed. I've added an if statement to download a lower-resolution one in case that happens again.
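If other widths ever show up (or the 640 fallback also goes missing), the same idea generalizes to a loop over candidate widths. This is just a sketch of that pattern; the width list is an assumption, not something CareerERA documents.

# Sketch of a more general fallback, assuming cera_vid_extract.py is importable.
from typing import Optional

from cera_vid_extract import extract_url


def find_url(code: list, widths=(1280, 640)) -> Optional[str]:
    """Return the first url found, trying each candidate width in order."""
    for width in widths:  # widths other than 1280/640 are assumed, not confirmed
        url = extract_url(width, code)
        if url is not None:
            return url
    return None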