Skip to content

Instantly share code, notes, and snippets.

@arunisrael
Last active December 11, 2015 14:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save arunisrael/4616677 to your computer and use it in GitHub Desktop.
Save arunisrael/4616677 to your computer and use it in GitHub Desktop.
Khan Academy Video To Exercise Mapper
Python script to generate markdown formatted list of Khan Academy videos that need exercises.
To run yourself:
1. Download khan_academy_video_library.html from Dropbox or follow the instructions in the khan_academy_video_library.txt to generate this file yourself.
2. Make sure you have Python 2.7 and BeautifulSoup4 installed (the script imports urllib2, which does not exist in Python 3).
3. Run "python KAVideoToExercise.py"
from bs4 import BeautifulSoup
import re
import urllib2
class KAVideoToExercise:
    """Report Khan Academy videos whose page has no practice link.

    The mapper parses a saved copy of the video library page, collects the
    video links it contains, fetches each video page and prints one
    markdown link per video whose page lacks a practice link.

    The constructor copies every key of ``initial_data`` onto the instance.
    The methods below read these attributes:

    * ``input_file`` -- path of the saved library HTML file.
    * ``video_filter`` -- regular expression a link's ``href`` must match.
    * ``video_class`` -- CSS class a video link must carry.
    * ``video_title_class`` -- regular expression matching the CSS class
      of the ``span`` that holds a video's title.
    * ``practice_class`` -- CSS class of the practice link on a video page.
    """

    # Prefix added to each (site-relative) video href to build a full URL.
    BASE_URL = 'http://www.khanacademy.org'
    # Tag name searched for, together with ``practice_class``, on video pages.
    PRACTICE_BUTTON_TAG = 'a'
    # Tag name searched for, together with ``video_title_class``, in a link.
    VIDEO_TITLE_TAG = 'span'

    def __init__(self, initial_data):
        """Copy ``initial_data`` onto the instance and reset working state.

        The working attributes are assigned after the copy, so keys in
        ``initial_data`` with the same names are overwritten.
        """
        for key in initial_data:
            setattr(self, key, initial_data[key])
        self.file_soup = ""
        self.video_links = []
        self.video_dictionary = {}
        self.videos_wo_exercises = {}
        self.video_list = ""

    def perform_mapping(self):
        """Run the whole pipeline: parse, collect, fetch, then print."""
        self.read_input_file()
        self.find_video_links()
        self.build_video_dictionary()
        self.check_videos_for_exercises()
        self.output_markdown()

    def read_input_file(self):
        """Parse ``input_file`` into ``file_soup``."""
        # The context manager closes the file even if parsing fails.
        with open(self.input_file) as handle:
            self.file_soup = BeautifulSoup(handle.read())

    def find_video_links(self):
        """Collect elements matching ``video_filter`` and ``video_class``."""
        self.video_links = self.file_soup.find_all(
            href=re.compile(self.video_filter), class_=self.video_class)

    def build_video_dictionary(self):
        """Map each video title to its href in ``video_dictionary``.

        Links that share a title overwrite one another; the last one wins.
        """
        title_pattern = re.compile(self.video_title_class)
        for link in self.video_links:
            title_tag = link.find(self.VIDEO_TITLE_TAG, class_=title_pattern)
            title = title_tag.contents[0].string
            self.video_dictionary[title] = link['href']

    def check_videos_for_exercises(self):
        """Fetch each video page and record those without a practice link.

        Videos whose page has no ``PRACTICE_BUTTON_TAG`` element carrying
        ``practice_class`` are stored in ``videos_wo_exercises`` as
        ``title -> full URL``.  A video whose page answers with an HTTP
        error is skipped: it is neither parsed nor reported.
        """
        for title in self.video_dictionary:
            full_url = self.BASE_URL + self.video_dictionary[title]
            try:
                response = urllib2.urlopen(full_url)
            except urllib2.HTTPError:
                # Without a page there is nothing to inspect.  Falling
                # through would parse an unbound name (first iteration) or
                # the previous video's HTML (later iterations).
                continue
            try:
                raw = response.read()
            finally:
                response.close()
            soup = BeautifulSoup(raw)
            has_practice = soup.find(
                self.PRACTICE_BUTTON_TAG, class_=self.practice_class)
            if has_practice is None:
                self.videos_wo_exercises[title] = full_url

    def output_markdown(self):
        """Print one ``[title](url)`` line per recorded video, sorted by title."""
        for title in sorted(self.videos_wo_exercises):
            line = "[" + title + "]" + "(" + self.videos_wo_exercises[title] + ")"
            print(line.encode('utf-8'))
# Settings handed to KAVideoToExercise; each one becomes an instance
# attribute of the same (lower-case) name.
INPUT_FILE = 'khan_academy_video_library.html'
VIDEO_FILTER = '/math'
VIDEO_CLASS = 'vid-progress'
VIDEO_TITLE_CLASS = 'progress-title'
PRACTICE_CLASS = 'practice'

# Build the mapper from keyword-style settings and run the full pipeline.
mapper = KAVideoToExercise(dict(
    input_file=INPUT_FILE,
    video_filter=VIDEO_FILTER,
    video_class=VIDEO_CLASS,
    video_title_class=VIDEO_TITLE_CLASS,
    practice_class=PRACTICE_CLASS,
))
mapper.perform_mapping()
# The dropbox link below contains the generated HTML source for https://www.khanacademy.org/library
# To get the HTML source yourself
# - Open https://www.khanacademy.org/library in Google Chrome
# - Right click anywhere on the page and select 'Inspect Element'
# - Right click on the <html> node at the top of the Elements Browser and select 'Copy as HTML'
# - Paste into your favorite text editor and save
https://dl.dropbox.com/u/46010896/khan_academy_video_library.html
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment