Skip to content

Instantly share code, notes, and snippets.

@Gowee
Created March 1, 2019 17:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Gowee/d844370d5be6353e190554df5f9d48aa to your computer and use it in GitHub Desktop.
Save Gowee/d844370d5be6353e190554df5f9d48aa to your computer and use it in GitHub Desktop.
Get video links from chinesemooc.org
#!/usr/bin/env python3
from time import sleep
from collections import namedtuple
import json, re, os, csv
from urllib.parse import parse_qsl, urljoin, urlparse, urlencode
from requests import Session
from requests.cookies import create_cookie
def parse_qs(*args, **kwargs):
    """Parse a query string into a flat ``dict`` of ``str`` to ``str``.

    Thin wrapper around :func:`urllib.parse.parse_qsl`; all arguments are
    forwarded unchanged. When a key occurs more than once, the last value
    wins (a consequence of building a ``dict`` from the pair list).
    """
    return dict(parse_qsl(*args, **kwargs))


# Course name and teacher name. Note the tuple's type name (shown in its
# repr) is "CourseInfo".
Info = namedtuple("CourseInfo", ["name", "teacher"])
# One lesson of a course: its id, its ordinal label and its title.
Lesson = namedtuple("Lesson", ["id", "no", "name"])
# One item attached to a lesson, plus the id of the lesson it belongs to.
LessonWare = namedtuple("LessonWare", ["id", "name", "type", "lesson_id"])

# Values compared against ``LessonWare.type``.
VIDEO = '1'
ASGMT = '2'  # presumably "assignment" -- TODO confirm against the site
FILE = '4'  # presumably a downloadable file -- TODO confirm against the site

# Root that every request URL is joined onto.
BASE_URL = "http://www.chinesemooc.org"
class Course:
    """Client for one chinesemooc.org course, authenticated with cookies.

    Attributes:
        session: ``requests`` session carrying the supplied cookies.
        course: ``{'kvideoid': ..., 'classesid': ...}`` taken from the course
            URL; sent as query parameters on course-level requests.
    """

    # Non-greedy captures: the match stops at the first closing tag and does
    # not include whitespace that precedes it.
    _REGEX_COURSE_NAME = re.compile(r'<p class="course-info-title">\s*(.+?)\s*</p>')
    _REGEX_COURSE_TEACHER = re.compile(r'<strong class="user-name">\s*(.+?)\s*</strong>')

    def __init__(self, cookies: str, course_url: str):
        """Create the session and remember which course to query.

        Args:
            cookies: Cookie header value, i.e. ``name=value`` pairs separated
                by ``;``.
            course_url: Course URL (or bare query string) containing the
                ``kvideoid`` and ``classesid`` parameters.

        Raises:
            KeyError: If ``kvideoid`` or ``classesid`` is missing.
        """
        self.session = Session()
        self.course = self._parse_course_query(course_url)
        for name, value in self._parse_cookies(cookies):
            self.session.cookies.set_cookie(create_cookie(name=name, value=value))

    @staticmethod
    def _parse_course_query(course_url: str) -> dict:
        """Extract ``kvideoid`` and ``classesid`` from a URL or query string."""
        # Parse only the query component; feeding the whole URL to parse_qsl
        # would fuse scheme/host/path into the first key and the fragment
        # into the last value. A string without a "?" part is treated as a
        # bare query string.
        query = urlparse(course_url).query or course_url
        params = dict(parse_qsl(query))
        return {
            'kvideoid': params['kvideoid'],
            'classesid': params['classesid'],
        }

    @staticmethod
    def _parse_cookies(cookies: str) -> list:
        """Split a Cookie header value into ``(name, value)`` pairs.

        Values are kept verbatim (no percent- or plus-decoding) so they are
        sent back exactly as supplied. Segments without a name or without
        an ``=`` are skipped.
        """
        pairs = []
        for segment in cookies.split(";"):
            name, sep, value = segment.strip().partition("=")
            if sep and name:
                pairs.append((name, value))
        return pairs

    @staticmethod
    def _collect_video_urls(msg: dict) -> dict:
        """Build the ``{label: url}`` mapping from a video-watch ``msg`` payload.

        ``msg['mp4_url']`` is stored under ``'origin'``; every entry of
        ``msg['transcode_succ_list']`` contributes
        ``entry['file_type'] -> entry['file_path']``.
        """
        videos = {'origin': msg['mp4_url']}
        # NOTE(review): the shape of 'transcode_succ_list' is not visible
        # here. Accept a mapping of entries, a list of entries, or a list of
        # (key, entry) pairs -- confirm against the live API.
        transcoded = msg.get('transcode_succ_list') or ()
        if isinstance(transcoded, dict):
            entries = transcoded.values()
        else:
            entries = transcoded
        for entry in entries:
            if not isinstance(entry, dict):
                _, entry = entry
            videos[entry['file_type']] = entry['file_path']
        return videos

    def info(self) -> "Info":
        """Fetch the course progress page and scrape course name and teacher.

        Raises:
            ValueError: If either value cannot be found in the page.
        """
        url = urljoin(BASE_URL, "/kvideo.php?do=course_progress&" + urlencode(self.course))
        page = self.session.get(url).text
        name = self._REGEX_COURSE_NAME.search(page)
        teacher = self._REGEX_COURSE_TEACHER.search(page)
        if name is None or teacher is None:
            raise ValueError(
                "Could not find the course name/teacher in the course page; "
                "check the cookies and the course URL"
            )
        return Info(name.group(1), teacher.group(1))

    def lessons(self):
        """Yield a ``Lesson`` for each entry of the course list API's ``msg``."""
        url = urljoin(BASE_URL, "/api/player_course_list.php")
        data = json.loads(self.session.get(url + "?" + urlencode(self.course)).text)
        for lesson in data['msg']:
            yield Lesson(lesson['course_id'], lesson['order_num_str'], lesson['course_name'])

    def lesson_wares(self, lesson: "Lesson"):
        """Yield a ``LessonWare`` for each item listed for ``lesson``."""
        url = urljoin(BASE_URL, "/course.php?ac=course_live&op=live&")
        data = json.loads(self.session.get(url + urlencode({'course_id': lesson.id})).text)
        for ware in data['msg']['list']:
            yield LessonWare(ware['eid'], ware['subject'], ware['type'], lesson.id)

    def get_video(self, video: "LessonWare"):
        """Return a ``{label: url}`` dict for one video ware.

        The dict always has an ``'origin'`` key; other keys are the
        ``file_type`` values reported by the API.
        """
        url = urljoin(BASE_URL, "/api/course_video_watch.php")
        url = url + "?" + urlencode({'course_id': video.lesson_id, 'eid': video.id})
        data = json.loads(self.session.get(url).text)
        return self._collect_video_urls(data['msg'])
def main():
    """Interactively export a course's video links to a CSV file.

    Prompts for the cookies, the course URL and the delay (seconds, default
    5) applied after each course ware. Writes
    ``./<course name> - <teacher>.csv`` with the columns
    ``#, Lesson, Name, Origin, LD, SD, HD, Others`` and one row per video
    ware. Exits with an error message if that file already exists.
    """
    cookies = input("Cookies: ")
    curl = input("Course URL with kvideoid and classesid: ")
    interval = int(input("Interval: ") or 5)

    course = Course(cookies, curl)
    info = course.info()
    print(info)

    # Scraped names may contain path separators, which would point the output
    # outside the current directory or at a non-existent folder.
    stem = f"{info.name} - {info.teacher}"
    for separator in filter(None, (os.sep, os.altsep)):
        stem = stem.replace(separator, "_")
    fpath = f"./{stem}.csv"

    # Mode "x" refuses to overwrite an existing file atomically (no gap
    # between the existence check and the open). newline="" is what the csv
    # module requires; UTF-8 keeps non-ASCII names writable on any locale.
    try:
        f = open(fpath, "x", encoding="utf-8", newline="")
    except FileExistsError:
        raise SystemExit("File already existed: " + fpath) from None

    with f:
        w = csv.writer(f)
        w.writerow(["#", "Lesson", "Name", "Origin", "LD", "SD", "HD", "Others"])
        for lesson in course.lessons():
            print(lesson)
            for ware in course.lesson_wares(lesson):
                print(ware)
                if ware.type == VIDEO:
                    video = course.get_video(ware)
                    # Build the row as a list so names containing ", " stay
                    # in a single column.
                    row = [lesson.no, lesson.name, ware.name]
                    row.extend(video.pop(key, "") for key in ("origin", "LD", "SD", "HD"))
                    if video:
                        # Any remaining formats share the "Others" column.
                        row.append(" ".join(video.values()))
                    w.writerow(row)
                else:
                    print("(Ignored 1 course ware)")
                # Pause after every ware, video or not.
                sleep(interval)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment