Skip to content

Instantly share code, notes, and snippets.

@simonseo
Created March 18, 2020 13:25
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save simonseo/56a794138a08164f7e0a052ed7d4f854 to your computer and use it in GitHub Desktop.
Save simonseo/56a794138a08164f7e0a052ed7d4f854 to your computer and use it in GitHub Desktop.
패스트캠퍼스 강좌를 다운로드 받기 위한 스크레이퍼
#!/usr/bin/python
# -*- coding: utf-8 -*-
# @File Name: scrapper.py
# @Created: 2020-03-18 02:57:12 Simon Myunggun Seo (simon.seo@nyu.edu)
# @Updated: 2020-03-18 17:08:14 Simon Seo (simon.seo@nyu.edu)
import sys, time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from contextlib import contextmanager
import requests, json
# Credentials typed into the "email" / "password" fields by authenticate().
# NOTE(review): these are stored in plain text in the source file.
USERNAME = "blah@gmail.com"
PASSWORD = "mypassword"
# Page that run() opens first.
COURSE_URL = "https://www.fastcampus.co.kr/courses/200328/clips/4591"
# Number of "next video -> fetch info -> download" iterations performed by run().
VIDEO_COUNT = 5
@contextmanager
def headlessDriver():
    """Yield a Chrome WebDriver and shut it down when the ``with`` block exits.

    The driver is configured with a 10-second implicit wait. Headless mode is
    currently disabled (the option below is commented out), so a visible
    browser window is opened.

    Yields:
        The ``webdriver.Chrome`` instance.
    """
    options = webdriver.ChromeOptions()
    # options.add_argument('headless')
    driver = webdriver.Chrome(chrome_options=options)
    try:
        driver.implicitly_wait(10)
        yield driver
    finally:
        # Runs on both normal exit and when the body raises, so the browser
        # is never left behind. quit() ends the whole session rather than
        # only closing the current window.
        driver.quit()
@contextmanager
def point_to(driver, url):
    """Temporarily navigate ``driver`` to ``url`` and go back on exit.

    On entry the driver loads ``url`` and its implicit wait is set to 4
    seconds. On exit, including when the body raises, the driver navigates
    one step back in its history and the implicit wait is set to 4 seconds
    again.

    Args:
        driver: WebDriver instance to navigate.
        url: Address to load for the duration of the ``with`` block.

    Yields:
        The same ``driver`` object, now showing ``url``.
    """
    driver.get(url)
    driver.implicitly_wait(4)
    try:
        yield driver
    finally:
        # Restore the previous page even if the body failed.
        driver.back()
        driver.implicitly_wait(4)
def authenticate(driver):
    """Fill in the login form on the current page and submit it.

    The module-level USERNAME and PASSWORD are typed into the inputs named
    "email" and "password" (in that order), Enter is sent to the password
    input, and the function then sleeps for five seconds.

    Args:
        driver: WebDriver instance currently showing the login form.
    """
    # Normal Auth
    print("Authenticating Fastcampus")
    field = None
    for input_name, value in (("email", USERNAME), ("password", PASSWORD)):
        field = driver.find_element_by_name(input_name)
        field.clear()
        field.send_keys(value)
    # `field` is the password input at this point; Enter presumably submits
    # the login form.
    field.send_keys(Keys.RETURN)
    time.sleep(5)
def _current_title(driver):
    """Return the text of the last element nested in the lecture-hall title header."""
    header = driver.find_elements_by_class_name('fco-lecture-hall-header__title')[0]
    return header.find_elements_by_css_selector("*")[-1].text


def get_video_info(driver: webdriver.Chrome, timeout=60.0, poll_interval=1.0):
    """Return ``(title, video_src)`` for the video on the current page.

    The title is re-read until it no longer equals the placeholder text
    "재생중인 강의가 없습니다.". The driver is then pointed at the ``src`` URL of
    the first "fco-kollus-video__viewer" element (via ``point_to``) to read
    the ``src`` attribute of the element with id "kollus_player_html5_api".

    Args:
        driver: WebDriver instance showing the lecture page.
        timeout: Maximum number of seconds to wait for a real title.
        poll_interval: Seconds to sleep between title checks.

    Returns:
        Tuple of the title text and the video element's ``src`` attribute.

    Raises:
        TimeoutError: If the placeholder title is still shown after
            ``timeout`` seconds.
    """
    print("Retrieving Video Info from {}".format(driver.current_url))
    # get title here
    deadline = time.monotonic() + timeout
    title = _current_title(driver)
    while title == "재생중인 강의가 없습니다.":
        if time.monotonic() >= deadline:
            raise TimeoutError(
                "lecture title was not available after {} seconds".format(timeout)
            )
        # Actually pause between polls; implicitly_wait() only changes the
        # element-lookup timeout and returns immediately.
        time.sleep(poll_interval)
        title = _current_title(driver)
    iframe = driver.find_elements_by_class_name("fco-kollus-video__viewer")[0]
    iframe_src = iframe.get_attribute('src')
    print(iframe_src)
    with point_to(driver, iframe_src):
        # Look the player element up once and reuse it.
        player = driver.find_element_by_id('kollus_player_html5_api')
        print(player)
        video_src = player.get_attribute('src')
    return title, video_src
def load_next_video(driver):
    """Click the last control inside the player's play-control container.

    The target is the last descendant of the first element with class
    "fco-video-controller__play-control". Afterwards the driver's implicit
    wait is set to 4 seconds.

    Args:
        driver: WebDriver instance showing the lecture page.
    """
    play_control = driver.find_elements_by_class_name("fco-video-controller__play-control")[0]
    descendants = play_control.find_elements_by_css_selector("*")
    next_button = descendants[-1]
    # click button even if it's hidden
    actions = webdriver.ActionChains(driver)
    actions.move_to_element(next_button)
    actions.click(next_button)
    actions.perform()
    driver.implicitly_wait(4)
def download(file_name, link):
    """Download ``link`` to ``file_name``, drawing a progress bar on stdout.

    The body is always streamed to disk in 4 KiB chunks. A 50-character
    progress bar is drawn only when the response has a positive
    Content-Length header.

    Args:
        file_name: Path of the file to create or overwrite.
        link: URL to fetch.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    print("Downloading video \"{}\"".format(file_name))
    # The ``with`` closes the streamed response, releasing its connection,
    # even if writing fails part-way through.
    with requests.get(link, stream=True) as response:
        # Do not save an HTTP error page as if it were the video.
        response.raise_for_status()
        # A missing header counts as "size unknown" (0).
        total_length = int(response.headers.get('content-length') or 0)
        bar_width = 50
        drawn = -1
        received = 0
        # Open the file only after the request succeeded, so a failed request
        # does not leave an empty file behind.
        with open(file_name, "wb") as f:
            for data in response.iter_content(chunk_size=4096):
                f.write(data)
                received += len(data)
                if total_length <= 0:
                    continue  # size unknown: nothing to draw a bar against
                # Clamp: the decoded body can be longer than Content-Length.
                done = min(bar_width, int(bar_width * received / total_length))
                if done != drawn:
                    # Redraw only when the bar actually changes.
                    sys.stdout.write("\r[%s%s]" % ('=' * done, ' ' * (bar_width - done)))
                    sys.stdout.flush()
                    drawn = done
    print("\nFinished Downloading video \"{}\"".format(file_name))
def _safe_file_name(title):
    """Return ``title`` with characters that are unsafe in file names replaced by "_"."""
    unsafe = '\\/:*?"<>|'
    return "".join("_" if ch in unsafe or ord(ch) < 32 else ch for ch in title)


def run():
    """Open COURSE_URL, log in if needed, and download VIDEO_COUNT videos.

    Each iteration clicks the "next" control, reads the video's title and
    source URL, and saves the video as "<title>.mp4". Characters that are not
    valid in file names are replaced with "_" in the title.

    Any exception closes the browser window and is then re-raised.
    """
    with headlessDriver() as driver:
        try:
            # log in
            driver.get(COURSE_URL)
            # The login form is filled in when the page title contains this text.
            if "패스트캠퍼스" in driver.title:
                authenticate(driver)
            # download
            for _ in range(VIDEO_COUNT):
                load_next_video(driver)
                title, video_src = get_video_info(driver)
                # Page titles may contain "/" or ":" which cannot be used
                # verbatim in a file name.
                download(_safe_file_name(title) + ".mp4", video_src)
        except Exception:
            driver.close()
            # Bare raise keeps the original traceback intact.
            raise
# Start scraping only when this file is executed as a script, not on import.
if __name__ == "__main__":
    run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment