Skip to content

Instantly share code, notes, and snippets.

@aydinemre
Created June 5, 2020 11:28
Show Gist options
  • Save aydinemre/62ec60c53ef552efc100981fecd50bad to your computer and use it in GitHub Desktop.
import os
from urllib.request import urlretrieve
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
# Run Chrome headless (no visible browser window) for every scraping session
# started by get_video_content below.
options = Options()
options.headless = True
def get_video_content(content_url, persistence_path):
    """Download the video embedded in *content_url* to *persistence_path*.

    Returns True when no further work is needed (the file already exists,
    the page has no video, or the download succeeded); returns False only
    when the download attempt itself failed, so the caller can retry later.
    """
    print(content_url)
    # Resume support: skip contents downloaded on a previous run.
    # BUG FIX: the original tested the *global* ``persist_path`` (set by the
    # caller's loop) instead of the ``persistence_path`` parameter — it only
    # worked because both happened to hold the same value.
    if os.path.exists(persistence_path):
        return True
    browser = webdriver.Chrome(ChromeDriverManager(log_level=0).install(),
                               chrome_options=options)
    try:
        browser.get(content_url)
        try:
            # The player is embedded in an iframe; the <video> tag's src
            # attribute holds the direct media URL.
            browser.switch_to.frame(browser.find_element_by_tag_name("iframe"))
            video_url = browser.find_element_by_tag_name('video').get_attribute('src')
        except NoSuchElementException:
            # Page has no video content ("Video Yok" = "No video").
            print("Video Yok")
            return True
        try:
            urlretrieve(video_url, persistence_path)
        except Exception as e:
            # Best-effort: report and let the caller mark this item as pending.
            print(e)
            return False
        return True
    finally:
        # FIX: always release the browser, even if browser.get() raises
        # (the original leaked the Chrome instance on that path).
        browser.close()
# Desktop-browser User-Agent so the site serves its normal HTML markup.
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/61.0.3163.100 Safari/537.36'}
# Base site URL and the course-catalogue path that gets scraped.
TR_URL = 'https://gelecegiyazanlar.turkcell.com.tr'
EGITIM_URL = '/egitimler'
# Cache of scraped sub-course links; re-runs load this instead of re-scraping.
course_links_file = 'course_links.csv'
# Build the list of sub-course links, or load it from the CSV cache so that
# re-runs skip the catalogue scrape entirely.
if os.path.exists(course_links_file):
    course_links = pd.read_csv(course_links_file)
else:
    soup = BeautifulSoup(requests.get(TR_URL + EGITIM_URL, headers=headers).content,
                         'html.parser')
    # Only the first 10 course cards are taken.
    # IDIOM FIX: a plain slice replaces the original enumerate + ``index < 10``.
    courses = [link.attrs['href']
               for link in soup.find_all(class_='m-card-sub-training__link')[:10]]
    sub_courses = []
    for course in courses:
        # CONSISTENCY FIX: send the same headers as every other request
        # (the original omitted them only here).
        soup = BeautifulSoup(requests.get(course, headers=headers).content,
                             'html.parser')
        sub_courses.extend(
            btn.attrs['href'] for btn in
            soup.find_all(class_='a-btn a-btn--full a-btn--secondary a-btn--mid'))
    course_links = pd.DataFrame(data=sub_courses, columns=['courseLinks'])
    course_links.to_csv(course_links_file, index=False)
# Root directory for all downloaded videos.
# FIX: exist_ok=True — the bare os.makedirs() crashed with FileExistsError on
# every re-run, although the script is clearly built to resume (cached CSVs,
# and the later os.makedirs(content_dir, exist_ok=True) already does this).
main_folder = 'videos'
os.makedirs(main_folder, exist_ok=True)
# Walk every sub-course page, scrape its lesson links, and download all
# pending videos.  The per-course content list and each row's isDownloaded
# flag are flushed to CSV after every attempt so an interrupted run resumes.
for course_link in course_links['courseLinks']:
    print(course_link)
    # Path segments 4 and 6 are the course and sub-course slugs.
    # NOTE(review): assumes the site's URL layout stays fixed — verify.
    parts = course_link.split('/')
    content_dir = os.path.join(main_folder, parts[4], parts[6])
    content_file = os.path.join(content_dir, 'content.csv')
    os.makedirs(content_dir, exist_ok=True)
    if os.path.exists(content_file):
        contents = pd.read_csv(content_file)
    else:
        # Scrape the lesson side-menu for (id, absolute link) pairs.
        page = BeautifulSoup(requests.get(course_link, headers=headers).content,
                             'html.parser')
        contents = pd.DataFrame(
            data=[(a.attrs.get('id'), TR_URL + a.attrs['href']) for a in
                  page.find(class_='m-aside-menu__list').find_all('a', href=True)],
            columns=['id', 'link']).drop_duplicates()
        contents['isDownloaded'] = False
        contents = contents.sort_values(by='id')
        contents.to_csv(content_file, index=False)
    # ROBUSTNESS FIX: access columns by name via itertuples() instead of
    # positionally unpacking each iterrows() Series, which silently depended
    # on the CSV column order staying (id, link, isDownloaded).
    pending = contents[contents['isDownloaded'] == 0]
    for row in pending.itertuples(index=False):
        url_id, url = row.id, row.link
        persist_path = os.path.join(content_dir,
                                    str(url_id) + "_" + url.split('/')[-1] + '.mp4')
        retVal = get_video_content(url, persist_path)
        print('{} {} {}'.format(url, persist_path, retVal))
        contents.loc[contents['link'] == url, 'isDownloaded'] = retVal
        # Persist progress immediately: an interrupted run loses at most one item.
        contents.to_csv(content_file, index=False)
beautifulsoup4==4.9.1
certifi==2020.4.5.1
chardet==3.0.4
colorama==0.4.3
configparser==5.0.0
crayons==0.3.0
idna==2.9
numpy==1.18.4
pandas==1.0.4
python-dateutil==2.8.1
pytz==2020.1
requests==2.23.0
selenium==3.141.0
six==1.15.0
soupsieve==2.0.1
urllib3==1.25.9
webdriver-manager==2.5.3
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment