Skip to content

Instantly share code, notes, and snippets.

@hrishikeshrt
Last active September 7, 2020 21:17
Show Gist options
  • Save hrishikeshrt/dc0ff37667e46f72fff83a00a54f9e5c to your computer and use it in GitHub Desktop.
Save hrishikeshrt/dc0ff37667e46f72fff83a00a54f9e5c to your computer and use it in GitHub Desktop.
Download course contents from sanskritfromhome.in
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 11 17:25:53 2019
Updated on Sun Aug 30 17:23:43 2020
@author: Hrishikesh Terdalkar
Requirements:
pip install requests
pip install beautifulsoup4
pip install requests-downloader
"""
import re
import os
import sys
import stat
import getpass
import argparse
import requests
from bs4 import BeautifulSoup
from requests_downloader.downloader import download as download_file
###############################################################################
SERVER = 'www.sanskritfromhome.in'
###############################################################################
def extract_course_id(course_url):
try:
not_url = '/' not in course_url
sfh_pattern = '(http(s|)://|)(www.|)sanskritfromhome.in/course/([^/]*)'
sfh_match = re.match(sfh_pattern, course_url)
if not_url:
course_id = course_url
else:
if sfh_match:
course_id = sfh_match.group(len(sfh_match.groups()))
else:
print("Invalid course URL.")
return None
except Exception:
print("Invalid course URL.")
return None
course_id = course_id.split('#')[0]
return course_id
###############################################################################
class Vyoma():
home_url = f'https://{SERVER}'
login_url = f'https://{SERVER}/wp-admin/admin-ajax.php'
def __init__(self, username, password, download_dir=None):
'''
Vyoma downloader class
Parameters
----------
username : str
Username.
password : str
Password.
download_dir : str, optional
Location in which the course content will be downloaded.
The default is None.
'''
self.session = requests.Session()
self.username = username
self.password = password
self.logged_in = False
self.links = {}
self.descriptions = {}
# download directory
self.download_dir = download_dir
if not download_dir:
home_dir = os.path.expanduser('~')
vyoma_dir = os.path.join(home_dir, 'vyoma', username)
self.download_dir = vyoma_dir
if not os.path.isdir(self.download_dir):
os.makedirs(self.download_dir)
def get_course_url(self, course_id):
'''Build course URL from course ID'''
return f'{self.home_url}/course/{course_id}'
def login(self):
'''Login to sanskritfromhome.in'''
r = self.get(self.home_url)
# check if already logged in
if 'Sign Out' in r.text and 'Sign In' not in r.text:
self.logged_in = True
return self.logged_in
data = {
'user_login': self.username,
'user_password': self.password,
'user_action': 'login_user',
'action': 'themex_update_user'
}
soup = BeautifulSoup(r.text, 'html.parser')
nonce = soup.find('input', attrs={'name': 'nonce'})['value']
data['nonce'] = nonce
r = self.post(self.login_url, data=data)
# check if it succeeded
r = self.get(self.home_url)
self.logged_in = ('Sign Out' in r.text and 'Sign In' not in r.text)
return self.logged_in
def fetch_course_page(self, course_id):
'''Get contents of the course page
Parameters
----------
course_id : str
Course ID.
Raises
------
RuntimeError
If called without a valid login.
Returns
-------
html
HTML content of the course page.
'''
self.login()
if not self.logged_in:
raise RuntimeError("Requires a sign-in.")
r = self.get(self.get_course_url(course_id))
return r.text
def fetch_course_links(self, course_id, html=''):
'''
Fetch all links of downloadable content for a course.
Parameters
----------
course_id : str
Course ID.
html : str, optional
Instead of a course ID, a pre-downloaded HTML can be specified.
The default is ''.
Returns
-------
links : dict
Links to audio, video and documents from the course.
'''
if not html:
html = self.fetch_course_page(course_id)
soup = BeautifulSoup(html, 'html.parser')
links = {}
links['audio'] = soup.find_all('a', attrs={'class': 'audio'})
links['video'] = soup.find_all('a', attrs={'class': 'video'})
links['document'] = soup.find_all('a', attrs={'class': 'document'})
self.links[course_id] = links
return self.links[course_id]
def fetch_course_description(self, course_id, html=''):
'''
Fetch course description
Parameters
----------
course_id : str
Course ID.
html : str, optional
Instead of a course ID, a pre-downloaded HTML can be specified.
The default is ''.
Returns
-------
description : str
HTML description of the course
'''
if not html:
html = self.fetch_course_page(course_id)
soup = BeautifulSoup(html, 'html.parser')
description_div = soup.find('div', class_='course-description')
self.descriptions[course_id] = str(description_div)
return self.descriptions[course_id]
def download_course_content(self, course_id,
document=True, audio=True, video=True):
'''
Download course content (audios, documents and video-links)
Parameters
----------
course_id : str
Course ID.
document : bool, optional
If True, download document links.
The default is True.
audio : bool, optional
If True, download audio links.
The default is True.
video : bool, optional
If True, download video links.
The default is True.
Returns
-------
status : bool
True if the download funcion completed successfully.
Does not mean that all the files were downloaded successfully.
'''
html = self.fetch_course_page(course_id)
course_dir = os.path.join(self.download_dir, course_id)
if not os.path.isdir(course_dir):
os.mkdir(course_dir)
if course_id not in self.descriptions:
self.fetch_course_description(course_id, html=html)
with open(os.path.join(course_dir, 'description.html'), 'w') as f:
f.write(self.descriptions[course_id])
print("Saved course description.")
if course_id not in self.links:
self.fetch_course_links(course_id, html=html)
links = self.links[course_id]
if video:
video_file = os.path.join(course_dir, 'video_links.txt')
video_links = [link['href'] for link in links['video']]
with open(video_file, 'w') as f:
f.write('\n'.join(video_links))
download = []
if document:
download.append('document')
if audio:
download.append('audio')
all_skipped_links = []
for dl in download:
# save links
dl_links_file = os.path.join(course_dir, f'{dl}_links.txt')
with open(dl_links_file, 'w') as f:
f.write('\n'.join([link['href'] for link in links[dl]]))
dl_dir = os.path.join(course_dir, dl)
if not os.path.isdir(dl_dir):
os.mkdir(dl_dir)
downloaded = 0
skipped = 0
skipped_links = []
print(f"Total {dl.title()}s:", len(links[dl]))
for link in links[dl]:
success = True
try:
download_file(link['href'], download_dir=dl_dir,
session=self.session, verbose=True)
except Exception as e:
print(e)
success = False
if success:
downloaded += 1
else:
skipped += 1
skipped_links.append(link['href'])
print(f"Skipping {link['href']}")
print(f"Successfully downloaded {downloaded} {dl} files.")
if skipped:
print(f"Could not download {skipped} {dl} files.")
print(f"Skipped {dl.title()} URLs: {skipped_links}")
if all_skipped_links:
skipped_file = os.path.join(course_dir, 'skipped_links.txt')
with open(skipped_file, 'w') as f:
f.write('\n'.join(all_skipped_links))
return True
def download_course_audios(self, course_id):
'''Wrapper to download only audio links'''
return self.download_course_content(course_id, audio=True,
document=False, video=False)
def download_course_video_links(self, course_id):
'''Wrapper to download only video links'''
return self.download_course_content(course_id, video=True,
document=False, audio=False)
def download_course_documents(self, course_id):
'''Wrapper to download only document links'''
return self.download_course_content(course_id, document=True,
audio=False, video=False)
def set_user(self, username):
self.username = username
def set_pass(self, password):
self.password = password
def get(self, *args, **kwargs):
return self.session.get(*args, **kwargs)
def post(self, *args, **kwargs):
return self.session.post(*args, **kwargs)
def __repr__(self):
return f"Vyoma(username={self.username}, logged_in={self.logged_in})"
###############################################################################
def main():
desc = "Download course contents from 'sanskritfromhome.in'."
p = argparse.ArgumentParser(description=desc)
p.add_argument("course-url", help="URL of the relevant course")
p.add_argument("-v", "--video", action='store_true',
help="Download video links only")
p.add_argument("-a", "--audio", action='store_true',
help="Download audios only")
p.add_argument("-d", "--document", action='store_true',
help="Download documents only")
p.add_argument("-o", "--output", default=None,
help="Path of the directory to download content to.")
p.add_argument("-u", "--username", default=None)
p.add_argument("-p", "--password", default=None)
args = vars(p.parse_args())
# download options
course_url = args['course-url']
audio = args['audio']
video = args['video']
document = args['document']
output = args['output']
# credentials
config = {}
home_dir = os.path.expanduser('~')
config_file = os.path.join(home_dir, '.vyoma.cnf')
if os.path.isfile(config_file):
with open(config_file) as f:
lines = f.read().split('\n')
for line in lines:
if line.strip():
key, value = line.split('=')
config[key.strip()] = value.strip()
username = (
config.get('username') or
os.environ.get('VYOMA_USER') or
args['username']
)
password = (
config.get('password') or
os.environ.get('VYOMA_PASS') or
args['password']
)
manual = not (username and password)
if not username:
username = input('Username: ')
if not password:
password = getpass.getpass('Password: ')
username = username.strip()
password = password.strip()
# action
vyoma = Vyoma(username=username, password=password, download_dir=output)
if not vyoma.login():
print("Error: could not sign in.")
return 1
if manual:
answer = input("Save credentials for future use? (Y/n)")
if not answer or answer.lower()[0] == 'y':
with open(config_file, 'w') as f:
f.write(f"username = {username}\n"
f"password = {password}")
print("Credentials saved!")
os.chmod(config_file, stat.S_IREAD + stat.S_IWRITE)
if vyoma.logged_in:
course_id = extract_course_id(course_url)
vyoma.fetch_course_links(course_id)
if not(any([audio, video, document])):
vyoma.download_course_content(course_id)
else:
vyoma.download_course_content(course_id,
document=document,
audio=audio,
video=video)
return 0
###############################################################################
if __name__ == '__main__':
sys.exit(main())
###############################################################################
@hrishikeshrt
Copy link
Author

Separated the download logic into a separate function download_file().
Added support for Google Drive links.

@hrishikeshrt
Copy link
Author

Added support for resuming partial downloads and ignoring the files that have already been downloaded.
Downloads course-description too now.
Can now accept various forms of URLs (with or without http, www etc) or can simply provide course ID (Part after /course/ from the URL)

@hrishikeshrt
Copy link
Author

Release as pip installable package.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment