Skip to content

Instantly share code, notes, and snippets.

@bachhuberdesign
Last active June 28, 2021 19:39
Show Gist options
  • Save bachhuberdesign/0dfc11cb4a5f959d65dcb914ee7c4dcf to your computer and use it in GitHub Desktop.
Save bachhuberdesign/0dfc11cb4a5f959d65dcb914ee7c4dcf to your computer and use it in GitHub Desktop.
Scraper for Laracasts videos -- requires active Laracasts subscription (see comments in code).
######################################################
# Laracasts Video Scraper #
# #
# Author: Eric Bachhuber #
# #
# Video files will be output to folder 'Laracasts' #
# wherever the script is run from. #
# #
# Possibly violates the terms of service, so use #
# at your own risk. #
# #
# Required: Active Laracasts subscription #
# Required: Python 3 #
# Required: BeautifulSoup (install with pip3) #
# Required: requests (install with pip3) #
######################################################
from bs4 import BeautifulSoup
import urllib.request
import json
import requests
import re
import os
# Version string for this scraper script (not referenced by the code visible in this file).
SCRIPT_VERSION = '0.0.1'
def main():
    """Download every episode of every series listed by the Laracasts API.

    The series list is fetched from the public API, then episode pages are
    requested one episode number at a time until Laracasts answers with a
    redirect (its response to an episode number that does not exist).
    Each video is saved as ``Laracasts/<series>/Episode <n> - <title>.mp4``
    relative to the current working directory. Episodes whose target file
    already exists are skipped.

    Raises:
        requests.HTTPError: If the series API, an episode page or a video
            download answers with a 4xx/5xx status.
        RuntimeError: If an episode page contains no ``download-link``
            attribute (typically an invalid or expired session cookie).
    """
    # Login to Laracasts via web and paste your laravel_session cookie here (inspect request header to find)
    # Must be logged into a Laracasts account with an active subscription
    cookies = {
        'laravel_session': 'PASTE_YOUR_LARAVEL_SESSION_HERE'
    }
    # Without a timeout, requests can block forever on a stalled connection.
    request_timeout = 60
    # Videos are streamed to disk in chunks instead of being held in memory.
    chunk_size = 1024 * 1024
    redirect_statuses = (301, 302, 303, 307, 308)
    link_marker = 'download-link="'

    # Get list of all categories with series via API (auth not required for this API call)
    # Only non-archived series are returned from this endpoint.
    # As of 10/9/2019, there are 79 "current" series and 19 "archived" series
    series_response = requests.get('https://laracasts.com/api/series', timeout=request_timeout)
    series_response.raise_for_status()
    series_json = series_response.json()

    for category in series_json:
        # As of 10/9/2019, Laracasts has 5 categories:
        # Laravel, PHP, Testing, JavaScript, and Tooling
        print("\nDownloading category: " + category)

        for series in series_json[category]:
            series_title = sanitize_for_file_name(series['title'])
            slug = series['slug']
            series_dir = os.path.join('Laracasts', series_title)
            print("\nPreparing to download series: " + series_title)

            episode_counter = 1
            while True:
                episode_response = requests.get(
                    url='https://laracasts.com/series/' + slug + "/episodes/" + str(episode_counter),
                    cookies=cookies,
                    allow_redirects=False,  # If an invalid episode number is used, Laracasts redirects to the series landing page
                    timeout=request_timeout
                )
                if episode_response.status_code in redirect_statuses:
                    # Redirected, no remaining episodes for this series.
                    break
                episode_response.raise_for_status()

                page_html = episode_response.text
                if link_marker not in page_html:
                    # Fail with an actionable message instead of a bare IndexError.
                    raise RuntimeError(
                        'No download link found for episode ' + str(episode_counter)
                        + ' of series ' + series_title
                        + '. Check that the laravel_session cookie belongs to an'
                        + ' account with an active subscription.'
                    )
                download_link = page_html.split(link_marker, 1)[1].split('"', 1)[0]

                # Grab episode title from <title> tag (tolerate a missing/empty tag)
                soup = BeautifulSoup(page_html, features="html.parser")
                raw_title = soup.title.string if soup.title is not None else None
                episode_name = sanitize_for_file_name((raw_title or '').replace(series_title, ''))
                episode_path = os.path.join(
                    series_dir,
                    "Episode " + str(episode_counter) + " - " + episode_name + ".mp4"
                )

                if os.path.exists(episode_path):
                    # Episode already exists, skip download
                    print('Episode ' + episode_name + ' already exists, skipping.')
                else:
                    # Download episode and write to file
                    print("Downloading episode " + str(episode_counter) + ": " + episode_name)
                    os.makedirs(series_dir, exist_ok=True)

                    # Write to a temporary name first so an interrupted download
                    # is never mistaken for a finished episode on the next run.
                    partial_path = episode_path + '.part'
                    with requests.get(
                        url="https://www.laracasts.com" + download_link,
                        cookies=cookies,
                        allow_redirects=True,
                        stream=True,
                        timeout=request_timeout
                    ) as download_response:
                        # Do not save an HTML error page as an .mp4 file.
                        download_response.raise_for_status()
                        with open(partial_path, 'wb') as video_file:
                            for chunk in download_response.iter_content(chunk_size=chunk_size):
                                video_file.write(chunk)
                    os.replace(partial_path, episode_path)

                episode_counter += 1
def sanitize_for_file_name(toSanitize):
    """Return *toSanitize* with file-name-hostile punctuation removed.

    Every backslash, slash, asterisk, question mark, colon, double quote,
    angle bracket, pipe and apostrophe is deleted, after which leading and
    trailing whitespace is trimmed.
    """
    forbidden_chars = '\\/*?:"<>|\''
    # A three-argument maketrans builds a table that deletes the listed chars.
    deletion_table = str.maketrans('', '', forbidden_chars)
    cleaned = toSanitize.translate(deletion_table)
    return cleaned.strip()
# Start the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment