# bachhuberdesign/laracasts_scraper.py

## laracasts_scraper.py
######################################################
# Laracasts Video Scraper                            #
#                                                    #
# Author: Eric Bachhuber                             #
#                                                    #
# Video files will be output to folder 'Laracasts'   #
# wherever the script is run from.                   #
#                                                    #
# Possibly violates the terms of service, so use     #
# at your own risk.                                  #
#                                                    #
# Required: Active Laracasts subscription            #
# Required: Python 3                                 #
# Required: BeautifulSoup (install with pip3)        #
# Required: requests (install with pip3)             #
######################################################
from bs4 import BeautifulSoup
import urllib.request
import json
import requests
import re
import os


SCRIPT_VERSION = '0.0.1'


def main():
    """Download every episode of every current Laracasts series.

    The series list comes from the public ``/api/series`` endpoint. Episode
    pages are then requested one at a time (1, 2, 3, ...) until Laracasts
    answers with a redirect, which marks the end of the series. Each video is
    saved as ``Laracasts/<series>/Episode <n> - <name>.mp4`` relative to the
    current working directory; episodes whose file already exists are skipped.

    Problems with a single episode (no download link on the page, failed
    download) are reported on stdout and the episode is skipped instead of
    aborting the whole run.
    """
    # Login to Laracasts via web and paste your laravel_session cookie here (inspect request header to find)
    # Must be logged into a Laracasts account with an active subscription
    cookies = {
        'laravel_session': 'PASTE_YOUR_LARAVEL_SESSION_HERE'
    }

    # Get list of all categories with series via API (auth not required for this API call)
    # Only non-archived series are returned from this endpoint.
    # As of 10/9/2019, there are 79 "current" series and 19 "archived" series
    seriesJson = requests.get('https://laracasts.com/api/series').json()

    for category in seriesJson:
        # As of 10/9/2019, Laracasts has 5 categories:
        # Laravel, PHP, Testing, JavaScript, and Tooling
        print("\nDownloading category: " + category)

        for series in seriesJson[category]:
            seriesTitle = sanitize_for_file_name(series['title'])
            slug = series['slug']
            seriesDir = os.path.join('Laracasts', seriesTitle)

            print("\nPreparing to download series: " + seriesTitle)

            # Incremented at the top of the loop so that `continue` can never
            # re-request the same episode.
            episodeCounter = 0
            while True:
                episodeCounter += 1

                episodeRequest = requests.get(
                    url='https://laracasts.com/series/' + slug + "/episodes/" + str(episodeCounter),
                    cookies=cookies,
                    allow_redirects=False  # If an invalid episode number is used, Laracasts redirects to the series landing page
                )

                if episodeRequest.status_code == 302:
                    # Redirected, no remaining episodes for this series.
                    break

                if episodeRequest.status_code != 200:
                    # Any other status carries no episode page to parse; stop
                    # this series rather than crash or loop on error pages.
                    print("Unexpected HTTP status " + str(episodeRequest.status_code)
                          + " for episode " + str(episodeCounter) + ", stopping this series.")
                    break

                # The download URL is embedded in the page as a download-link="..." attribute.
                linkParts = episodeRequest.text.split('download-link="', 1)
                if len(linkParts) < 2:
                    print("No download link found for episode " + str(episodeCounter)
                          + " (is the session cookie valid?), skipping.")
                    continue
                downloadLink = linkParts[1].split('"', 1)[0]

                # Grab episode title from <title> tag (empty if the page has none)
                soup = BeautifulSoup(episodeRequest.text, features="html.parser")
                pageTitle = soup.title.string if soup.title is not None else None
                episodeName = sanitize_for_file_name((pageTitle or '').replace(seriesTitle, ''))

                episodePath = os.path.join(
                    seriesDir,
                    "Episode " + str(episodeCounter) + " - " + episodeName + ".mp4"
                )

                if os.path.exists(episodePath):
                    # Episode already exists, skip download
                    print('Episode ' + episodeName + ' already exists, skipping.')
                    continue

                # Download episode and write to file
                print("Downloading episode " + str(episodeCounter) + ": " + episodeName)
                os.makedirs(seriesDir, exist_ok=True)

                if not _download_to_file("https://www.laracasts.com" + downloadLink, cookies, episodePath):
                    print("Failed to download episode " + str(episodeCounter) + ", skipping.")


def _download_to_file(url, cookies, destination):
    """Stream `url` into the file `destination`.

    The body is written in chunks to ``destination + '.part'`` and renamed to
    `destination` only once fully written, so an interrupted download is not
    mistaken for a finished episode on the next run.

    Returns True on success, False if the server answered with an HTTP error
    status (in which case nothing is written).
    """
    partialPath = destination + '.part'

    with requests.get(url=url, cookies=cookies, allow_redirects=True, stream=True) as response:
        if not response.ok:
            return False

        with open(partialPath, 'wb') as videoFile:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                videoFile.write(chunk)

    os.replace(partialPath, destination)
    return True


def sanitize_for_file_name(toSanitize):
    """Return `toSanitize` made safe for use as a file or directory name.

    Backslash, slash, asterisk, question mark, colon, double quote, angle
    brackets, pipe and single quote are deleted, then leading and trailing
    whitespace is stripped.
    """
    forbidden_chars = '\\/*?:"<>|\''
    deletion_table = str.maketrans('', '', forbidden_chars)
    cleaned = toSanitize.translate(deletion_table)
    return cleaned.strip()


# Run the scraper only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
	######################################################
	# Laracasts Video Scraper #
	# #
	# Author: Eric Bachhuber #
	# #
	# Video files will be output to folder 'Laracasts' #
	# wherever the script is run from. #
	# #
	# Possibly violates the terms of service, so use #
	# at your own risk. #
	# #
	# Required: Active Laracasts subscription #
	# Required: Python 3 #
	# Required: BeautifulSoup (install with pip3) #
	######################################################
	from bs4 import BeautifulSoup
	import urllib.request
	import json
	import requests
	import re
	import os


	SCRIPT_VERSION = '0.0.1'


	def main():
	    """Download every episode of every current Laracasts series.

	    The series list comes from the public ``/api/series`` endpoint. Episode
	    pages are then requested one at a time (1, 2, 3, ...) until Laracasts
	    answers with a redirect, which marks the end of the series. Each video is
	    saved as ``Laracasts/<series>/Episode <n> - <name>.mp4`` relative to the
	    current working directory; episodes whose file already exists are skipped.

	    Problems with a single episode (no download link on the page, failed
	    download) are reported on stdout and the episode is skipped instead of
	    aborting the whole run.
	    """
	    # Login to Laracasts via web and paste your laravel_session cookie here (inspect request header to find)
	    # Must be logged into a Laracasts account with an active subscription
	    cookies = {
	        'laravel_session': 'PASTE_YOUR_LARAVEL_SESSION_HERE'
	    }

	    # Get list of all categories with series via API (auth not required for this API call)
	    # Only non-archived series are returned from this endpoint.
	    # As of 10/9/2019, there are 79 "current" series and 19 "archived" series
	    seriesJson = requests.get('https://laracasts.com/api/series').json()

	    for category in seriesJson:
	        # As of 10/9/2019, Laracasts has 5 categories:
	        # Laravel, PHP, Testing, JavaScript, and Tooling
	        print("\nDownloading category: " + category)

	        for series in seriesJson[category]:
	            seriesTitle = sanitize_for_file_name(series['title'])
	            slug = series['slug']
	            seriesDir = os.path.join('Laracasts', seriesTitle)

	            print("\nPreparing to download series: " + seriesTitle)

	            # Incremented at the top of the loop so that `continue` can never
	            # re-request the same episode.
	            episodeCounter = 0
	            while True:
	                episodeCounter += 1

	                episodeRequest = requests.get(
	                    url='https://laracasts.com/series/' + slug + "/episodes/" + str(episodeCounter),
	                    cookies=cookies,
	                    allow_redirects=False  # If an invalid episode number is used, Laracasts redirects to the series landing page
	                )

	                if episodeRequest.status_code == 302:
	                    # Redirected, no remaining episodes for this series.
	                    break

	                if episodeRequest.status_code != 200:
	                    # Any other status carries no episode page to parse; stop
	                    # this series rather than crash or loop on error pages.
	                    print("Unexpected HTTP status " + str(episodeRequest.status_code)
	                          + " for episode " + str(episodeCounter) + ", stopping this series.")
	                    break

	                # The download URL is embedded in the page as a download-link="..." attribute.
	                linkParts = episodeRequest.text.split('download-link="', 1)
	                if len(linkParts) < 2:
	                    print("No download link found for episode " + str(episodeCounter)
	                          + " (is the session cookie valid?), skipping.")
	                    continue
	                downloadLink = linkParts[1].split('"', 1)[0]

	                # Grab episode title from <title> tag (empty if the page has none)
	                soup = BeautifulSoup(episodeRequest.text, features="html.parser")
	                pageTitle = soup.title.string if soup.title is not None else None
	                episodeName = sanitize_for_file_name((pageTitle or '').replace(seriesTitle, ''))

	                episodePath = os.path.join(
	                    seriesDir,
	                    "Episode " + str(episodeCounter) + " - " + episodeName + ".mp4"
	                )

	                if os.path.exists(episodePath):
	                    # Episode already exists, skip download
	                    print('Episode ' + episodeName + ' already exists, skipping.')
	                    continue

	                # Download episode and write to file
	                print("Downloading episode " + str(episodeCounter) + ": " + episodeName)
	                os.makedirs(seriesDir, exist_ok=True)

	                if not _download_to_file("https://www.laracasts.com" + downloadLink, cookies, episodePath):
	                    print("Failed to download episode " + str(episodeCounter) + ", skipping.")


	def _download_to_file(url, cookies, destination):
	    """Stream `url` into the file `destination`.

	    The body is written in chunks to ``destination + '.part'`` and renamed to
	    `destination` only once fully written, so an interrupted download is not
	    mistaken for a finished episode on the next run.

	    Returns True on success, False if the server answered with an HTTP error
	    status (in which case nothing is written).
	    """
	    partialPath = destination + '.part'

	    with requests.get(url=url, cookies=cookies, allow_redirects=True, stream=True) as response:
	        if not response.ok:
	            return False

	        with open(partialPath, 'wb') as videoFile:
	            for chunk in response.iter_content(chunk_size=1024 * 1024):
	                videoFile.write(chunk)

	    os.replace(partialPath, destination)
	    return True


	def sanitize_for_file_name(toSanitize):
	    """Return `toSanitize` made safe for use as a file or directory name.

	    Backslash, slash, asterisk, question mark, colon, double quote, angle
	    brackets, pipe and single quote are deleted, then leading and trailing
	    whitespace is stripped.
	    """
	    # The pipe needs no escaping inside a string literal; writing it as a
	    # backslash escape is an invalid escape sequence that newer Python
	    # versions warn about.
	    remove_punctuation_map = {ord(char): None for char in '\\/*?:"<>|\''}
	    return toSanitize.translate(remove_punctuation_map).strip()


	# Run the scraper only when this file is executed as a script, not when it is imported.
	if __name__ == "__main__":
	    main()