# rohitkg98/horriblesubs_batch_download.py

## horriblesubs_batch_download.py
import os
import requests
import argparse
from html.parser import HTMLParser

# Command-line interface: one required positional argument, the show page URL.
parser = argparse.ArgumentParser(
    description='Input Link required.',
)
parser.add_argument(
    'root_url',
    help='Link to the download page of Anime.',
)

def get_api_url(root_url, *, session=None, timeout=30):
    """Build the episode-listing API URL for the show page at *root_url*.

    The page is downloaded and searched for the first occurrence of
    ``showid``; the page is expected to declare it as ``var showid = ###;``.
    The id is appended to the API endpoint.

    Args:
        root_url: URL of the show's page.
        session: Optional object exposing a requests-compatible ``get``
            (for example a ``requests.Session``). Defaults to the
            ``requests`` module itself.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The API URL, ending in ``showid=<id>``.

    Raises:
        ValueError: If no numeric show id can be found in the page.
        Exception: Whatever ``raise_for_status()`` raises on an HTTP error
            status (``requests.HTTPError`` when using requests).
    """
    base_url = 'https://horriblesubs.info/api.php?method=getshows&type=show&showid='
    http = requests if session is None else session
    resp = http.get(root_url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of scraping an error page.
    resp.raise_for_status()

    page = resp.text
    start = page.find('showid')
    if start == -1:
        # Without this check the slice below would start at index -1 and
        # silently produce a meaningless id from the last character.
        raise ValueError(f'no show id found in page {root_url!r}')

    # Keep the text up to the first ';' (i.e. 'showid = ###'), then take
    # whatever follows the last '=' and trim the surrounding whitespace.
    showid = page[start:].split(';', 1)[0].split('=')[-1].strip()
    if not showid.isdigit():
        raise ValueError(
            f'unexpected show id {showid!r} found in page {root_url!r}'
        )
    return base_url + showid

def get_links_page(api_url, *, session=None, timeout=30):
    """Download every page of the listing behind *api_url* and join them.

    Pages are requested as ``<api_url>&nextid=0``, ``&nextid=1``, ... until
    the server replies with ``DONE``.

    Args:
        api_url: API URL to paginate; ``&nextid=<n>`` is appended to it.
        session: Optional object exposing a requests-compatible ``get``
            (for example a ``requests.Session``, which also reuses the
            connection across pages). Defaults to the ``requests`` module.
        timeout: Seconds to wait for each response before giving up.

    Returns:
        The bodies of all pages concatenated in order, without the final
        ``DONE`` marker. Empty string when the first reply is ``DONE``.

    Raises:
        Exception: Whatever ``raise_for_status()`` raises on an HTTP error
            status (``requests.HTTPError`` when using requests).
    """
    http = requests if session is None else session
    pages = []
    page_counter = 0
    while True:
        resp = http.get(f"{api_url}&nextid={page_counter}", timeout=timeout)
        # An error page never equals 'DONE'; without this check the loop
        # would keep requesting (and accumulating) error pages forever.
        resp.raise_for_status()

        body = resp.text
        marker = body.strip()
        # Stop on the terminator (tolerating stray whitespace) and also on
        # an empty body, which can never make progress.
        if not marker or marker == 'DONE':
            break

        pages.append(body)
        page_counter += 1

    return ''.join(pages)

def parse_and_save(parser, html_text, file_name):
    """Parse *html_text* and write the collected links below the cwd.

    The layout produced is ``<cwd>/<file_name>/<key>/<type>``, where
    ``key`` and ``type`` are the outer and inner keys of ``parser.data``
    and each file holds one link per line.

    Args:
        parser: Object with a ``feed(html)`` method that fills a ``data``
            attribute shaped ``{key: {type: [link, ...]}}``.
        html_text: HTML to hand to ``parser.feed``.
        file_name: Name of the top-level output directory, created inside
            the current working directory.

    Existing directories are reused and existing link files overwritten,
    so the function can safely be run again for the same show.
    """
    show_dir = os.path.join(os.getcwd(), file_name)
    # exist_ok: a second run for the same show must not crash on mkdir.
    os.makedirs(show_dir, exist_ok=True)

    parser.feed(html_text)
    for category, links_by_type in parser.data.items():
        category_dir = os.path.join(show_dir, category)
        os.makedirs(category_dir, exist_ok=True)
        for link_type, links in links_by_type.items():
            target = os.path.join(category_dir, link_type)
            # Explicit encoding keeps the output independent of the locale.
            with open(target, 'w', encoding='utf-8') as out:
                out.writelines(f'{link} \n' for link in links)

# HTML Parser for Horrible Subs Download page
class HorribleSubsParser(HTMLParser):
    """Collect anchor attributes from fed HTML, grouped by resolution.

    Text nodes switch the current resolution whenever they mention
    ``480p``, ``720p`` or ``1080p``; every ``<a>`` tag with at least two
    attributes is then recorded under that resolution. After ``feed()``
    the result is available in ``data`` as
    ``{resolution: {first_attr_value: [second_attr_value, ...]}}``.
    """

    # Resolution labels in the order they are checked; when one text node
    # mentions several of them, the last match wins.
    CATEGORIES = ('480p', '720p', '1080p')

    # Bucket used until the text names a resolution.
    current_category = '480p'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # State lives on the instance: as class attributes these mutable
        # containers were shared by every parser, so links collected by
        # one parser showed up in all the others.
        self.tags = []
        self.current_category = '480p'
        self.data = {category: {} for category in self.CATEGORIES}

    def handle_starttag(self, tag, attrs):
        """Record an ``<a>`` tag that carries at least two attributes."""
        if tag != 'a' or len(attrs) < 2:
            return
        # NOTE(review): relies on attribute order - presumably the first
        # attribute labels the link type and the second holds the URL;
        # confirm against the real page markup.
        link_type = attrs[0][1]
        link = attrs[1][1]
        self.data[self.current_category].setdefault(link_type, []).append(link)

    def handle_data(self, data):
        """Switch the current resolution when *data* names a different one."""
        if self.current_category in data:
            return
        for category in self.CATEGORIES:
            if category in data:
                self.current_category = category

if __name__ == "__main__":
    args = parser.parse_args()

    # The last non-empty path segment of the show URL names the output
    # folder. Stripping every trailing '/' handles '.../shows/foo',
    # '.../shows/foo/' and '.../shows/foo//' alike; validating it before
    # any download avoids fetching everything only to fail on an empty name.
    show_name = args.root_url.rstrip('/').rsplit('/', 1)[-1]
    if not show_name:
        parser.error(f'cannot derive a show name from {args.root_url!r}')

    # Resolve the show page to its API endpoint.
    api_url = get_api_url(args.root_url)

    # Download the complete listing as one HTML string.
    full_html = get_links_page(api_url)

    # Parse the listing and write the links to disk.
    html_parser = HorribleSubsParser()
    parse_and_save(html_parser, full_html, show_name)
	import os
	import requests
	import argparse
	from html.parser import HTMLParser

	# Command-line interface: one required positional argument, the show page URL.
	parser = argparse.ArgumentParser(
	    description='Input Link required.',
	)
	parser.add_argument(
	    'root_url',
	    help='Link to the download page of Anime.',
	)

	def get_api_url(root_url, *, session=None, timeout=30):
	    """Build the episode-listing API URL for the show page at *root_url*.

	    The page is downloaded and searched for the first occurrence of
	    ``showid``; the page is expected to declare it as ``var showid = ###;``.
	    The id is appended to the API endpoint.

	    Args:
	        root_url: URL of the show's page.
	        session: Optional object exposing a requests-compatible ``get``
	            (for example a ``requests.Session``). Defaults to the
	            ``requests`` module itself.
	        timeout: Seconds to wait for the server before giving up.

	    Returns:
	        The API URL, ending in ``showid=<id>``.

	    Raises:
	        ValueError: If no numeric show id can be found in the page.
	        Exception: Whatever ``raise_for_status()`` raises on an HTTP error
	            status (``requests.HTTPError`` when using requests).
	    """
	    base_url = 'https://horriblesubs.info/api.php?method=getshows&type=show&showid='
	    http = requests if session is None else session
	    resp = http.get(root_url, timeout=timeout)
	    # Fail loudly on 4xx/5xx instead of scraping an error page.
	    resp.raise_for_status()

	    page = resp.text
	    start = page.find('showid')
	    if start == -1:
	        # Without this check the slice below would start at index -1 and
	        # silently produce a meaningless id from the last character.
	        raise ValueError(f'no show id found in page {root_url!r}')

	    # Keep the text up to the first ';' (i.e. 'showid = ###'), then take
	    # whatever follows the last '=' and trim the surrounding whitespace.
	    showid = page[start:].split(';', 1)[0].split('=')[-1].strip()
	    if not showid.isdigit():
	        raise ValueError(
	            f'unexpected show id {showid!r} found in page {root_url!r}'
	        )
	    return base_url + showid

	def get_links_page(api_url, *, session=None, timeout=30):
	    """Download every page of the listing behind *api_url* and join them.

	    Pages are requested as ``<api_url>&nextid=0``, ``&nextid=1``, ... until
	    the server replies with ``DONE``.

	    Args:
	        api_url: API URL to paginate; ``&nextid=<n>`` is appended to it.
	        session: Optional object exposing a requests-compatible ``get``
	            (for example a ``requests.Session``, which also reuses the
	            connection across pages). Defaults to the ``requests`` module.
	        timeout: Seconds to wait for each response before giving up.

	    Returns:
	        The bodies of all pages concatenated in order, without the final
	        ``DONE`` marker. Empty string when the first reply is ``DONE``.

	    Raises:
	        Exception: Whatever ``raise_for_status()`` raises on an HTTP error
	            status (``requests.HTTPError`` when using requests).
	    """
	    http = requests if session is None else session
	    pages = []
	    page_counter = 0
	    while True:
	        resp = http.get(f"{api_url}&nextid={page_counter}", timeout=timeout)
	        # An error page never equals 'DONE'; without this check the loop
	        # would keep requesting (and accumulating) error pages forever.
	        resp.raise_for_status()

	        body = resp.text
	        marker = body.strip()
	        # Stop on the terminator (tolerating stray whitespace) and also on
	        # an empty body, which can never make progress.
	        if not marker or marker == 'DONE':
	            break

	        pages.append(body)
	        page_counter += 1

	    return ''.join(pages)

	def parse_and_save(parser, html_text, file_name):
	    """Parse *html_text* and write the collected links below the cwd.

	    The layout produced is ``<cwd>/<file_name>/<key>/<type>``, where
	    ``key`` and ``type`` are the outer and inner keys of ``parser.data``
	    and each file holds one link per line.

	    Args:
	        parser: Object with a ``feed(html)`` method that fills a ``data``
	            attribute shaped ``{key: {type: [link, ...]}}``.
	        html_text: HTML to hand to ``parser.feed``.
	        file_name: Name of the top-level output directory, created inside
	            the current working directory.

	    Existing directories are reused and existing link files overwritten,
	    so the function can safely be run again for the same show.
	    """
	    show_dir = os.path.join(os.getcwd(), file_name)
	    # exist_ok: a second run for the same show must not crash on mkdir.
	    os.makedirs(show_dir, exist_ok=True)

	    parser.feed(html_text)
	    for category, links_by_type in parser.data.items():
	        category_dir = os.path.join(show_dir, category)
	        os.makedirs(category_dir, exist_ok=True)
	        for link_type, links in links_by_type.items():
	            target = os.path.join(category_dir, link_type)
	            # Explicit encoding keeps the output independent of the locale.
	            with open(target, 'w', encoding='utf-8') as out:
	                out.writelines(f'{link} \n' for link in links)

	# HTML Parser for Horrible Subs Download page
	class HorribleSubsParser(HTMLParser):
	    """Collect anchor attributes from fed HTML, grouped by resolution.

	    Text nodes switch the current resolution whenever they mention
	    ``480p``, ``720p`` or ``1080p``; every ``<a>`` tag with at least two
	    attributes is then recorded under that resolution. After ``feed()``
	    the result is available in ``data`` as
	    ``{resolution: {first_attr_value: [second_attr_value, ...]}}``.
	    """

	    # Resolution labels in the order they are checked; when one text node
	    # mentions several of them, the last match wins.
	    CATEGORIES = ('480p', '720p', '1080p')

	    # Bucket used until the text names a resolution.
	    current_category = '480p'

	    def __init__(self, *args, **kwargs):
	        super().__init__(*args, **kwargs)
	        # State lives on the instance: as class attributes these mutable
	        # containers were shared by every parser, so links collected by
	        # one parser showed up in all the others.
	        self.tags = []
	        self.current_category = '480p'
	        self.data = {category: {} for category in self.CATEGORIES}

	    def handle_starttag(self, tag, attrs):
	        """Record an ``<a>`` tag that carries at least two attributes."""
	        if tag != 'a' or len(attrs) < 2:
	            return
	        # NOTE(review): relies on attribute order - presumably the first
	        # attribute labels the link type and the second holds the URL;
	        # confirm against the real page markup.
	        link_type = attrs[0][1]
	        link = attrs[1][1]
	        self.data[self.current_category].setdefault(link_type, []).append(link)

	    def handle_data(self, data):
	        """Switch the current resolution when *data* names a different one."""
	        if self.current_category in data:
	            return
	        for category in self.CATEGORIES:
	            if category in data:
	                self.current_category = category

	if __name__ == "__main__":
	    args = parser.parse_args()

	    # The last non-empty path segment of the show URL names the output
	    # folder. Stripping every trailing '/' handles '.../shows/foo',
	    # '.../shows/foo/' and '.../shows/foo//' alike; validating it before
	    # any download avoids fetching everything only to fail on an empty name.
	    show_name = args.root_url.rstrip('/').rsplit('/', 1)[-1]
	    if not show_name:
	        parser.error(f'cannot derive a show name from {args.root_url!r}')

	    # Resolve the show page to its API endpoint.
	    api_url = get_api_url(args.root_url)

	    # Download the complete listing as one HTML string.
	    full_html = get_links_page(api_url)

	    # Parse the listing and write the links to disk.
	    html_parser = HorribleSubsParser()
	    parse_and_save(html_parser, full_html, show_name)