Skip to content

Instantly share code, notes, and snippets.

@rohitkg98
Created November 3, 2019 18:26
Show Gist options
  • Save rohitkg98/ffde7b2dddba4e86ca5521f661c19c06 to your computer and use it in GitHub Desktop.
Save rohitkg98/ffde7b2dddba4e86ca5521f661c19c06 to your computer and use it in GitHub Desktop.
import argparse
import os
import re
from html.parser import HTMLParser

import requests
# Command-line interface: a single required positional argument, the URL of
# the show's page.
parser = argparse.ArgumentParser(
    description='Input Link required.',
)
parser.add_argument(
    'root_url',
    help='Link to the download page of Anime.',
)
def get_api_url(root_url):
    """Build the HorribleSubs API URL for the show found at ``root_url``.

    The show page is downloaded and searched for the JavaScript assignment
    of the form ``var showid = ###;``. The numeric id found there is
    appended to the API base URL.

    Args:
        root_url: URL of the show's page.

    Returns:
        The API URL as a string, ending in ``showid=<id>``.

    Raises:
        requests.RequestException: if the request fails, times out or
            returns an HTTP error status.
        ValueError: if no ``showid = <digits>`` assignment is present in
            the downloaded page.
    """
    base_url = 'https://horriblesubs.info/api.php?method=getshows&type=show&showid='
    # A timeout keeps the script from hanging forever on a stalled connection.
    resp = requests.get(root_url, timeout=30)
    # Fail loudly on 4xx/5xx instead of scraping an error page.
    resp.raise_for_status()
    # Match the assignment itself rather than the first bare occurrence of
    # the word "showid", which may appear elsewhere in the markup.
    match = re.search(r'showid\s*=\s*(\d+)', resp.text)
    if match is None:
        raise ValueError(f'Could not find a show id in the page at {root_url!r}')
    # add showid to base url and return it
    return base_url + match.group(1)
def get_links_page(api_url):
    """Download every page of the API listing and return it as one string.

    Pages are requested as ``<api_url>&nextid=0``, ``&nextid=1``, ... until
    the response body is ``DONE`` (surrounding whitespace is ignored).

    Args:
        api_url: API URL for a show, without the ``nextid`` parameter.

    Returns:
        The bodies of all pages before the ``DONE`` marker, concatenated in
        request order. Empty string if the first response is ``DONE``.

    Raises:
        requests.RequestException: if a request fails, times out or returns
            an HTTP error status.
    """
    pages = []
    page_counter = 0
    while True:
        resp = requests.get(f"{api_url}&nextid={page_counter}", timeout=30)
        # Without this check an error page would never equal 'DONE' and the
        # loop would keep requesting pages indefinitely.
        resp.raise_for_status()
        if resp.text.strip() == 'DONE':
            break
        pages.append(resp.text)
        page_counter += 1
    return ''.join(pages)
def parse_and_save(parser, html_text, file_name):
    """Parse ``html_text`` and write the collected links to disk.

    The output layout is ``<cwd>/<file_name>/<key>/<type>``: one directory
    per top-level key of ``parser.data``, and inside it one text file per
    inner key, containing one link per line.

    Args:
        parser: Object exposing ``feed(text)`` and, after feeding, a ``data``
            mapping shaped like ``{key: {type: [link, ...]}}``.
        html_text: HTML passed to ``parser.feed``.
        file_name: Name of the output directory, created inside the current
            working directory.

    Existing directories are reused and existing link files are overwritten,
    so the function can be run again for the same show.
    """
    out_dir = os.path.join(os.getcwd(), file_name)
    # exist_ok: a previous run for the same show must not abort this one.
    os.makedirs(out_dir, exist_ok=True)
    parser.feed(html_text)
    for key, value in parser.data.items():
        key_dir = os.path.join(out_dir, key)
        os.makedirs(key_dir, exist_ok=True)
        for types, links in value.items():
            # Explicit encoding so the output does not depend on the locale.
            with open(os.path.join(key_dir, types), 'w', encoding='utf-8') as file:
                file.writelines(f'{link} \n' for link in links)
# HTML Parser for Horrible Subs Download page
class HorribleSubsParser(HTMLParser):
    """Collect links from ``<a>`` tags, grouped by resolution category.

    After ``feed()``, ``data`` maps each category (``'480p'``, ``'720p'``,
    ``'1080p'``) to a dict of ``{first attribute value: [second attribute
    value, ...]}`` built from every ``<a>`` tag that has at least two
    attributes. Presumably the first attribute is the link's label/title and
    the second its href -- this relies on the attribute order in the markup.

    The category a link is filed under is the most recent category name seen
    in the text content of the document; it starts as ``'480p'``.

    All state is per instance, so separate parser objects do not share
    collected links.
    """

    # Checked in this order; when a text node names several categories the
    # last one listed here wins.
    CATEGORIES = ('480p', '720p', '1080p')

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.tags = []
        self.current_category = '480p'
        self.data = {category: {} for category in self.CATEGORIES}

    def handle_starttag(self, tag, attrs):
        """Record the first two attribute values of each ``<a>`` tag."""
        if tag != 'a' or len(attrs) < 2:
            return
        label, link = attrs[0][1], attrs[1][1]
        self.data[self.current_category].setdefault(label, []).append(link)

    def handle_data(self, data):
        """Switch the current category when the text names a different one."""
        if self.current_category in data:
            return
        for category in self.CATEGORIES:
            if category in data:
                self.current_category = category
if __name__ == "__main__":
    args = parser.parse_args()

    # Resolve the show's API endpoint, then download the full listing as a
    # single HTML string.
    full_html = get_links_page(get_api_url(args.root_url))

    # The output folder is named after the last '/'-separated piece of the
    # URL, or the one before it when the URL ends with a slash.
    url_parts = args.root_url.split('/')
    show_name = url_parts[-1] or url_parts[-2]

    # Parse the listing with a fresh parser and write the links to disk.
    parse_and_save(HorribleSubsParser(), full_html, show_name)
@rohitkg98
Copy link
Author

Just provide the URL of the show's page as the first argument.

@sameerp98
Copy link

Sugoi.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment