Last active
August 29, 2015 14:12
-
-
Save razasyedh/212a4e7724be1a3fe18d to your computer and use it in GitHub Desktop.
A defunct script for scraping download URLs from madokami.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
"""decode.py

A Python tool that decodes the obfuscated URLs on madokami pages and
outputs them to a file.

Dependencies: requests, beautifulsoup4
Usage: decode.py URL outputfile.txt

Note: This is no longer necessary/functional because madokami doesn't
obfuscate its URLs anymore.
"""
import sys | |
import requests | |
from bs4 import BeautifulSoup | |
from urlparse import urlparse | |
import warnings | |
def validate_url(url):
    """Make sure the URL is from the right website and subdomain.

    The host is compared via ``hostname`` rather than ``netloc`` so that
    differences in letter case, an explicit port, or embedded credentials
    do not cause a valid madokami link to be rejected.

    Args:
        url: The URL string supplied by the user.

    Returns:
        None when the URL points at ``manga.madokami.com``.

    Raises:
        SystemExit: With status 1, after printing an error message, when
            the URL's host is anything else (including a missing host).
    """
    network_location = "manga.madokami.com"
    # ``hostname`` is lower-cased and stripped of port/userinfo; it is None
    # when the input has no network location at all.
    hostname = urlparse(url).hostname
    if hostname != network_location:
        print("Error: Input is not a madokami link.")
        sys.exit(1)
    return
def get_webpage(url):
    """Get the contents of the page and return a BeautifulSoup object of it.

    The request is sent with a browser-like user agent and HTTP basic auth
    built from the module-level ``USERNAME`` and ``PASSWORD`` values.

    Args:
        url: Address of the page to download.

    Returns:
        The page body parsed into a ``BeautifulSoup`` object.

    Raises:
        SystemExit: With status 1, after printing an error message, when
            the server answers 401, answers with any other non-OK status,
            or returns a page whose title matches the site's front page.
    """
    agent = "Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0"
    http_headers = {"user-agent": agent}

    print("Getting Webpage...")
    # We are making an unverified SSL connection because madokami's SSL
    # certificate only seems to match madokami.com
    # catch_warnings() restores the caller's warning filters on exit, so
    # silencing urllib3's InsecureRequestWarning stays local to this request.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        page = requests.get(url, verify=False, auth=(USERNAME, PASSWORD),
                            headers=http_headers)

    return_code = page.status_code
    if return_code == 401:  # unauthorized
        print("Error: Please make sure your credentials are correct.")
        sys.exit(1)
    if not page.ok:
        print("Error: The server returned a %d status code." % return_code)
        sys.exit(1)

    print("Parsing Webpage...")
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    souped_page = BeautifulSoup(page.content, "html.parser")

    # Make sure madokami didn't just return the front page. A page without
    # a <title> has ``title`` set to None, so guard before reading it.
    title = souped_page.title
    if title is not None and title.string == "/ - /a/ manga":
        print("Error: Please make sure your URL points to a proper series.")
        sys.exit(1)

    return souped_page
def string_to_list(string):
    """Convert a string of comma-delimited numbers into a list of ints.

    Args:
        string: Text such as ``"1,2,3"``. Whitespace around each number is
            tolerated because ``int`` ignores it.

    Returns:
        A list of ints in the order they appear; an empty list when the
        input is empty or contains only whitespace.

    Raises:
        ValueError: If any comma-separated field is not a valid integer.
    """
    # "".split(',') yields [''] and int('') raises, so an empty payload
    # (e.g. what remains of "[]" once the brackets are removed) is handled
    # explicitly.
    if not string.strip():
        return []
    return [int(item) for item in string.split(',')]
def get_table(webpage):
    """Find the lookup table in the page source and return it as a list.

    Args:
        webpage: Parsed page, as returned by ``get_webpage``.

    Returns:
        The list of ints parsed from the ``data-table`` attribute.

    Raises:
        SystemExit: With status 1, after printing an error message, when
            the page has no ``index-container`` div or that div carries no
            ``data-table`` attribute.
    """
    # Expects: <div class="index-container" data-table="[int,int ... ,int]">
    container = webpage.find("div", {"class": "index-container"})
    raw_table = None if container is None else container.get("data-table")
    if raw_table is None:
        # Without this check a page lacking the table fails with an opaque
        # TypeError/KeyError instead of a readable message.
        print("Error: Could not find the lookup table in the page.")
        sys.exit(1)
    # Remove the brackets around the data before splitting on commas.
    return string_to_list(raw_table[1:-1])
def get_urls(webpage):
    """Find and return all encoded urls to download in the page source.

    Args:
        webpage: Parsed page, as returned by ``get_webpage``.

    Returns:
        A list of strings, one per encoded link, each holding the
        comma-delimited numbers found inside the link's ``data-enc`` value.
        Anchors with an empty ``href`` but no ``data-enc`` attribute are
        skipped.
    """
    # Expects: <a href="" data-enc="{"url":[int,int, ... ,int]}"></a>
    link_list = []
    for link in webpage.find_all("a", {"href": ""}):
        encoded = link.get("data-enc")
        if encoded is None:
            # An unrelated anchor that merely has an empty href.
            continue
        # Drop the leading '{"url":[' (8 chars) and the trailing ']}' (2 chars).
        link_list.append(encoded[8:-2])
    return link_list
def decode_urls(encoded_urls, lookup_table):
    """Decode every encoded url and return the decoded urls as a list.

    Args:
        encoded_urls: Strings of comma-delimited numbers, one per link.
        lookup_table: List of ints indexed by the XOR-ed url numbers.

    Returns:
        A list of decoded urls, each prefixed with the madokami base url.
        The number of urls found is printed before returning.
    """
    prefix = "https://manga.madokami.com"
    decoded = []
    for encoded in encoded_urls:
        # Each number is XOR-ed with 0x33 (decimal 51); the result indexes
        # the lookup table, whose entry is the character code of the real
        # character, which chr() then turns into text.
        path = "".join(
            chr(lookup_table[number ^ 0x33])
            for number in string_to_list(encoded)
        )
        decoded.append(prefix + path)
    print("Found %d URL's" % len(decoded))
    return decoded
def write_urls(url_list, output):
    """Write out the urls to a file, one per line.

    Args:
        url_list: Iterable of url strings to write.
        output: Path of the file to create or overwrite.

    Returns:
        None.
    """
    # The context manager closes the file even if a write fails part-way.
    with open(output, 'w') as out_file:
        out_file.writelines("%s\n" % url for url in url_list)
    return
def main():
    """Run the tool from the command line.

    Reads the input url and the output file name from ``sys.argv``, checks
    the url, downloads the page it points to, extracts and decodes the
    download urls, and writes them to the output file. Exits with status 0
    on success, or status 1 after printing the usage line when the argument
    count is wrong.
    """
    # Guard clause: exactly two arguments besides the script name.
    if len(sys.argv) != 3:
        print("Usage: ./decode.py URL URL_FILE")
        sys.exit(1)
    input_url, output_file = sys.argv[1], sys.argv[2]

    validate_url(input_url)
    soup = get_webpage(input_url)
    table = get_table(soup)
    links = get_urls(soup)
    write_urls(decode_urls(links, table), output_file)
    sys.exit(0)
# Credentials for HTTP basic auth; get_webpage reads these module globals.
# They are defined at module level (not only when run as a script) so that
# get_webpage also works when this file is imported.
USERNAME = ""
PASSWORD = ""

if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("User sent keyboard interrupt. Exiting...")
        # sys.exit rather than the builtin exit(): the latter is installed
        # by the site module for interactive use and may be absent.
        sys.exit(1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment