A defunct script for scraping download URLs from madokami.
#!/usr/bin/env python
"""decode.py
A python tool that decodes the obfuscated URL's on madokami pages and
outputs them to a file.
Dependencies: requests, beautifulsoup4
Usage: decode.py URL outputfile.txt
Note: This is no longer necessary/functional because madokami doesn't obfuscate their URL's anymore.
"""
import sys
import warnings
from urlparse import urlparse

import requests
from bs4 import BeautifulSoup


def validate_url(url):
    """Make sure the URL is from the right website and subdomain."""
    network_location = "manga.madokami.com"
    parsed_url = urlparse(url)
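    # e.g. urlparse("https://manga.madokami.com/Series").netloc
    # is "manga.madokami.com"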
    if parsed_url.netloc != network_location:
        print "Error: Input is not a madokami link."
        sys.exit(1)
    return


def get_webpage(url):
    """Get the contents of the page and return a BeautifulSoup object of
    the page.
    """
    agent = "Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0"
    http_headers = {"user-agent": agent}
    print "Getting Webpage..."
    # We are making an unverified SSL connection because madokami's SSL
    # certificate only seems to match madokami.com.
    warnings.simplefilter("ignore")  # Ignore urllib3's InsecureRequestWarning
    page = requests.get(url, verify=False, auth=(USERNAME, PASSWORD),
                        headers=http_headers)
    return_code = page.status_code
    if return_code == 401:  # Unauthorized
        print "Error: Please make sure your credentials are correct."
        sys.exit(1)
    if not page.ok:
        print "Error: The server returned a %d status code." % return_code
        sys.exit(1)
    # debugfile = open('/tmp/madokami.html', 'w')
    # debugfile.write(page.text)
    # debugfile.close()
    print "Parsing Webpage..."
    souped_page = BeautifulSoup(page.content, "html.parser")
    # Make sure madokami didn't just return the front page.
    if souped_page.title.string == "/ - /a/ manga":
        print "Error: Please make sure your URL points to a proper series."
        sys.exit(1)
    warnings.resetwarnings()
    return souped_page


def string_to_list(string):
    """Take a string of comma-delimited numbers and convert them to a
    list of ints.
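
    Example: string_to_list("104,116") returns [104, 116].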
"""
num_list = string.split(',')
num_list = [int(i) for i in num_list]
return num_list


def get_table(webpage):
    """Find the table in the page source and return the table as a list."""
    # Expects: <div class="index-container" data-table="[int,int ... ,int]">
    table = webpage.find("div", {"class": "index-container"})["data-table"]
    table = table[1:-1]  # Remove the brackets around the data
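    # e.g. "[10,20,30]"[1:-1] == "10,20,30"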
    lookup_table = string_to_list(table)
    return lookup_table


def get_urls(webpage):
    """Find and return all encoded urls to download in the page source."""
    # Expects: <a href="" data-enc='{"url":[int,int, ... ,int]}'></a>
    encoded_links = webpage.find_all("a", {"href": ""})
    link_list = []
    for link in encoded_links:
        # Strip the '{"url":[' prefix (8 chars) and the ']}' suffix
        link = link["data-enc"][8:-2]
        link_list.append(link)
    return link_list


def decode_urls(encoded_urls, lookup_table):
    """Decode each url one by one and return a decoded list."""
    url_list = []
    url_base = "https://manga.madokami.com"
    for url in encoded_urls:
        single_url_list = string_to_list(url)
        single_url = url_base
        for character in single_url_list:
            # Here's where the magic happens. First, we XOR the number in
            # the URL with the decimal 51 (0x33). Then we use the result as
            # an index into the lookup table to get the actual ASCII value.
            # Finally, we convert that value to a character.
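            # Hypothetical example (the real table comes from the page):
            # if character == 54, then 54 ^ 0x33 == 5, and with
            # lookup_table[5] == 104 the decoded character is chr(104),
            # i.e. 'h'.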
            ascii_num = character ^ 0x33
            ascii_code = lookup_table[ascii_num]
            ascii_letter = chr(ascii_code)
            single_url += ascii_letter
        url_list.append(single_url)
    print "Found %d URLs" % len(url_list)
    return url_list


def write_urls(url_list, output):
    """Write out the urls to a file."""
    out_file = open(output, 'w')
    for url in url_list:
        out_file.write("%s\n" % url)
    out_file.close()
    return


def main():
    """Get the input url, validate it, and get the webpage it points to.
    Then parse that page to obtain the download urls.
    """
    if len(sys.argv) == 3:
        input_url = sys.argv[1]
        output_file = sys.argv[2]
    else:
        print "Usage: ./decode.py URL URL_FILE"
        sys.exit(1)
    validate_url(input_url)
    webpage = get_webpage(input_url)
    lookup_table = get_table(webpage)
    encoded_urls = get_urls(webpage)
    decoded_urls = decode_urls(encoded_urls, lookup_table)
    write_urls(decoded_urls, output_file)
    sys.exit(0)
if __name__ == "__main__":
USERNAME = ""
PASSWORD = ""
try:
main()
except KeyboardInterrupt:
print "User sent keyboard interrupt. Exiting..."
exit(1)
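
# Example invocation (the series path is illustrative, not a real listing):
#     ./decode.py "https://manga.madokami.com/Manga/Some%20Series" urls.txt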