Skip to content

Instantly share code, notes, and snippets.

@oneamitj
Last active March 29, 2016 07:59
Show Gist options
  • Save oneamitj/509566d7eaef39269689 to your computer and use it in GitHub Desktop.
Download all chapters of a comic, given its hellocomic.com URL.
#!/usr/bin/python3
import urllib.request, urllib.error, urllib.parse
from pdb import set_trace
from bs4 import BeautifulSoup
import os
# Prompt for the series URL and fetch its first page.
# Example: 'http://www.hellocomic.com/miles-morales-ultimate-spider-man/c1/p1'
url = input('Enter url from hellocomic.com: ')

# A browser-like User-Agent — presumably the site rejects urllib's default
# UA (TODO confirm); the exact string is otherwise arbitrary.
user_agent = ('Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) '
              'AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.63 Safari/534.3')
headers = {'User-Agent': user_agent}

req = urllib.request.Request(url, None, headers)
response = urllib.request.urlopen(req)
# Explicit parser: bs4 warns when none is given, and the implicit choice
# varies with what is installed.
soup_page = BeautifulSoup(response.read(), 'html.parser')
# Every chapter of the series is listed as an <option> in the chapter
# dropdown (id="e2"); its value attribute is the chapter's URL.
chapters = [option.get('value')
            for option in soup_page.find(id='e2').find_all('option')]
for chapter in chapters:
    # Load the chapter's first page.
    html_page = BeautifulSoup(
        urllib.request.urlopen(
            urllib.request.Request(chapter, None, headers)).read(),
        'html.parser')
    # One <option> per page in the page dropdown (id="e1") gives the count.
    pages = range(len(html_page.find(id='e1').find_all('option')))
    # Page title looks like "<Series Chapter> - Read ... - Page #N";
    # the part before " - Read" names the download folder.
    folder = html_page.title.string.split(' - Read')[0]
    # makedirs instead of shelling out to `mkdir -p` — no shell involved.
    os.makedirs(folder, exist_ok=True)
    print('\nDownloading {}\n\tPages'.format(folder))
    for page in pages:
        img_url = html_page.find(attrs={'class': 'coverIssue'}).find('img').get('src')
        current_page = html_page.title.string.split(' - Page #')[1]
        print(" ==> {}".format(current_page), end=' ')
        # Download via urllib with the same headers instead of shelling out
        # to wget: avoids shell injection through the page title / image URL
        # and keeps the spoofed User-Agent on the image request too.
        out_path = os.path.join(folder, current_page + '.jpg')
        if not os.path.exists(out_path):  # mimic wget -nc: never clobber
            img_data = urllib.request.urlopen(
                urllib.request.Request(img_url, None, headers)).read()
            with open(out_path, 'wb') as f:
                f.write(img_data)
        print("✓")
        # The cover image on each page links to the next page of the chapter;
        # follow it to advance (on the last page this fetch is wasted but
        # harmless — preserved from the original flow).
        nxt_page = html_page.find(attrs={'class': 'coverIssue'}).a.get('href')
        html_page = BeautifulSoup(
            urllib.request.urlopen(
                urllib.request.Request(nxt_page, None, headers)).read(),
            'html.parser')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment