Skip to content

Instantly share code, notes, and snippets.

Created August 15, 2012 00:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anonymous/3354336 to your computer and use it in GitHub Desktop.
Save anonymous/3354336 to your computer and use it in GitHub Desktop.
Webcomic Downloader
#!/usr/bin/env python2
import sys
from PyQt4 import QtGui, QtCore
from scrape_ui import Ui_PieAndCake
import requests
from time import gmtime, strftime
import os
import platform
import getpass
# Login name of the current user, as reported by getpass.getuser(); the
# download functions below build their "/home/<username>/Documents/..." paths
# from it.
username = getpass.getuser()
# Operating-system name as reported by platform.system().
usr_os = platform.system()
def Homestuck():
    """Download the Homestuck archive into ~/Documents/Homestuck/.

    Steps, in order:
      1. create the output folders and fetch the layout images shared by
         every page;
      2. read the newest page number off the site's front page to work out
         how many pages to fetch;
      3. for each page, download its panel (a single .gif, a numbered
         ``_N.gif`` series, or a .swf) and a copy of the page HTML with its
         links rewritten to local paths.

    Progress is published through the module globals ``content`` (pages
    left), ``imgval`` (zero-padded number of the current panel) and
    ``h_content`` (total pages); ``myapp.homeBar()`` is called after every
    page so the GUI can read them.

    Returns None. Performs network requests and writes files.
    """
    # Declared before any assignment: assigning a name and only then
    # declaring it global is rejected by newer interpreters.
    global content, imgval, h_content

    print('Initiating download of the complete Homestuck archive.')
    # Resolve the real home directory instead of assuming "/home/<user>".
    home = os.path.expanduser("~")
    #HTML
    rootdata = home + "/Documents/Homestuck/"
    print(rootdata)
    #images
    rootimg = rootdata + "images/"
    print("Program started @ %s" % strftime("%Y-%m-%d %H:%M:%S", gmtime()))
    #endval is the comic's identification number
    endval = "001901"
    #root page and image urls
    page = "http://www.mspaintadventures.com/?s=6&p="
    imgroot = "http://www.mspaintadventures.com/storyfiles/hs2/"
    favipath = rootdata + "favicon.ico"
    #setting up variables for later
    imgval = 0

    #create the folders for the data if they dont exist
    for folder in (rootdata, rootimg):
        if not os.path.exists(folder):
            os.makedirs(folder)

    #download alignment images used on every page
    fnames = ["v2_blankstrip.gif",
              "v2_blanksquare.gif",
              "spacer.gif",
              "v2_blanksquare2.gif",
              "v2_blanksquare3.gif",
              "favicon.ico"]
    for position, name in enumerate(fnames, 1):
        print("Fetching spacers... (%s/%s)" % (position, len(fnames)))
        f = requests.get("http://www.mspaintadventures.com/images/" + name)
        # Image data is binary: write it in binary mode.
        with open(rootdata + name, 'wb') as out:
            out.write(f.content)

    #script will now attempt to identify how many comics there are.
    print("Identifying amount of content to download...")
    r = requests.get("http://www.mspaintadventures.com/")
    html = r.text
    # The slicing below is kept exactly as written; it depends on the front
    # page's layout at the time the script was authored.
    start = html.find('<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">')
    end = html.find("<!-----------------------LATEST PAGES----------------------------->")
    html = html[:start] + html[end:]
    start = html.find("<!-----------------------END LATEST PAGES------------------------->")
    end = html.find("</html>")
    html = html[:start] + html[end:]
    start = html.find("<!-----------------------LATEST PAGES----------------------------->")
    end = html.find('?s=6&p=')
    html = html[:start] + html[end:]
    start = html.find('">"')
    end = html.find('</html>')
    html = html[:start] + html[end:]
    html = html[13:19]
    # Page ids start at 1901, so the offset from it is the page count.
    currentcomicval = int(html) - 1901
    print("Downloading " + str(currentcomicval) + " pages of comics.")
    content = currentcomicval
    h_content = content

    #main loop
    while content > 0:
        # Panel-type flags. They stay True when the panel is skipped, which
        # also disables the flash repair of the HTML further down.
        gif = True
        multigif = True
        multigifid = 1
        imgval = str(int(imgval) + 1).rjust(5, '0')
        urlgif = imgroot + imgval + ".gif"
        urlmultigif = imgroot + imgval + "_" + str(multigifid) + ".gif"
        flaurl = imgroot + imgval + "/" + imgval + ".swf"
        # Look for earlier downloads where the files are actually written
        # (rootimg); checking rootdata never matched, so nothing was skipped.
        gifpath = rootimg + imgval + ".gif"
        multigifpath = rootimg + imgval + "_1.gif"
        flashpath = rootimg + imgval + ".swf"
        downloaded = (os.path.exists(gifpath)
                      or os.path.exists(multigifpath)
                      or os.path.exists(flashpath))
        if not downloaded:
            # Probe all three candidate URLs; a 404 rules that type out.
            gif = requests.get(urlgif).status_code != 404
            multigif = requests.get(urlmultigif).status_code != 404
            flash = requests.get(flaurl).status_code != 404
            #now to download the file
            if not multigif and not flash:
                #regular, single .gifs
                try:
                    f = requests.get(urlgif)
                except requests.exceptions.RequestException:
                    print("Something went wrong while downloading the .gif.")
                    print(urlgif)
                    break
                with open(gifpath, 'wb') as out:
                    out.write(f.content)
            elif not gif and not flash:
                #more than 1 gif on a page: fetch _1, _2, ... until a 404
                while True:
                    urlmultigif = imgroot + imgval + "_" + str(multigifid) + ".gif"
                    print(urlmultigif)
                    f = requests.get(urlmultigif)
                    if f.status_code == 404:
                        break
                    imgpath = rootimg + imgval + "_" + str(multigifid) + ".gif"
                    with open(imgpath, 'wb') as out:
                        out.write(f.content)
                    multigifid += 1
            elif not gif and not multigif:
                #Flash content
                print(flaurl)
                f = requests.get(flaurl)
                with open(flashpath, 'wb') as out:
                    out.write(f.content)
            else:
                print("Something went horribly wrong!")
        else:
            print("Image number " + imgval + " skipped.")

        # Now we download the html
        root = rootdata + endval + ".html"
        if not os.path.exists(root):
            #open the webpage
            response = requests.get(page + endval)
            html = response.text
            #fix paths and whatnot
            # NOTE(review): panel URLs are rewritten to rootdata while the
            # panel files are saved under rootimg - confirm which is intended.
            html = html.replace("http://www.mspaintadventures.com/storyfiles/hs2/", rootdata)
            start = html.find("<!------------------------end comic content----------------------------------->")
            end = html.find("</html>")
            html = html[:start] + html[end:]
            start = html.find("<!------------------------begin nav----------------------------------->")
            end = html.find("<!------------------------end nav----------------------------------->")
            html = html[:start] + html[end:]
            html = html.replace(endval, "")
            #we need to increase the endval by one to link to the next comic
            endval = str(int(endval) + 1).rjust(6, '0')
            htmlpath = rootdata + endval + ".html"
            html = html.replace("?s=6&p=" + endval, htmlpath)
            html = html.replace("images/", rootdata)
            html = html.replace("favicon.ico", favipath)
            #flash URL repair code
            if not gif and not multigif:
                print("Repairing flash code...")
                start = html.find('<script language="javascript">AC_FL_RunContent = 0;</script>')
                objns = " </object>\n</noscript>"
                end = html.find(objns)
                html = html[:start] + html[end + 1:]
                swffilelink = rootdata + imgval + ".swf"
                swflink = "<a href=" + '"' + swffilelink + '"' + 'target="_self" name="Flash Content Link">Click here for flash</a>'
                html = html.replace("</object>", swflink)
            # Some panels contain non-ASCII symbols; drop them on write.
            # The file is created only now, so a failure above cannot leave
            # an empty page behind that a later run would skip.
            with open(root, 'wb') as out:
                out.write(html.encode('ascii', 'ignore'))
        else:
            endval = str(int(endval) + 1).rjust(6, '0')
            print("html page " + endval + " skipped.")
        content -= 1
        myapp.homeBar()
    print("Finsihed downloading @: %s" % strftime("%Y-%m-%d %H:%M:%S", gmtime()))
# End Homestuck download code
def QC():
    """Download the Questionable Content archive into ~/Documents.

    Creates ``~/Documents/Questionable Content/`` (pages, stylesheet, logo)
    and its ``comics/`` subfolder (strips), reads the newest strip number off
    the site's front page, then walks strip numbers upward from 1, saving a
    rewritten HTML page and the strip image for each one not already on disk.

    Progress is published through the module globals ``qontent`` (strips
    left), ``qc_content`` (total strips) and ``qmgval`` (current strip
    number); ``myapp.qc_bar()`` is called after every strip.

    Returns None. Performs network requests and writes files.
    Raises ValueError if the strip number cannot be parsed from the front page.
    """
    global qontent, qc_content, qmgval

    print('Initiating download of the complete Qestionable Content archive.')
    # Resolve the real home directory instead of assuming "/home/<user>".
    home = os.path.expanduser("~")
    #HTML
    rootdir = home + "/Documents/Questionable Content/"
    #CSS Local
    localcss = rootdir + 'newstyles.css'
    #local logo
    locallogo = rootdir + "logo.png"
    #images
    rootdata = rootdir + "comics/"
    print("Program started @ %s" % strftime("%Y-%m-%d %H:%M:%S", gmtime()))
    #root image, stylesheet and logo urls
    imgroot = "http://www.questionablecontent.net/comics/"
    css = "http://questionablecontent.net/newstyles.css"
    logourl = "http://questionablecontent.net/testing/logo.png"

    # makedirs (as Homestuck() uses) also creates a missing ~/Documents.
    for folder in (rootdir, rootdata):
        if not os.path.exists(folder):
            os.makedirs(folder)
    # Save the raw response bytes in binary mode: correct for the PNG, and it
    # avoids an implicit ASCII encode of the stylesheet text.
    if not os.path.exists(localcss):
        r = requests.get(css)
        with open(localcss, 'wb') as out:
            out.write(r.content)
    if not os.path.exists(locallogo):
        r = requests.get(logourl)
        with open(locallogo, 'wb') as out:
            out.write(r.content)

    #get current comic id.
    print("Identifying amount of content to download...")
    r = requests.get("http://www.questionablecontent.net")
    html = r.text
    # The slicing below is kept exactly as written; it depends on the front
    # page's layout at the time the script was authored.
    start = html.find('<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">')
    end = html.find('<img id="strip" src="http://www.questionablecontent.net/comics/')
    html = html[:start] + html[end:]
    start = html.find('.png">')
    end = html.find('</html>')
    html = html[:start] + html[end:]
    # Convert immediately so the loop condition compares numbers, not a
    # string against 0.
    qontent = int(html[64:68])
    qc_content = qontent
    qmgval = 1
    urlroot = 'http://questionablecontent.net/view.php?comic='
    print(qontent)

    # main download loop
    while qontent > 0:
        url = urlroot + str(qmgval) + '.html'
        print(url)
        localpage = rootdir + str(qmgval) + '.html'
        localimage = rootdata + str(qmgval) + '.png'
        imgurl = imgroot + str(qmgval) + '.png'
        #get the webpage
        if not os.path.exists(localpage):
            # NOTE(review): this requests the bare urlroot, not `url`. The
            # replacements below look tuned to that response, so it is left
            # unchanged - confirm against the live site.
            r = requests.get(urlroot)
            html = r.text
            #fix file path associations and write content to local file
            nextcomic = qmgval + 1
            html = html.replace('./comics/', './comics/' + str(qmgval) + '.png')
            html = html.replace('../testing/logo.png', './logo.png')
            html = html.replace('view.php?comic=1', rootdir + str(nextcomic) + '.html')
            start = html.find('<b>Warning</b>')
            end = html.find('<b>74</b><br />')
            # Only cut when both markers exist; slicing with -1 would
            # duplicate most of the page instead of removing anything.
            if start != -1 and end != -1:
                html = html[:start] + html[end + 5:]
            # Drop non-ASCII characters on write, as Homestuck() does.
            with open(localpage, 'wb') as out:
                out.write(html.encode('ascii', 'ignore'))
        #get the image
        print(imgurl)
        if not os.path.exists(localimage):
            r = requests.get(imgurl)
            with open(localimage, 'wb') as out:
                out.write(r.content)
        qmgval += 1
        qontent -= 1
        myapp.qc_bar()
# Unfinished: WordPress-based sites need more work.
def bug():
    """Start downloading the Bug archive (work in progress).

    Creates ``~/Documents/Bug/`` and its ``comics/`` subfolder, saves the
    site stylesheet, then fetches the first comic page, writes a trimmed copy
    of it as ``0.html`` and extracts the "next" link from the same page. The
    extracted link is not followed yet, so only the first page is saved.

    Sets the module global ``bontent`` to 0. Returns None. Performs network
    requests and writes files.
    """
    global bontent

    print('Initiating download of the complete Bug archive.')
    # Resolve the real home directory instead of assuming "/home/<user>".
    home = os.path.expanduser("~")
    #HTML
    rootdir = home + "/Documents/Bug/"
    #CSS Local
    localcss = rootdir + 'style.css'
    #images
    rootdata = rootdir + "comics/"
    print("Program started @ %s" % strftime("%Y-%m-%d %H:%M:%S", gmtime()))
    css = "http://www.bugcomic.com/wp-content/themes/comicpress-sandy/style.css"

    # makedirs (as Homestuck() uses) also creates a missing ~/Documents.
    for folder in (rootdir, rootdata):
        if not os.path.exists(folder):
            os.makedirs(folder)
    if not os.path.exists(localcss):
        r = requests.get(css)
        # Raw bytes in binary mode: no implicit ASCII encode of the text.
        with open(localcss, 'wb') as out:
            out.write(r.content)

    #get the first comic. I don't know of any way to count comics on WordPress-based sites.
    firsturl = 'http://www.bugcomic.com/comics/letter/'
    bontent = 0
    r = requests.get(firsturl)
    # Strip non-ASCII characters once, keeping the result as text so the
    # searches below work on it directly.
    html = r.text.encode('ascii', 'ignore').decode('ascii')
    htmlbackup = html

    # Trim the page down for local viewing. The offsets are kept exactly as
    # written; they depend on the site's markup.
    start = html.find('<!DOCTYPE html>')
    end = html.find('<div id="comic-1" class="comicpane"><a href="')
    html = html[:start + 44] + html[end + 1:]
    start = html.find('<div id="subcontent-wrapper">')
    end = html.find('<!-- Compression = gzip -->')
    html = html[:start] + html[end + 25:]
    html = html.replace('http://www.bugcomic.com/', rootdir)
    # `with` guarantees the close; the original wrote `q.close` without
    # parentheses, which never closed the file.
    with open(rootdir + str(bontent) + '.html', 'wb') as out:
        out.write(html.encode('ascii'))

    # Pull the "next" link out of the untouched copy of the page.
    html = htmlbackup
    start = html.find('class="navi navi-next" title="Next">Next</a>')
    end = html.find('<!-- Compression = gzip -->')
    html = html[:start] + html[end + 25:]
    start = html.find('<!DOCTYPE html>')
    end = html.find('<td class="comic_navi_right">')
    html = html[:start] + html[end + 1:]
    start = html.find('td class=')
    end = html.find('<a href="')
    html = html[:start] + html[end:]
    start = html.find('<a hr')
    end = html.find('ef="')
    html = html[:start] + html[end + 4:]
    html = html[:-4]
    # Not used yet: the loop that would follow this link is still to be written.
    nexturl = html
class MyApp(QtGui.QMainWindow):
    """Main window: runs the selected downloaders and updates their progress bars."""

    def __init__(self):
        QtGui.QMainWindow.__init__(self)
        form = Ui_PieAndCake()
        self.ui = form
        form.setupUi(self)
        # The download button kicks off every ticked comic.
        form.the_button.clicked.connect(self.display_results)

    def display_results(self):
        """Run, in order, the downloader of each comic whose box is ticked."""
        form = self.ui
        jobs = (
            (form.cake_check, QC),
            (form.pie_check, Homestuck),
            (form.bugCheck, bug),
        )
        for checkbox, download in jobs:
            if checkbox.isChecked():
                download()

    def homeBar(self):
        """Set the Homestuck bar from the globals imgval and h_content."""
        # h_content = total | imgval = number of the current comic
        percent = int(imgval) * 100 / h_content
        self.ui.hBar.setValue(percent)

    def qc_bar(self):
        """Set the Questionable Content bar from the globals qmgval and qc_content."""
        percent = int(qmgval) * 100 / int(qc_content)
        self.ui.qcBar.setValue(percent)

    def bugBar(self):
        """Placeholder for the Bug progress bar; only prints a marker."""
        print("bug comic")
if __name__ == '__main__':
    # `myapp` has to remain a module-level name: Homestuck() and QC() call
    # myapp.homeBar() / myapp.qc_bar() to report progress.
    app = QtGui.QApplication(sys.argv)
    myapp = MyApp()
    myapp.show()
    # Run the Qt event loop and exit with its return code.
    exit_code = app.exec_()
    sys.exit(exit_code)
# -*- coding: utf-8 -*-
# Form implementation generated from reading ui file 'ComicScrape.ui'
#
# Created: Thu Jul 19 19:13:41 2012
# by: PyQt4 UI code generator 4.9.1
#
# WARNING! All changes made in this file will be lost!
from PyQt4 import QtCore, QtGui
# Use QString.fromUtf8 when the binding provides it; otherwise fall back to
# passing strings through unchanged.
try:
    _fromUtf8 = QtCore.QString.fromUtf8
except AttributeError:
    def _fromUtf8(s):
        return s
class Ui_PieAndCake(object):
    """Widget layout of the downloader window (from the 'ComicScrape.ui' form)."""

    def setupUi(self, PieAndCake):
        """Create the child widgets on *PieAndCake* and store them on self."""
        PieAndCake.setObjectName(_fromUtf8("PieAndCake"))
        PieAndCake.resize(368, 140)
        self.centralwidget = QtGui.QWidget(PieAndCake)
        self.centralwidget.setObjectName(_fromUtf8("centralwidget"))

        # (attribute / object name, widget class, x, y, width, height),
        # listed in creation order.
        layout = (
            ("pie_check", QtGui.QCheckBox, 10, 10, 121, 19),
            ("cake_check", QtGui.QCheckBox, 10, 40, 181, 19),
            ("the_button", QtGui.QPushButton, 90, 110, 171, 24),
            ("hBar", QtGui.QProgressBar, 230, 10, 131, 23),
            ("qcBar", QtGui.QProgressBar, 230, 40, 131, 23),
            ("bugCheck", QtGui.QCheckBox, 10, 70, 141, 22),
            ("bugBar", QtGui.QProgressBar, 230, 70, 131, 23),
        )
        for name, widget_class, x, y, width, height in layout:
            widget = widget_class(self.centralwidget)
            widget.setGeometry(QtCore.QRect(x, y, width, height))
            if widget_class is QtGui.QProgressBar:
                # Progress bars start empty.
                widget.setProperty("value", 0)
            widget.setObjectName(_fromUtf8(name))
            setattr(self, name, widget)

        PieAndCake.setCentralWidget(self.centralwidget)
        self.retranslateUi(PieAndCake)
        QtCore.QMetaObject.connectSlotsByName(PieAndCake)

    def retranslateUi(self, PieAndCake):
        """Set the window title and the captions of the check boxes and button."""
        translate = QtGui.QApplication.translate
        encoding = QtGui.QApplication.UnicodeUTF8
        PieAndCake.setWindowTitle(translate("PieAndCake", "Webcomic Downloader", None, encoding))
        captions = (
            (self.pie_check, "Homestuck"),
            (self.cake_check, "Questionable Content"),
            (self.the_button, "Initiate Download"),
            (self.bugCheck, "Bug"),
        )
        for widget, caption in captions:
            widget.setText(translate("PieAndCake", caption, None, encoding))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment