# Webcomic Downloader
# Gist anonymous/3354336, created August 15, 2012 00:53.
#!/usr/bin/env python2 | |
import sys | |
from PyQt4 import QtGui, QtCore | |
from scrape_ui import Ui_PieAndCake | |
import requests | |
from time import gmtime, strftime | |
import os | |
import platform | |
import getpass | |
# Login name of the account running the program; the download functions use it
# to build their "/home/<user>/Documents/..." target directories.
username = getpass.getuser()
# Operating system name as reported by platform.system(); none of the code
# visible in this file reads it.
usr_os = platform.system()
def _save_bytes(path, data):
    """Write the byte string *data* to *path*, replacing any existing file."""
    # Binary mode keeps images and Flash files intact on platforms that
    # translate newlines; the with-block closes the handle even on error.
    with open(path, 'wb') as handle:
        handle.write(data)


def _cut_between(text, start_marker, end_marker):
    """Return *text* without the span from *start_marker* up to *end_marker*.

    The end marker itself is kept.  When either marker is missing the text is
    returned unchanged, because slicing with find()'s -1 would silently
    duplicate or drop parts of the page.
    """
    start = text.find(start_marker)
    end = text.find(end_marker, start) if start != -1 else -1
    if end == -1:
        return text
    return text[:start] + text[end:]


def Homestuck():
    """Download the Homestuck archive (panels, Flash files and HTML pages).

    Output goes to /home/<user>/Documents/Homestuck/: HTML pages and the
    site's layout images directly in that folder, comic panels and Flash
    files in its images/ subfolder.  Files that already exist locally are
    skipped.

    Module globals written (MyApp.homeBar reads imgval and h_content):
        content   -- number of pages still to process
        h_content -- total number of pages detected for this run
        imgval    -- zero-padded number of the image being processed
    Progress is reported through the module-level ``myapp`` window.
    """
    # All global declarations must precede the first use of the names.
    global content, imgval, h_content
    import re  # local import: only the page-count detection needs it

    print('Initiating download of the complete Homestuck archive.')
    # HTML pages and layout images
    rootdata = "/home/" + username + "/Documents/Homestuck/"
    print(rootdata)
    # comic panels and Flash files
    rootimg = rootdata + "images/"
    print("Program started @ " + strftime("%Y-%m-%d %H:%M:%S", gmtime()))

    # endval is the comic page's identification number (first page: 001901)
    endval = "001901"
    # root page and image urls
    page = "http://www.mspaintadventures.com/?s=6&p="
    imgroot = "http://www.mspaintadventures.com/storyfiles/hs2/"
    favipath = rootdata + "favicon.ico"
    imgval = 0

    # create the folders for the data if they don't exist
    for folder in (rootdata, rootimg):
        if not os.path.exists(folder):
            os.makedirs(folder)

    # download alignment images used on every page
    fnames = ["v2_blankstrip.gif",
              "v2_blanksquare.gif",
              "spacer.gif",
              "v2_blanksquare2.gif",
              "v2_blanksquare3.gif",
              "favicon.ico"]
    for i, name in enumerate(fnames, 1):
        print("Fetching spacers... (%s/%s)" % (i, len(fnames)))
        spacer = requests.get("http://www.mspaintadventures.com/images/" + name)
        _save_bytes(rootdata + name, spacer.content)

    # Work out how many pages exist: the first "?s=6&p=NNNNNN" link inside the
    # LATEST PAGES section of the front page carries the newest page id.
    print("Identifying amount of content to download...")
    front = requests.get("http://www.mspaintadventures.com/").text
    latest_at = front.find(
        "<!-----------------------LATEST PAGES----------------------------->")
    newest = re.search(r'\?s=6&p=(\d{6})', front[max(latest_at, 0):])
    if newest is None:
        print("Could not determine the newest page number; nothing downloaded.")
        return
    currentcomicval = int(newest.group(1)) - 1901
    print("Downloading " + str(currentcomicval) + " pages of comics.")
    content = currentcomicval
    h_content = content

    # main loop
    while content > 0:
        imgval = str(int(imgval) + 1).rjust(5, '0')
        urlgif = imgroot + imgval + ".gif"
        urlmultigif = imgroot + imgval + "_1.gif"
        flaurl = imgroot + imgval + "/" + imgval + ".swf"
        # Look for existing files where they are actually saved (rootimg).
        gifpath = rootimg + imgval + ".gif"
        multigifpath = rootimg + imgval + "_1.gif"
        flashpath = rootimg + imgval + ".swf"
        is_flash = False

        if (os.path.exists(gifpath) or os.path.exists(multigifpath)
                or os.path.exists(flashpath)):
            print("Image number " + imgval + " skipped.")
            # The HTML below may still need its Flash embed replaced.
            is_flash = os.path.exists(flashpath)
        else:
            # A page has either one .gif, several numbered .gifs, or a .swf.
            # Probe all three and reuse the matching response for saving.
            try:
                gif_resp = requests.get(urlgif)
                multi_resp = requests.get(urlmultigif)
                swf_resp = requests.get(flaurl)
            except requests.exceptions.RequestException:
                print("Something went wrong while downloading the .gif.")
                print(urlgif)
                break
            has_gif = gif_resp.status_code != 404
            has_multi = multi_resp.status_code != 404
            has_flash = swf_resp.status_code != 404

            if has_gif and not has_multi and not has_flash:
                # regular, single .gif
                _save_bytes(gifpath, gif_resp.content)
            elif has_multi and not has_gif and not has_flash:
                # more than one gif on a page: fetch _1, _2, ... until a 404
                multigifid = 1
                part = multi_resp
                while part.status_code != 404:
                    print(urlmultigif)
                    _save_bytes(
                        rootimg + imgval + "_" + str(multigifid) + ".gif",
                        part.content)
                    multigifid += 1
                    urlmultigif = (imgroot + imgval + "_"
                                   + str(multigifid) + ".gif")
                    part = requests.get(urlmultigif)
            elif has_flash and not has_gif and not has_multi:
                # Flash content
                print(flaurl)
                _save_bytes(flashpath, swf_resp.content)
                is_flash = True
            else:
                # nothing found, or several kinds found at once
                print("Something went horribly wrong!")

        # Now we download the html
        root = rootdata + endval + ".html"
        next_endval = str(int(endval) + 1).rjust(6, '0')
        if os.path.exists(root):
            print("html page " + endval + " skipped.")
        else:
            html = requests.get(page + endval).text
            # drop everything after the comic and the site navigation
            html = _cut_between(
                html,
                "<!------------------------end comic content----------------------------------->",
                "</html>")
            html = _cut_between(
                html,
                "<!------------------------begin nav----------------------------------->",
                "<!------------------------end nav----------------------------------->")
            html = html.replace(endval, "")
            # link to the local copy of the next page
            htmlpath = rootdata + next_endval + ".html"
            html = html.replace("?s=6&p=" + next_endval, htmlpath)
            html = html.replace("images/", rootdata)
            html = html.replace("favicon.ico", favipath)
            # Done after the "images/" replacement so the panel links, which
            # point into rootimg (".../images/"), are not rewritten again.
            html = html.replace(
                "http://www.mspaintadventures.com/storyfiles/hs2/", rootimg)
            # flash URL repair code
            if is_flash:
                print("Repairing flash code...")
                start = html.find(
                    '<script language="javascript">AC_FL_RunContent = 0;</script>')
                closing = None
                if start != -1:
                    # whitespace-tolerant search for the embed's closing tags
                    closing = re.search(r'</object>\s*</noscript>', html[start:])
                if closing is not None:
                    html = html[:start] + html[start + closing.start():]
                swflink = ('<a href="' + flashpath + '" target="_self" '
                           'name="Flash Content Link">Click here for flash</a>')
                html = html.replace("</object>", swflink)
            # Some panels contain non-ASCII symbols; drop them and write bytes.
            _save_bytes(root, html.encode('ascii', 'ignore'))
        endval = next_endval
        content -= 1
        myapp.homeBar()
    print("Finsihed downloading @: " + strftime("%Y-%m-%d %H:%M:%S", gmtime()))
# End Homestuck download code
def QC():
    """Download the Questionable Content archive (pages, strips, CSS, logo).

    Output goes to /home/<user>/Documents/Questionable Content/, with the
    comic strips in its comics/ subfolder.  Files that already exist locally
    are skipped.

    Module globals written (MyApp.qc_bar reads qmgval and qc_content):
        qontent    -- number of comics still to process
        qc_content -- total number of comics detected for this run
        qmgval     -- number of the comic being processed
    Progress is reported through the module-level ``myapp`` window.
    """
    global qontent, qc_content, qmgval
    import re  # local import: only the comic-count detection needs it

    print('Initiating download of the complete Qestionable Content archive.')
    # HTML
    rootdir = "/home/" + username + "/Documents/Questionable Content/"
    # CSS Local
    localcss = rootdir + 'newstyles.css'
    # local logo
    locallogo = rootdir + "logo.png"
    # images
    rootdata = rootdir + "comics/"
    print("Program started @ " + strftime("%Y-%m-%d %H:%M:%S", gmtime()))

    # image, stylesheet and logo urls
    imgroot = "http://www.questionablecontent.net/comics/"
    css = "http://questionablecontent.net/newstyles.css"
    logourl = "http://questionablecontent.net/testing/logo.png"

    # makedirs also creates missing parents such as ~/Documents
    for folder in (rootdir, rootdata):
        if not os.path.exists(folder):
            os.makedirs(folder)

    # Static assets are stored as the raw bytes the server sent.
    for remote, local in ((css, localcss), (logourl, locallogo)):
        if not os.path.exists(local):
            payload = requests.get(remote).content
            with open(local, 'wb') as handle:
                handle.write(payload)

    # get current comic id from the strip image on the front page
    print("Identifying amount of content to download...")
    front = requests.get("http://www.questionablecontent.net").text
    newest = re.search(
        r'<img id="strip" src="http://www\.questionablecontent\.net/comics/(\d+)\.',
        front)
    if newest is None:
        print("Could not determine the newest comic number; nothing downloaded.")
        return
    # Kept as an int so the loop condition is a real numeric comparison.
    qontent = int(newest.group(1))
    qc_content = qontent
    qmgval = 1
    urlroot = 'http://questionablecontent.net/view.php?comic='
    print(qontent)

    # main download loop
    while qontent > 0:
        url = urlroot + str(qmgval) + '.html'
        print(url)
        localpage = rootdir + str(qmgval) + '.html'
        localimage = rootdata + str(qmgval) + '.png'
        imgurl = imgroot + str(qmgval) + '.png'

        # get the webpage
        if not os.path.exists(localpage):
            # NOTE(review): the request goes to urlroot (no comic number), not
            # to `url`.  The replacements below patch the number into that
            # number-less page, so this looks deliberate -- confirm before
            # changing it.
            r = requests.get(urlroot)
            html = r.text
            # fix file path associations
            nextcomic = qmgval + 1
            html = html.replace('./comics/', './comics/' + str(qmgval) + '.png')
            html = html.replace('../testing/logo.png', './logo.png')
            html = html.replace('view.php?comic=1',
                                rootdir + str(nextcomic) + '.html')
            # Strip the warning block only when both markers are present;
            # slicing with find()'s -1 would mangle the page.
            start = html.find('<b>Warning</b>')
            end = html.find('<b>74</b><br />')
            if start != -1 and end != -1:
                html = html[:start] + html[end + 5:]
            # Re-encode with the charset the text was decoded from, so
            # non-ASCII characters do not abort the write.
            with open(localpage, 'wb') as handle:
                handle.write(html.encode(r.encoding or 'utf-8', 'replace'))

        # get the image
        print(imgurl)
        if not os.path.exists(localimage):
            image = requests.get(imgurl).content
            with open(localimage, 'wb') as handle:
                handle.write(image)

        qmgval += 1
        qontent -= 1
        myapp.qc_bar()
#unfinished, WordPress blogs needs more work
def bug():
    """Start downloading the Bug comic archive (unfinished).

    Creates /home/<user>/Documents/Bug/ and its comics/ subfolder, stores the
    site stylesheet, saves a trimmed copy of the first comic page as 0.html
    and extracts the address of the following page into ``nexturl``.  Nothing
    uses ``nexturl`` yet and no further pages are fetched.

    Sets the module global ``bontent`` to 0.
    """
    global bontent

    print('Initiating download of the complete Bug archive.')
    # HTML
    rootdir = "/home/" + username + "/Documents/Bug/"
    # CSS Local
    localcss = rootdir + 'style.css'
    # images
    rootdata = "/home/" + username + "/Documents/Bug/comics/"
    print("Program started @ " + strftime("%Y-%m-%d %H:%M:%S", gmtime()))
    # root page and image urls (not used by the code written so far)
    page = "http://www.bugcomic.com/"
    imgroot = "http://www.bugcomic.com/comics/"
    css = "http://www.bugcomic.com/wp-content/themes/comicpress-sandy/style.css"

    # makedirs also creates missing parents such as ~/Documents
    for folder in (rootdir, rootdata):
        if not os.path.exists(folder):
            os.makedirs(folder)
    if not os.path.exists(localcss):
        stylesheet = requests.get(css).content
        with open(localcss, 'wb') as handle:
            handle.write(stylesheet)

    # get the first comic. I don't know of any way to count comics on
    # WordPress-based sites.
    firsturl = 'http://www.bugcomic.com/comics/letter/'
    bontent = 0
    r = requests.get(firsturl)
    # Drop non-ASCII characters once, but keep working on text so the
    # find()/slice logic below behaves the same on Python 2 and 3.
    html = r.text.encode('ascii', 'ignore').decode('ascii')
    htmlbackup = html

    # Trim the page down to the comic pane.
    start = html.find('<!DOCTYPE html>')
    end = html.find('<div id="comic-1" class="comicpane"><a href="')
    html = html[:start + 44] + html[end + 1:]
    start = html.find('<div id="subcontent-wrapper">')
    end = html.find('<!-- Compression = gzip -->')
    html = html[:start] + html[end + 25:]
    html = html.replace('http://www.bugcomic.com/', rootdir)
    # The with-block guarantees the file is closed.
    with open(rootdir + str(bontent) + '.html', 'wb') as handle:
        handle.write(html.encode('ascii'))

    # Isolate the href of the "Next" navigation link from the untouched page.
    html = htmlbackup
    start = html.find('class="navi navi-next" title="Next">Next</a>')
    end = html.find('<!-- Compression = gzip -->')
    html = html[:start] + html[end + 25:]
    start = html.find('<!DOCTYPE html>')
    end = html.find('<td class="comic_navi_right">')
    html = html[:start] + html[end + 1:]
    start = html.find('td class=')
    end = html.find('<a href="')
    html = html[:start] + html[end:]
    start = html.find('<a hr')
    end = html.find('ef="')
    html = html[:start] + html[end + 4:]
    html = html[:-4]
    # Computed for the follow-up download that has not been written yet.
    nexturl = html
class MyApp(QtGui.QMainWindow):
    """Main window: check boxes pick the comics, one button starts them.

    The download functions call homeBar() / qc_bar() on the module-level
    instance to move the progress bars.
    """

    def __init__(self):
        QtGui.QMainWindow.__init__(self)
        self.ui = Ui_PieAndCake()
        self.ui.setupUi(self)
        self.ui.the_button.clicked.connect(self.display_results)

    @staticmethod
    def _percent(done, total):
        """Return done/total as a whole percentage clamped to 0..100.

        Returns 0 when *total* is not positive, avoiding a division by zero.
        """
        total = int(total)
        if total <= 0:
            return 0
        # Floor division keeps the result an int on every Python version.
        return max(0, min(100, (int(done) * 100) // total))

    def display_results(self):
        """Run every download whose check box is ticked, one after another."""
        if self.ui.cake_check.isChecked():
            QC()
        if self.ui.pie_check.isChecked():
            Homestuck()
        if self.ui.bugCheck.isChecked():
            bug()

    def homeBar(self):
        """Update the Homestuck bar from the globals imgval and h_content."""
        # h_content = total | imgval = current comic
        self.ui.hBar.setValue(self._percent(imgval, h_content))

    def qc_bar(self):
        """Update the Questionable Content bar from qmgval and qc_content."""
        # qmgval can run one past the total; QProgressBar ignores values
        # outside its range, so the clamp lets the bar reach 100%.
        self.ui.qcBar.setValue(self._percent(qmgval, qc_content))

    def bugBar(self):
        """Placeholder for the Bug progress bar; only prints a marker."""
        print("bug comic")
if __name__ == '__main__':
    # ``myapp`` has to stay a module-level name: the download functions call
    # myapp.homeBar() and myapp.qc_bar() to report progress.
    app = QtGui.QApplication(sys.argv)
    myapp = MyApp()
    myapp.show()
    # Run the Qt event loop and hand its return code to the shell.
    exit_code = app.exec_()
    sys.exit(exit_code)
# ---------------------------------------------------------------------------
# Second file of the gist: the generated UI module that the downloader imports
# as `scrape_ui` (it defines Ui_PieAndCake).
# -*- coding: utf-8 -*- | |
# Form implementation generated from reading ui file 'ComicScrape.ui' | |
# | |
# Created: Thu Jul 19 19:13:41 2012 | |
# by: PyQt4 UI code generator 4.9.1 | |
# | |
# WARNING! All changes made in this file will be lost! | |
from PyQt4 import QtCore, QtGui | |
# Use QString.fromUtf8 when QtCore provides it; otherwise pass strings through
# unchanged.
try:
    _fromUtf8 = QtCore.QString.fromUtf8
except AttributeError:
    def _fromUtf8(s):
        return s
class Ui_PieAndCake(object):
    """Widget layout of the "Webcomic Downloader" main window.

    Per comic there is one check box and one progress bar (Homestuck,
    Questionable Content, Bug), plus a single button below them.
    """

    def setupUi(self, PieAndCake):
        """Create all child widgets on *PieAndCake* and set their texts."""
        rect = QtCore.QRect

        PieAndCake.setObjectName(_fromUtf8("PieAndCake"))
        PieAndCake.resize(368, 140)
        central = QtGui.QWidget(PieAndCake)
        central.setObjectName(_fromUtf8("centralwidget"))
        self.centralwidget = central

        # Widgets are created in the same order as before.
        pie_check = QtGui.QCheckBox(central)
        pie_check.setGeometry(rect(10, 10, 121, 19))
        pie_check.setObjectName(_fromUtf8("pie_check"))
        self.pie_check = pie_check

        cake_check = QtGui.QCheckBox(central)
        cake_check.setGeometry(rect(10, 40, 181, 19))
        cake_check.setObjectName(_fromUtf8("cake_check"))
        self.cake_check = cake_check

        the_button = QtGui.QPushButton(central)
        the_button.setGeometry(rect(90, 110, 171, 24))
        the_button.setObjectName(_fromUtf8("the_button"))
        self.the_button = the_button

        h_bar = QtGui.QProgressBar(central)
        h_bar.setGeometry(rect(230, 10, 131, 23))
        h_bar.setProperty("value", 0)
        h_bar.setObjectName(_fromUtf8("hBar"))
        self.hBar = h_bar

        qc_bar = QtGui.QProgressBar(central)
        qc_bar.setGeometry(rect(230, 40, 131, 23))
        qc_bar.setProperty("value", 0)
        qc_bar.setObjectName(_fromUtf8("qcBar"))
        self.qcBar = qc_bar

        bug_check = QtGui.QCheckBox(central)
        bug_check.setGeometry(rect(10, 70, 141, 22))
        bug_check.setObjectName(_fromUtf8("bugCheck"))
        self.bugCheck = bug_check

        bug_bar = QtGui.QProgressBar(central)
        bug_bar.setGeometry(rect(230, 70, 131, 23))
        bug_bar.setProperty("value", 0)
        bug_bar.setObjectName(_fromUtf8("bugBar"))
        self.bugBar = bug_bar

        PieAndCake.setCentralWidget(central)
        self.retranslateUi(PieAndCake)
        QtCore.QMetaObject.connectSlotsByName(PieAndCake)

    def retranslateUi(self, PieAndCake):
        """Set the window title and the captions of the labelled widgets."""
        def tr(text):
            # Every caption is translated in the "PieAndCake" context.
            return QtGui.QApplication.translate(
                "PieAndCake", text, None, QtGui.QApplication.UnicodeUTF8)

        PieAndCake.setWindowTitle(tr("Webcomic Downloader"))
        self.pie_check.setText(tr("Homestuck"))
        self.cake_check.setText(tr("Questionable Content"))
        self.the_button.setText(tr("Initiate Download"))
        self.bugCheck.setText(tr("Bug"))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment