Create a gist now

Instantly share code, notes, and snippets.

@hzqtc /
Last active Dec 16, 2015

What would you like to do?
Download Jiandan OOXX gallery.
#!/usr/bin/env python
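# Example (substitute the gallery page URL and this script's file name):
# curl -s <PAGE_URL> | ./<SCRIPT> -o 5 -r 2.0 -u | wget -nv -i -
# This will download all pictures with oo >= 5 and an oo/xx ratio >= 2.0 in the last page of Jiandan OOXX.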
import HTMLParser
import sys
import getopt
class OOXXImage(object):
    """One gallery picture together with its OO and XX vote counts.

    Attributes:
        oo: OO count of the picture (int, starts at 0).
        xx: XX count of the picture (int, starts at 0).
        url: address of the picture (str, starts empty).
    """

    def __init__(self):
        self.oo = 0
        self.xx = 0
        self.url = ""

    def test(self, ooMin, xxMax, ratio):
        """Tell whether this picture passes all three filters.

        Args:
            ooMin: smallest acceptable OO count (inclusive).
            xxMax: largest acceptable XX count (inclusive).
            ratio: smallest acceptable OO/XX quotient (inclusive).

        Returns:
            True when ``oo >= ooMin``, ``xx <= xxMax`` and ``oo / xx >= ratio``.
        """
        # An XX count of zero is treated as one for the division only.  The
        # stored count is left untouched so that it is still reported as 0
        # and still compared as 0 against xxMax.
        divisor = self.xx if self.xx != 0 else 1
        return (
            self.oo >= ooMin
            and self.xx <= xxMax
            and float(self.oo) / float(divisor) >= ratio
        )
class JDOOXXParser(HTMLParser.HTMLParser):
    """HTML parser that collects the pictures of a gallery page.

    A post is an ``<li>`` whose id starts with ``comment-``.  Inside a post,
    an ``<img>`` without a ``class`` attribute supplies the picture URL, and
    ``<span>`` elements whose ids start with ``cos_support-`` /
    ``cos_unsupport-`` hold the OO / XX counts as their text.

    After feeding the page, ``imageList`` holds one OOXXImage per post that
    contained a picture URL, in document order.
    """

    def __init__(self):
        # Called explicitly on the base class: the parser's internal buffers
        # are set up there, and feed() fails without them.
        HTMLParser.HTMLParser.__init__(self)
        self.withinPostLi = False         # True while inside a post <li>
        self.fetchData = None             # "oo"/"xx" when the next text is a count
        self.currentImage = OOXXImage()   # picture being assembled
        self.imageList = []               # finished pictures

    def handle_starttag(self, tag, attrs):
        """Track post boundaries and pick up the URL / count markers."""
        attrMap = dict(attrs)
        # Attribute values are None for value-less attributes, hence `or ""`.
        elemId = attrMap.get("id") or ""
        if self.withinPostLi:
            if tag == "img" and "class" not in attrMap:
                self.currentImage.url = attrMap.get("src") or ""
            elif tag == "span" and elemId.startswith("cos_support-"):
                self.fetchData = "oo"
            elif tag == "span" and elemId.startswith("cos_unsupport-"):
                self.fetchData = "xx"
        if tag == "li" and elemId.startswith("comment-"):
            self.withinPostLi = True

    def handle_endtag(self, tag):
        """Close the current post and store its picture, if it had one."""
        if tag == "li" and self.withinPostLi:
            # Posts without a picture URL are skipped; they would only
            # produce empty lines in the output.
            if self.currentImage.url:
                self.imageList.append(self.currentImage)
            self.withinPostLi = False
            self.fetchData = None
            self.currentImage = OOXXImage()

    def handle_data(self, data):
        """Store the text following a count marker as the OO or XX count."""
        if self.fetchData not in ("oo", "xx"):
            return
        try:
            count = int(data)
        except ValueError:
            # Non-numeric text where a count was expected: keep the
            # previous value rather than aborting the whole page.
            count = None
        if count is not None:
            setattr(self.currentImage, self.fetchData, count)
        self.fetchData = None
def usage():
    """Print the command-line help text, one line per option, to stdout."""
    helpLines = (
        "A parser for Jiandan ( MM Gallery.",
        "Read from standard input and print image informations.",
        " -h, --help Print this infomation.",
        " -o, --oomin=INT Set the minimum OO value; images with a lower OO value will be excluded.",
        " -x, --xxmax=INT Set the maximum XX value; images with a greater XX value will be excluded.",
        " -r, --ratio=FLOAT Set the OO/XX ratio; images with lower ratio will be excluded",
        " -u, --urlonly Only print the URLs; both OO and XX value will be omitted.",
    )
    for line in helpLines:
        # A single parenthesised argument prints the same text whether
        # `print` is a statement or a function.
        print(line)
def _main(argv):
    """Filter the gallery page read from standard input and print the matches.

    Args:
        argv: command-line arguments without the program name.

    Returns:
        Process exit status: 0 on success (including ``-h``), 2 when the
        options are invalid.
    """
    try:
        opts, _ = getopt.getopt(
            argv, "ho:x:r:u", ["help", "oomin=", "xxmax=", "ratio=", "urlonly"]
        )
    except getopt.GetoptError as err:
        # Diagnostics go to stderr: stdout is meant to be piped onwards
        # (see the example at the top of the file).
        sys.stderr.write("%s\n" % err)
        sys.stderr.write("Use -h for help.\n")
        return 2

    ooMin = 0
    xxMax = 1000
    ratio = -1
    urlOnly = False
    try:
        for o, a in opts:
            if o in ("-h", "--help"):
                usage()
                return 0
            elif o in ("-o", "--oomin"):
                ooMin = int(a)
            elif o in ("-x", "--xxmax"):
                xxMax = int(a)
            elif o in ("-r", "--ratio"):
                ratio = float(a)
            elif o in ("-u", "--urlonly"):
                urlOnly = True
    except ValueError as err:
        # A non-numeric value was given to -o, -x or -r.
        sys.stderr.write("%s\n" % err)
        sys.stderr.write("Use -h for help.\n")
        return 2

    parser = JDOOXXParser()
    parser.feed(sys.stdin.read())
    parser.close()

    for img in parser.imageList:
        if img.test(ooMin, xxMax, ratio):
            if urlOnly:
                print(img.url)
            else:
                print("oo = %3d, xx = %3d, %s" % (img.oo, img.xx, img.url))
    return 0


if __name__ == "__main__":
    sys.exit(_main(sys.argv[1:]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment