Created
February 7, 2011 21:52
-
-
Save andy722/815310 to your computer and use it in GitHub Desktop.
Downloads all photos from the Miss Photo NSU-2010 web site (which is *really* unusable).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import urllib, sgmllib | |
import re | |
import os, shutil | |
class IDParser(sgmllib.SGMLParser): | |
'''Returns list of contestant IDs from main page''' | |
def __init__(self): | |
sgmllib.SGMLParser.__init__(self) | |
self.ids = [] | |
def parse(self, s): | |
self.feed(s) | |
self.close() | |
return self.ids | |
def reset(self): | |
sgmllib.SGMLParser.reset(self) | |
self.ids = [] | |
def start_a(self, attrs): | |
for k,v in attrs: | |
if k == "class" and v == "list": | |
self.__parse_href(attrs) | |
def __parse_href(self, attrs): | |
for k,v in attrs: | |
if k == "href": | |
id = re.findall(r"\?n=(\d+)\&sort=date", v)[0] | |
self.ids += [ int(id) ] | |
class NameParser(sgmllib.SGMLParser): | |
'''Returns name and latin alias from contestant's page''' | |
def __init__(self): | |
sgmllib.SGMLParser.__init__(self) | |
self.reset() | |
def reset(self): | |
sgmllib.SGMLParser.reset(self) | |
self.name = "" | |
self.latin = "" | |
self.in_name = 0 | |
def parse(self, s): | |
self.feed(s) | |
self.close() | |
return self.name, self.latin | |
def start_h1(self, attrs): | |
for k,v in attrs: | |
if k == "class" and v == "name": | |
self.in_name = 1 | |
def end_h1(self): | |
self.in_name = 0 | |
def handle_data(self, data): | |
if self.in_name: | |
self.name = data | |
def start_img(self, attrs): | |
for k,v in attrs: | |
if k == "src": | |
self.latin = re.findall(r"foto/(.*?)-\d.jpg", v)[0] | |
class Miss: | |
baseurl = 'http://missphoto.nsu.ru/2010/' | |
def __init__(self, imgdir = 'img'): | |
self.imgdir = imgdir | |
def download(self): | |
f = urllib.urlopen(self.baseurl + 'list.php'); | |
main = f.read() | |
f.close() | |
id_parser = IDParser() | |
ids = id_parser.parse(main) | |
maxid = max(ids) | |
if os.path.exists(self.imgdir): | |
shutil.rmtree(self.imgdir) | |
for id in sorted(ids): | |
print "%i/%i\t" % (id, maxid), | |
self.__parse_id(id) | |
def __parse_id(self, id): | |
f = urllib.urlopen(self.baseurl + 'view.php?n=%s' % id ); | |
page = f.read() | |
f.close() | |
name_parser = NameParser() | |
name, latin = name_parser.parse(page) | |
uname = name.decode('cp1251') | |
print "%-*s \t %s" % (15, latin, uname), | |
os.makedirs(self.imgdir + "/" + "%s" % uname) | |
for i in range(1, 4): | |
file = '%s-%s.jpg' % (latin, i) | |
img = urllib.urlopen(self.baseurl + 'foto/' + file) | |
img_file = open(self.imgdir + "/" + uname + "/" + file, 'w+') | |
img_file.write(img.read()) | |
img.close() | |
img_file.close() | |
if __name__ == "__main__":
    # Script entry point: build a downloader with its defaults and run it.
    downloader = Miss()
    downloader.download()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment