Skip to content

Instantly share code, notes, and snippets.

@andy722
Created February 7, 2011 21:52
Show Gist options
  • Save andy722/815310 to your computer and use it in GitHub Desktop.
Save andy722/815310 to your computer and use it in GitHub Desktop.
Downloads all photos from the Miss Photo NSU-2010 website (which is *really* unusable)
# -*- coding: utf-8 -*-
import urllib, sgmllib
import re
import os, shutil
class IDParser(sgmllib.SGMLParser):
'''Returns list of contestant IDs from main page'''
def __init__(self):
sgmllib.SGMLParser.__init__(self)
self.ids = []
def parse(self, s):
self.feed(s)
self.close()
return self.ids
def reset(self):
sgmllib.SGMLParser.reset(self)
self.ids = []
def start_a(self, attrs):
for k,v in attrs:
if k == "class" and v == "list":
self.__parse_href(attrs)
def __parse_href(self, attrs):
for k,v in attrs:
if k == "href":
id = re.findall(r"\?n=(\d+)\&sort=date", v)[0]
self.ids += [ int(id) ]
class NameParser(sgmllib.SGMLParser):
'''Returns name and latin alias from contestant's page'''
def __init__(self):
sgmllib.SGMLParser.__init__(self)
self.reset()
def reset(self):
sgmllib.SGMLParser.reset(self)
self.name = ""
self.latin = ""
self.in_name = 0
def parse(self, s):
self.feed(s)
self.close()
return self.name, self.latin
def start_h1(self, attrs):
for k,v in attrs:
if k == "class" and v == "name":
self.in_name = 1
def end_h1(self):
self.in_name = 0
def handle_data(self, data):
if self.in_name:
self.name = data
def start_img(self, attrs):
for k,v in attrs:
if k == "src":
self.latin = re.findall(r"foto/(.*?)-\d.jpg", v)[0]
class Miss:
baseurl = 'http://missphoto.nsu.ru/2010/'
def __init__(self, imgdir = 'img'):
self.imgdir = imgdir
def download(self):
f = urllib.urlopen(self.baseurl + 'list.php');
main = f.read()
f.close()
id_parser = IDParser()
ids = id_parser.parse(main)
maxid = max(ids)
if os.path.exists(self.imgdir):
shutil.rmtree(self.imgdir)
for id in sorted(ids):
print "%i/%i\t" % (id, maxid),
self.__parse_id(id)
def __parse_id(self, id):
f = urllib.urlopen(self.baseurl + 'view.php?n=%s' % id );
page = f.read()
f.close()
name_parser = NameParser()
name, latin = name_parser.parse(page)
uname = name.decode('cp1251')
print "%-*s \t %s" % (15, latin, uname),
os.makedirs(self.imgdir + "/" + "%s" % uname)
for i in range(1, 4):
file = '%s-%s.jpg' % (latin, i)
img = urllib.urlopen(self.baseurl + 'foto/' + file)
img_file = open(self.imgdir + "/" + uname + "/" + file, 'w+')
img_file.write(img.read())
img.close()
img_file.close()
print
if __name__ == "__main__":
    # Executed as a script: download everything into the default directory.
    downloader = Miss()
    downloader.download()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment