# spiiin/vk_get_pictures_2009.py

## vk_get_pictures_2009.py
import sys
import re
import urllib
import urllib2
import os.path
import md5
import threading
import time
import vk_param

# --- Settings, login cookie and command-line arguments ----------------------

# Limit compared with threading.activeCount() before a new download thread
# is started.
max_threads = 10

# The cookie carries the hex MD5 digest of the password, not the password.
md = md5.new()
md.update(vk_param.password)
_hash = md.hexdigest()
# "%40" is the percent-encoded form of "@".
_mail = vk_param.email.replace("@", "%40")
_id = vk_param.your_id

# Value of the "Cookie" header attached to every page request.  The pairs are
# joined with "; "; the earlier backslash-continued string literal also
# embedded the leading whitespace of its continuation lines in the value.
cookie_string = "; ".join([
    "remixpass=" + _hash,
    "remixemail=" + _mail,
    "remixmid=" + str(_id),
])

if len(sys.argv) < 3:
    print("please enter correct arguments")
    print("Format \"vk_get_pictures ALBUM_ID SAVE_PATH [START_INDEX]\"")
    # Exit with a non-zero status: wrong usage is an error, not a success.
    sys.exit(1)

album_id = sys.argv[1]
save_folder = sys.argv[2]
if len(sys.argv) >= 4:
    try:
        start_ind = int(sys.argv[3])
    except ValueError:
        # A readable message instead of a traceback for a non-numeric index.
        sys.exit("START_INDEX must be an integer")
else:
    start_ind = 0
# Index of the first requested picture; saved files are numbered from it.
saved_start_ind = start_ind

# Searched in every line of a photo page to find the picture's own URL.
re_real_url = re.compile(r"http://cs[0-9]+\.vkontakte\.ru/u[0-9]+/[0-9]+/[a-z]_.+\.jpg")
# Searched in every line of an album page to find photo page names:
# "photo", an optional minus sign, digits, "_", digits.
pic_templ = re.compile(r"photo[-]?[0-9]+_[0-9]+")

# Number of photo links found so far over all album pages.
total_found = 0
# Number of photo links found on the last album page.  Starts non-zero so
# that the paging loop ("while found_image>0") runs at least once.
found_image = 999

# Total size of all HTML pages read, reported at the end of the run.
total_text = 0

def get_pic_addr(url):
    """Fetch a photo page and return the picture URL found in it.

    The page is requested with the login cookie and its length is added to
    the module-level ``total_text`` counter.  The page is then scanned line
    by line; the first line that contains the text ``photo`` and matches
    ``re_real_url`` supplies the result.

    Args:
        url: address of the photo page to read.

    Returns:
        The part of the qualifying line matched by ``re_real_url``.

    Raises:
        PicNotFoundError: no line of the page qualifies.
        Exception: whatever ``urllib2.urlopen`` or ``read`` raise is not
            caught here and propagates to the caller.
    """
    global total_text

    req = urllib2.Request(url)
    req.add_header("Cookie", cookie_string)
    f = urllib2.urlopen(req)
    try:
        text = f.read()
    finally:
        # Close the response explicitly; it used to be left open, which kept
        # one connection alive per fetched page until garbage collection.
        f.close()

    total_text = total_text + len(text)
    for s in text.splitlines():
        match = re_real_url.search(s)
        if match and 'photo' in s:
            return match.group(0)
    raise PicNotFoundError("Page not contain pictures")

class PicNotFoundError(Exception):
    """Raised when a fetched photo page has no line with a picture URL."""

class GetOnePic(threading.Thread):
    """Worker thread that finds and saves one picture of the album."""

    def __init__(self, url, no):
        """Remember which photo page this worker has to process.

        Args:
            url: photo page name; it is appended to "http://vkontakte.ru/".
            no: number of the picture, used as the prefix of the saved file
                name and in the progress messages.
        """
        threading.Thread.__init__(self)
        self.url = url
        self.no = no

    def run(self):
        """Look up the picture URL on the photo page and download the file.

        Nothing is raised to the caller: a page without a picture, a failed
        page request and a failed download are each reported with a message.
        A file that already exists in ``save_folder`` is not downloaded again.
        """
        label = str(self.no).rjust(3)
        stars = '*' * 5
        print("%s %s %s" % (label, stars, "  search begin"))

        try:
            # Use the page handed to this worker.  The module-level
            # ``pic_page`` is reassigned by the main loop while workers are
            # still starting, so reading it here could fetch the wrong photo.
            pic_url = get_pic_addr("http://vkontakte.ru/" + self.url)
        except PicNotFoundError:
            print("This page without picture")
            return
        except Exception:
            # Any other failure while reading the page is reported the same
            # way; unlike a bare "except:", SystemExit and KeyboardInterrupt
            # are no longer swallowed.
            print("Connection error")
            return

        # File name: picture number followed by the URL tail that starts at
        # the first underscore.
        file_name = str(self.no) + pic_url[pic_url.find("_"):]
        full_file_name = os.path.join(save_folder, file_name)

        if os.path.exists(full_file_name):
            # Already downloaded on an earlier run.
            print("%s %s %s %s" % (label, stars, pic_url, "(exists)"))
            return

        try:
            urllib.urlretrieve(pic_url, full_file_name)
        except IOError:
            print("Error while try saving file  %s  to %s"
                  % (pic_url, full_file_name))
            # Remove a partially written file; otherwise the next run would
            # see it and report the picture as "(exists)".
            try:
                if os.path.exists(full_file_name):
                    os.remove(full_file_name)
            except OSError:
                # Best-effort cleanup: the download failure is already reported.
                pass
        else:
            print("%s %s %s" % (label, stars, pic_url))


################################################################
# Main part of the script: read the album page by page and start one
# GetOnePic worker for every photo link found.  The loop ends after the
# first album page on which no photo link is found.
print("Connect...")
while found_image > 0:
    # "st" is advanced by 20 per request; presumably it is the offset of the
    # first photo of the requested album page (verify against the site).
    params = urllib.urlencode({"id": album_id, "st": start_ind})
    req = urllib2.Request("http://vkontakte.ru/photos.php?act=album&", params)
    req.add_header("Cookie", cookie_string)
    try:
        f = urllib2.urlopen(req)
    except urllib2.URLError:
        sys.exit("Connection error")
    try:
        alb_page_text = f.read()
    finally:
        # The response used to be left open after reading.
        f.close()
    print("Album main page reading...")
    print("read %d bytes" % len(alb_page_text))
    total_text = total_text + len(alb_page_text)

    found_image = 0
    for cur_line in alb_page_text.splitlines():
        match_obj = pic_templ.search(cur_line)
        if not match_obj:
            continue
        pic_page = match_obj.group(0)
        pic_in_album = saved_start_ind + total_found

        # Throttle: wait until the number of live threads (activeCount()
        # also counts the main thread) is not above the limit.
        while threading.activeCount() > max_threads:
            time.sleep(2.5)
        GetOnePic(pic_page, pic_in_album).start()

        found_image = found_image + 1
        total_found = total_found + 1
    start_ind += 20

# Wait until all workers have finished.  Joining blocks without using CPU,
# unlike the former "while threading.activeCount()>1: pass" busy loop.
for worker in threading.enumerate():
    if worker is not threading.currentThread():
        worker.join()

print("Founded: %s pictures" % total_found)
print("Download: %s bytes of text" % total_text)
	import sys
	import re
	import urllib
	import urllib2
	import os.path
	import md5
	import threading
	import time
	import vk_param

	# NOTE(review): from the second run of imports onward the file repeats the
	# script above with every line prefixed by a tab - this looks like a paste
	# artifact; confirm and delete one of the two copies.

	# --- Settings, login cookie and command-line arguments ------------------

	# Limit compared with threading.activeCount() before a new download thread
	# is started.
	max_threads = 10

	# The cookie carries the hex MD5 digest of the password, not the password.
	md = md5.new()
	md.update(vk_param.password)
	_hash = md.hexdigest()
	# "%40" is the percent-encoded form of "@".
	_mail = vk_param.email.replace("@", "%40")
	_id = vk_param.your_id

	# Value of the "Cookie" header attached to every page request.  The pairs
	# are joined with "; "; the earlier backslash-continued string literal also
	# embedded the leading whitespace of its continuation lines in the value.
	cookie_string = "; ".join([
	    "remixpass=" + _hash,
	    "remixemail=" + _mail,
	    "remixmid=" + str(_id),
	])

	if len(sys.argv) < 3:
	    print("please enter correct arguments")
	    print("Format \"vk_get_pictures ALBUM_ID SAVE_PATH [START_INDEX]\"")
	    # Exit with a non-zero status: wrong usage is an error, not a success.
	    sys.exit(1)

	album_id = sys.argv[1]
	save_folder = sys.argv[2]
	if len(sys.argv) >= 4:
	    try:
	        start_ind = int(sys.argv[3])
	    except ValueError:
	        # A readable message instead of a traceback for a bad index.
	        sys.exit("START_INDEX must be an integer")
	else:
	    start_ind = 0
	# Index of the first requested picture; saved files are numbered from it.
	saved_start_ind = start_ind

	# Searched in every line of a photo page to find the picture's own URL.
	re_real_url = re.compile(r"http://cs[0-9]+\.vkontakte\.ru/u[0-9]+/[0-9]+/[a-z]_.+\.jpg")
	# Searched in every line of an album page to find photo page names:
	# "photo", an optional minus sign, digits, "_", digits.
	pic_templ = re.compile(r"photo[-]?[0-9]+_[0-9]+")

	# Number of photo links found so far over all album pages.
	total_found = 0
	# Number of photo links found on the last album page.  Starts non-zero so
	# that the paging loop ("while found_image>0") runs at least once.
	found_image = 999

	# Total size of all HTML pages read, reported at the end of the run.
	total_text = 0

	def get_pic_addr(url):
	    """Fetch a photo page and return the picture URL found in it.

	    The page is requested with the login cookie and its length is added
	    to the module-level ``total_text`` counter.  The page is then scanned
	    line by line; the first line that contains the text ``photo`` and
	    matches ``re_real_url`` supplies the result.

	    Args:
	        url: address of the photo page to read.

	    Returns:
	        The part of the qualifying line matched by ``re_real_url``.

	    Raises:
	        PicNotFoundError: no line of the page qualifies.
	        Exception: whatever ``urllib2.urlopen`` or ``read`` raise is not
	            caught here and propagates to the caller.
	    """
	    global total_text

	    req = urllib2.Request(url)
	    req.add_header("Cookie", cookie_string)
	    f = urllib2.urlopen(req)
	    try:
	        text = f.read()
	    finally:
	        # Close the response explicitly instead of leaving it open.
	        f.close()

	    total_text = total_text + len(text)
	    for s in text.splitlines():
	        match = re_real_url.search(s)
	        if match and 'photo' in s:
	            return match.group(0)
	    raise PicNotFoundError("Page not contain pictures")

	class PicNotFoundError(Exception):
	    """Raised when a fetched photo page has no line with a picture URL."""

	class GetOnePic(threading.Thread):
	    """Worker thread that finds and saves one picture of the album."""

	    def __init__(self, url, no):
	        """Remember which photo page this worker has to process.

	        Args:
	            url: photo page name; it is appended to "http://vkontakte.ru/".
	            no: number of the picture, used as the prefix of the saved
	                file name and in the progress messages.
	        """
	        threading.Thread.__init__(self)
	        self.url = url
	        self.no = no

	    def run(self):
	        """Look up the picture URL on the photo page and download the file.

	        Nothing is raised to the caller: a page without a picture, a
	        failed page request and a failed download are each reported with
	        a message.  A file that already exists in ``save_folder`` is not
	        downloaded again.
	        """
	        label = str(self.no).rjust(3)
	        # The row of five asterisks had been garbled to ''5 in this copy.
	        stars = '*' * 5
	        print("%s %s %s" % (label, stars, " search begin"))

	        try:
	            # Use the page handed to this worker.  The module-level
	            # ``pic_page`` is reassigned by the main loop while workers
	            # are still starting, so reading it could fetch the wrong photo.
	            pic_url = get_pic_addr("http://vkontakte.ru/" + self.url)
	        except PicNotFoundError:
	            print("This page without picture")
	            return
	        except Exception:
	            # Any other failure while reading the page is reported the
	            # same way; unlike a bare "except:", SystemExit and
	            # KeyboardInterrupt are no longer swallowed.
	            print("Connection error")
	            return

	        # File name: picture number followed by the URL tail that starts
	        # at the first underscore.
	        file_name = str(self.no) + pic_url[pic_url.find("_"):]
	        full_file_name = os.path.join(save_folder, file_name)

	        if os.path.exists(full_file_name):
	            # Already downloaded on an earlier run.
	            print("%s %s %s %s" % (label, stars, pic_url, "(exists)"))
	            return

	        try:
	            urllib.urlretrieve(pic_url, full_file_name)
	        except IOError:
	            print("Error while try saving file  %s  to %s"
	                  % (pic_url, full_file_name))
	            # Remove a partially written file; otherwise the next run
	            # would see it and report the picture as "(exists)".
	            try:
	                if os.path.exists(full_file_name):
	                    os.remove(full_file_name)
	            except OSError:
	                # Best-effort cleanup: the failure is already reported.
	                pass
	        else:
	            print("%s %s %s" % (label, stars, pic_url))




	################################################################
	# Main part of the script: read the album page by page and start one
	# GetOnePic worker for every photo link found.  The loop ends after the
	# first album page on which no photo link is found.
	print("Connect...")
	while found_image > 0:
	    # "st" is advanced by 20 per request; presumably it is the offset of
	    # the first photo of the requested album page (verify against the site).
	    params = urllib.urlencode({"id": album_id, "st": start_ind})
	    req = urllib2.Request("http://vkontakte.ru/photos.php?act=album&", params)
	    req.add_header("Cookie", cookie_string)
	    try:
	        f = urllib2.urlopen(req)
	    except urllib2.URLError:
	        sys.exit("Connection error")
	    try:
	        alb_page_text = f.read()
	    finally:
	        # The response used to be left open after reading.
	        f.close()
	    print("Album main page reading...")
	    print("read %d bytes" % len(alb_page_text))
	    total_text = total_text + len(alb_page_text)

	    found_image = 0
	    for cur_line in alb_page_text.splitlines():
	        match_obj = pic_templ.search(cur_line)
	        if not match_obj:
	            continue
	        pic_page = match_obj.group(0)
	        pic_in_album = saved_start_ind + total_found

	        # Throttle: wait until the number of live threads (activeCount()
	        # also counts the main thread) is not above the limit.
	        while threading.activeCount() > max_threads:
	            time.sleep(2.5)
	        GetOnePic(pic_page, pic_in_album).start()

	        found_image = found_image + 1
	        total_found = total_found + 1
	    start_ind += 20

	# Wait until all workers have finished.  Joining blocks without using
	# CPU, unlike the former "while threading.activeCount()>1: pass" loop.
	for worker in threading.enumerate():
	    if worker is not threading.currentThread():
	        worker.join()

	print("Founded: %s pictures" % total_found)
	print("Download: %s bytes of text" % total_text)