# Source: GitHub Gist spiiin/ce2c43808d2d09fda361 by @spiiin, created March 23, 2015 23:54.
# (GitHub page navigation text removed; it is not part of the script.)
import sys
import re
import urllib
import urllib2
import os.path
import md5
import threading
import time
import vk_param
# Upper bound on the number of download threads running at the same time.
max_threads = 10

# Build the authentication cookie from the credentials stored in vk_param:
# the password is sent as its MD5 hex digest and the "@" of the e-mail
# address is percent-encoded.
md = md5.new()
md.update(vk_param.password)
_hash = md.hexdigest()
_mail = vk_param.email.replace("@", "%40")
_id = vk_param.your_id
cookie_string = ("remixpass=" + _hash + ";"
                 "remixemail=" + _mail + ";"
                 "remixmid=" + str(_id))

# Command line: vk_get_pictures ALBUM_ID SAVE_PATH [START_INDEX]
if len(sys.argv) < 3:
    # Passing the message to sys.exit() prints it to stderr and yields a
    # non-zero exit status, so callers can detect the usage error.
    sys.exit("please enter correct arguments\n"
             "Format \"vk_get_pictures ALBUM_ID SAVE_PATH [START_INDEX]\"")
album_id = sys.argv[1]
save_folder = sys.argv[2]
if len(sys.argv) >= 4:
    try:
        start_ind = int(sys.argv[3])
    except ValueError:
        sys.exit("START_INDEX must be an integer")
else:
    start_ind = 0
# Remember the initial offset: picture numbers are counted from it.
saved_start_ind = start_ind

# Direct address of an image file on a content server.
re_real_url = re.compile(r"http://cs[0-9]+\.vkontakte\.ru/u[0-9]+/[0-9]+/[a-z]_.+\.jpg")
# Relative address of a single photo page inside the album listing.
pic_templ = re.compile(r"photo[-]?[0-9]+_[0-9]+")

# Number of photo links found so far over all album pages.
total_found = 0
# Number of photo links found on the last album page; starts non-zero so the
# "while found_image > 0" crawl loop runs at least once.
found_image = 999
# Total size of all downloaded HTML text (reported at the end).
total_text = 0
def get_pic_addr(url):
    """Return the direct image URL found on a single photo page.

    Fetches *url* with the session cookie, adds the size of the received
    HTML to the global ``total_text`` counter and returns the first match of
    ``re_real_url`` located on a line that also contains the text ``photo``.

    Raises PicNotFoundError when no line of the page matches.
    Errors raised by urllib2 while fetching the page are propagated.
    """
    global total_text
    req = urllib2.Request(url)
    req.add_header("Cookie", cookie_string)
    f = urllib2.urlopen(req)
    try:
        text = f.read()
    finally:
        # Always release the connection, even when reading fails.
        f.close()
    # NOTE(review): GetOnePic.run calls this from worker threads and the
    # counter update is not synchronized, so the total may be approximate.
    total_text = total_text + len(text)
    for s in text.splitlines():
        match = re_real_url.search(s)
        if match and 'photo' in s:
            return match.group(0)
    raise PicNotFoundError("Page not contain pictures")
class PicNotFoundError(Exception):
    """Raised when a fetched photo page contains no picture URL."""
class GetOnePic(threading.Thread):
    """Worker thread that locates and downloads one picture of the album.

    url -- relative address of the photo page (it is appended to
           "http://vkontakte.ru/")
    no  -- number of the picture; used as the prefix of the saved file name
           and in the progress messages
    """

    def __init__(self, url, no):
        threading.Thread.__init__(self)
        self.url = url
        self.no = no

    def run(self):
        """Resolve the real image URL and save the image into save_folder.

        Failures are reported on stdout and never propagate out of the
        thread. A file that already exists is not downloaded again.
        """
        label = str(self.no).rjust(3)
        stars = '*' * 5
        print("%s %s  search begin" % (label, stars))
        try:
            # Get the real picture url from the page that shows it. Use the
            # page given to this thread: the module-level pic_page is rebound
            # by the main loop while workers are still running.
            pic_url = get_pic_addr("http://vkontakte.ru/" + self.url)
        except PicNotFoundError:
            print("This page without picture")
            return
        except Exception:
            print("Connection error")
            return

        # Picture found: name the file "<number>_<rest of the remote name>".
        file_name = str(self.no) + pic_url[pic_url.find("_"):]
        full_file_name = os.path.join(save_folder, file_name)
        if os.path.exists(full_file_name):
            # File was already downloaded earlier.
            print("%s %s %s (exists)" % (label, stars, pic_url))
            return
        try:
            urllib.urlretrieve(pic_url, full_file_name)
        except IOError:
            print("Error while try saving file  %s  to %s"
                  % (pic_url, full_file_name))
        else:
            print("%s %s %s" % (label, stars, pic_url))
################################################################
# Main loop: request the album listing starting at start_ind, start one
# GetOnePic thread per photo link found, then advance start_ind by 20 and
# repeat until a listing page yields no photo links.
print("Connect...")
while found_image > 0:
    params = urllib.urlencode({"id": album_id, "st": start_ind})
    req = urllib2.Request("http://vkontakte.ru/photos.php?act=album&", params)
    req.add_header("Cookie", cookie_string)
    try:
        f = urllib2.urlopen(req)
    except urllib2.URLError:
        sys.exit("Connection error")
    try:
        alb_page_text = f.read()
    finally:
        # Release the connection as soon as the page has been read.
        f.close()
    print("Album main page reading...")
    print("read %s bytes" % len(alb_page_text))
    total_text = total_text + len(alb_page_text)
    found_image = 0
    for cur_line in alb_page_text.splitlines():
        match_obj = pic_templ.search(cur_line)
        if not match_obj:
            continue
        pic_page = match_obj.group(0)
        pic_in_album = saved_start_ind + total_found
        # Throttle: activeCount() includes the main thread, so this allows at
        # most max_threads workers to run at once.
        while threading.activeCount() > max_threads:
            time.sleep(2.5)
        GetOnePic(pic_page, pic_in_album).start()
        found_image = found_image + 1
        total_found = total_found + 1
    start_ind += 20

# Wait until all worker threads have stopped. Sleep between checks instead of
# spinning so the wait does not keep a CPU core busy.
while threading.activeCount() > 1:
    time.sleep(0.1)
print("Founded: %s pictures" % total_found)
print("Download: %s bytes of text" % total_text)
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment