Skip to content

Instantly share code, notes, and snippets.

@luxinyan
Last active August 29, 2015 14:07
Show Gist options
  • Save luxinyan/fa4c874e02389bb6fa08 to your computer and use it in GitHub Desktop.
Save luxinyan/fa4c874e02389bb6fa08 to your computer and use it in GitHub Desktop.
import os
import re
import sys
import time
import urllib
import urllib2
#get html of page
def getHtml(url):
    """Fetch ``url`` and return the body of the response.

    Args:
        url: Address of the page to open with ``urllib2.urlopen``.

    Returns:
        Whatever ``read()`` on the opened response returns.

    Raises:
        Nothing is caught here: any error raised while opening or reading
        the URL propagates to the caller.
    """
    page = urllib2.urlopen(url)
    try:
        return page.read()
    finally:
        # Always release the connection, even when read() fails part-way.
        page.close()
#get url of image
def getImg(html, save_dir=r'/Users/alex/Desktop/pic'):
    """Download every sinaimg ``.jpg`` image referenced in ``html``.

    Image addresses are taken from ``<img src="..." />`` tags whose ``src``
    contains ``sinaimg`` and ends in ``.jpg``. Each image is stored in
    ``save_dir`` under a name built from the current local time.

    Downloading is best-effort: a failure for one image is reported on
    stderr and the remaining images are still attempted.

    Args:
        html: Page source to scan for image tags.
        save_dir: Directory the images are written to. It is not created
            here; defaults to the location the script has always used.

    Returns:
        None.
    """
    # [^"] keeps a match inside a single src attribute; a plain '.' could run
    # across the closing quote and swallow several tags on the same line.
    imgre = re.compile(r'<img src="([^"]+?sinaimg[^"]+?\.jpg)" />')
    for img in imgre.findall(html):
        time_name = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
        target = _unique_jpg_path(save_dir, time_name)
        try:
            urllib.urlretrieve(img, target)
        except Exception as err:
            # Not a bare except: Ctrl-C and SystemExit must still stop the run.
            sys.stderr.write(
                'Failed to download {0}: {1}\n'.format(img, err))


def _unique_jpg_path(save_dir, stem):
    """Return a ``.jpg`` path in ``save_dir`` that does not exist yet.

    The timestamp used as ``stem`` only has one-second resolution, so several
    images saved within the same second would otherwise overwrite each other.
    The first image keeps the plain ``<stem>.jpg`` name; later ones get a
    ``-1``, ``-2``, ... suffix.
    """
    candidate = os.path.join(save_dir, stem + '.jpg')
    counter = 1
    while os.path.exists(candidate):
        candidate = os.path.join(
            save_dir, '{0}-{1}.jpg'.format(stem, counter))
        counter += 1
    return candidate
def download(url):
    """Fetch the page at ``url`` and hand its HTML to ``getImg``."""
    getImg(getHtml(url))
def main(argv=None):
    """Download images from a run of numbered pages.

    Command line: ``URL BEGIN_PAGE PAGE_COUNT``.

    Pages are visited counting *down* from ``BEGIN_PAGE``: the addresses
    requested are ``URL + 'page-N'`` for N = BEGIN_PAGE, BEGIN_PAGE - 1, ...
    The third argument is the number of pages to visit, not the number of
    the last page.

    Args:
        argv: Argument list including the program name; defaults to
            ``sys.argv``.

    Returns:
        0 after the loop finishes (page failures are reported but do not
        change the result), 2 when the arguments are missing or not integers.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 4:
        prog = argv[0] if argv else 'prog'
        sys.stderr.write('usage: %s URL BEGIN_PAGE PAGE_COUNT\n' % prog)
        return 2

    url = argv[1]
    try:
        begin_page = int(argv[2])
        page_count = int(argv[3])
    except ValueError:
        sys.stderr.write('BEGIN_PAGE and PAGE_COUNT must be integers\n')
        return 2

    for offset in range(page_count):
        page = begin_page - offset
        download_url = url + 'page-%s' % page
        try:
            download(download_url)
        except Exception as err:
            # Keep going with the next page, but let Ctrl-C / SystemExit
            # through so the run can actually be interrupted.
            print('Error %s' % page)
            sys.stderr.write('  %s\n' % err)
        else:
            print(download_url)
    return 0


if __name__ == '__main__':
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment