Skip to content

Instantly share code, notes, and snippets.

@moskomule
Created February 22, 2017 07:21
Show Gist options
  • Save moskomule/e50a09e7d505b8d883ae1abf5c7a8a23 to your computer and use it in GitHub Desktop.
Save moskomule/e50a09e7d505b8d883ae1abf5c7a8a23 to your computer and use it in GitHub Desktop.
Save images in specified pages
import os
import re
import shutil
import sys
import urllib.parse
import urllib.request
from html.parser import HTMLParser
def download(url, path):
    """Download the resource at *url* into the directory *path*.

    The local file name is the last segment of the URL's path component;
    any query string or fragment is ignored so that it cannot end up in
    the file name.

    Args:
        url: Address of the resource to fetch.
        path: Directory in which the file is created. It must already
            exist; an existing file of the same name is overwritten.

    Returns:
        The path of the file that was written.

    Raises:
        ValueError: If the URL's path has no final segment (for example
            it ends with ``/``), so no file name can be derived.
        OSError: If the URL cannot be opened or the file cannot be
            written (``urllib.error.URLError`` is a subclass).
    """
    # Derive the name from the path component only, so "img.jpg?w=100"
    # is saved as "img.jpg".
    name = urllib.parse.urlsplit(url).path.split("/")[-1]
    if not name:
        # Fail before any request is made rather than with an obscure
        # error from open() on a directory path.
        raise ValueError("cannot derive a file name from URL: {!r}".format(url))

    file_name = os.path.join(path, name)
    with urllib.request.urlopen(url) as response:
        with open(file_name, 'wb') as local:
            # Stream in chunks instead of holding the whole body in memory.
            shutil.copyfileobj(response, local)
    return file_name
class TestParser(HTMLParser):
    """HTML parser that downloads the image named by ``<link rel="image_src">``.

    Every ``<link>`` start tag whose ``rel`` attribute equals ``image_src``
    and that carries an ``href`` is handed to ``download``.
    """

    def __init__(self):
        super().__init__()
        # Only ever assigned the empty string inside this class.
        self.url = ""

    def handle_starttag(self, tag, attrs):
        """Download the ``href`` of a ``<link rel="image_src">`` tag.

        Args:
            tag: Lower-cased tag name supplied by ``HTMLParser``.
            attrs: List of ``(name, value)`` pairs for the tag.
        """
        if tag != "link":
            return
        attrs = dict(attrs)
        # Use .get(): many <link> tags lack "rel" or "href", and a KeyError
        # here would abort parsing of the whole page.
        if attrs.get("rel") != "image_src":
            return
        href = attrs.get("href")
        if not href:
            return
        # NOTE(review): `download` and `path` are module-level names;
        # `path` is not defined in the visible part of this file, so it is
        # presumably set by the user before running - confirm.
        download(href, path)

    def handle_endtag(self, tag):
        """Clear ``self.url`` when it holds an ``https`` address.

        Nothing in this class stores a non-empty value in ``self.url``,
        so as written this method has no effect.
        """
        if self.url and self.url.startswith("https"):
            self.url = ""
# A single parser instance is reused for every page; it is reset before
# each one (see the loop below).
parser = TestParser()

# NOTE(review): `csv` is not defined in the visible part of this file; it
# is presumably the path of a text file with one page URL per line - confirm.
with open(csv) as f:
    # Drop the trailing newline of each line and skip blank lines.
    urls = [line.strip() for line in f if line.strip()]

for url in urls:
    # Discard any unprocessed markup left behind by a page whose parsing
    # was interrupted; otherwise it would be parsed again together with
    # the next page and fail the same way.
    parser.reset()
    try:
        with urllib.request.urlopen(url) as response:
            # Honour the charset declared in the response headers and fall
            # back to UTF-8; undecodable bytes are replaced rather than
            # causing the whole page to be skipped.
            charset = response.headers.get_content_charset() or "utf-8"
            page = response.read().decode(charset, errors="replace")
        parser.feed(page)
        parser.close()
    except Exception as exc:
        # Best effort: keep going with the remaining pages, but report the
        # failure instead of hiding it.
        print("skipping {}: {!r}".format(url, exc), file=sys.stderr)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment