public
Created

A script to look for icons specified in link tags with rel="icon" and the sizes attribute, or with rel="apple-touch-icon"

  • Download Gist
iconscrape.py
Python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87
#! /usr/bin/env python
import csv
import urllib2
import sgmllib
 
class LinkParser(sgmllib.SGMLParser):
def parse(self, s):
self.feed(s)
self.close()
 
def __init__(self, verbose=0):
sgmllib.SGMLParser.__init__(self, verbose)
self.icons = []
self.sizes_icon = False
self.apple_icon = False
 
def start_link(self, attribute_tuples):
# Make a dictionary of attributes for easier look-up.
attributes = {}
for name, value in attribute_tuples:
attributes[name] = value;
 
# If we don't have an href, we won't have a useful icon.
if "href" not in attributes:
return
 
# There can be more than one keyword in the rel attribtue value.
rel_value = attributes["rel"].split(" ")
if "icon" in rel_value and "sizes" in attributes:
print "----- ", attributes["sizes"], attributes["href"]
self.sizes_icon = True
self.icons.append({"sizes": attributes["sizes"], "href": attributes["href"]})
 
if "apple-touch-icon" in rel_value or "apple-touch-icon-precomposed" in rel_value:
print "----- apple-touch-icon", attributes["href"]
self.apple_icon = True
self.icons.append({"apple-touch-icon": "true", "href": attributes["href"]})
 
def get_icons(self):
return self.icons
 
def has_sizes_icon(self):
return self.sizes_icon
 
def has_apple_icon(self):
return self.apple_icon
 
# site_url: [{ "sizes": sizes, "href": href }, ... ]
sites_to_icons = {}
sizes_icon_count = 0
apple_icon_count = 0
url_count = 0
 
# urls.csv is a comma-separated list on urls on a single row
reader = csv.reader(open('urls.csv', 'rb'))
urls = reader.next()
 
opener = urllib2.build_opener()
# Un-comment this line to test with an iPhone user agent
# opener.addheaders = [("User-agent", "Apple-iPhone3C1/801.306")]
 
for url in urls:
print "fetching", url
try:
f = opener.open(url)
s = f.read()
f.close()
except:
print "ERROR fetching", url
 
print "parsing", url
parser = LinkParser()
try:
parser.parse(s)
except:
print "ERROR parsing", url
 
sites_to_icons[url] = parser.get_icons()
if parser.has_sizes_icon():
sizes_icon_count = sizes_icon_count + 1
if parser.has_apple_icon():
apple_icon_count = apple_icon_count + 1
url_count = url_count + 1
 
print "sizes_icon_count =", sizes_icon_count, "apple_icon_count =", apple_icon_count, "url_count=", url_count
 
print sites_to_icons

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.