Instantly share code, notes, and snippets.

Embed
What would you like to do?
Downloader for Google Web Fonts
#!/usr/bin/env python3
#
# Downloader for Google Web Fonts
#
# For usage information run with "--help"
#
# Works on Python 2.6 and later, 3 and later
# Requires tinycss (and argparse for Python 2.6) from pip
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#
# Copyright 2012 Kevin Locke <kevin@kevinlocke.name>
from __future__ import with_statement
import argparse
import collections
import contextlib
import errno
import gzip
import io
import itertools
import logging
import os
import re
import shutil
import sys
import tinycss
try:
from httplib import HTTPConnection, HTTPSConnection
except ImportError:
from http.client import HTTPConnection, HTTPSConnection
try:
import urlparse
except ImportError:
import urllib.parse as urlparse
__version__ = '0.1.0'
# ADT for font positional command-line arguments
FontArgument = collections.namedtuple("FontArgument", ["family", "variants"])
# ADT for download information
DownloadInfo = collections.namedtuple("DownloadItem", ["url", "filename"])
# Default HTTP User-Agent string
default_user_agent = \
"DL4GoogleWebFonts/" + __version__
# Mapping from font format (the name used in a CSS format() declaration) to
# the file extension expected for files of that format
fontfmt_extensions = {
    "embedded-opentype": "eot",
    "opentype": "ttf",
    "svg": "svg",
    "truetype": "ttf",
    "woff": "woff",
    "woff2": "woff2",
}
# Mapping from font format to User-Agent string required to get the format.
# The keys of this mapping are also the set of formats which can be requested.
fontfmt_user_agent = {
    # EOT is served to IE 8-
    # IE 8 on Windows 7
    "embedded-opentype": "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)",
    # SVG is served to Safari Mobile 3-4
    # Safari 3 on iPhone
    "svg": "Mozilla/5.0 (iPhone; U; CPU like Mac OS X) AppleWebKit/420+ (KHTML, like Gecko) Version/3.0 Mobile/1A543 Safari/419.3",
    # TTF is served to Android 4-, Opera 11.01-, Safari Mobile 5+, non-Mobile Safari
    "truetype": "Mozilla/5.0 (Linux; U; Android 2.3.5; en-us; HTC Vision Build/GRI40) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    # WOFF is served to Chrome, Firefox, Opera 11.10+
    # Firefox 15 on Ubuntu
    "woff": "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:15.0) Gecko/20100101 Firefox/15.0.1",
    # WOFF2 is served to more recent Chrome, Firefox, Opera
    # Firefox 43 on Ubuntu
    # Note: Firefox 44 and later cause unicode-range to be used
    # See https://stackoverflow.com/a/31455499
    "woff2": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:43.0) Gecko/20100101 Firefox/43.0",
}
# Font formats which require separate requests for each variant: for these,
# the stylesheet is requested once per variant index rather than once with
# every variant of every font.
# FIXME: This is probably UA-dependent. Switch to non-separate where possible
fontfmt_serialize = frozenset(["embedded-opentype", "svg"])
def setup_logging():
    """Point the global ``logger`` at the root logger and attach a console handler.

    Messages are written through a ``logging.StreamHandler`` using the
    format ``LEVEL: message``.
    """
    global logger
    logger = logging.getLogger()
    console = logging.StreamHandler()
    console.setFormatter(logging.Formatter("%(levelname)s: %(message)s"))
    logger.addHandler(console)


setup_logging()
# TODO: Replace with urllib3/Request (want to limit dependencies...)
class ConnectionPool(object):
    """A very simple and naive connection pool for HTTP/HTTPS.

    At most one connection object is kept per (protocol, host) pair.
    Connections are created lazily by get() and stay in the pool until
    close() or close_all() is called.
    """

    def __init__(self):
        # Created per instance: dictionaries declared on the class would be
        # shared by every pool, so closing one pool would close them all.
        self._http_connections = {}
        self._https_connections = {}

    def _connections_for(self, proto):
        """Return the (connection dict, connection class) pair for a protocol.

        Raises ValueError if proto is neither "http" nor "https".
        """
        if proto == "http":
            return self._http_connections, HTTPConnection
        if proto == "https":
            return self._https_connections, HTTPSConnection
        raise ValueError("Unsupported protocol")

    def close(self, proto, host):
        """Close and forget the connection to host, if one is pooled.

        Raises ValueError if proto is neither "http" nor "https".
        """
        connections, _ = self._connections_for(proto)
        conn = connections.pop(host, None)
        if conn:
            conn.close()

    def close_all(self):
        """Close every pooled connection and empty the pool."""
        conns = itertools.chain(
            self._http_connections.values(),
            self._https_connections.values()
        )
        try:
            for conn in conns:
                conn.close()
        finally:
            # Forget the connections even if closing one of them failed
            self._http_connections.clear()
            self._https_connections.clear()

    def get(self, proto, host):
        """Return the pooled connection to host, creating it if necessary.

        Raises ValueError if proto is neither "http" nor "https".
        """
        connections, connection_class = self._connections_for(proto)
        if host not in connections:
            connections[host] = connection_class(host)
        return connections[host]


# Shared global connection pool
connection_pool = ConnectionPool()
class FontFaceRule(object):
    """Record of one parsed @font-face at-rule.

    Holds the at-keyword, the list of parsed declarations from the rule
    body, and the line/column where the rule was found.
    """

    def __init__(self, at_keyword, declarations, line, column):
        self.at_keyword, self.declarations = at_keyword, declarations
        self.line, self.column = line, column
class CSSFontFace3Parser(tinycss.css21.CSS21Parser):
    """CSS 2.1 parser extended to understand @font-face at-rules."""

    def parse_at_rule(self, rule, previous_rules, errors, context):
        """Parse an at-rule, turning @font-face rules into FontFaceRule objects.

        Any other at-rule is delegated to the base parser.  A @font-face
        rule with tokens in its header is rejected with a ParseError.
        Errors found while parsing the rule body are appended to errors.
        """
        if rule.at_keyword != "@font-face":
            return super(CSSFontFace3Parser, self).parse_at_rule(
                rule, previous_rules, errors, context)

        if rule.head:
            unexpected = rule.head[0]
            raise tinycss.css21.ParseError(
                unexpected,
                "Unexpected token {0} in {1} rule header".format(
                    unexpected.type, rule.at_keyword))

        declarations, body_errors = self.parse_declaration_list(rule.body)
        errors.extend(body_errors)
        return FontFaceRule(
            rule.at_keyword, declarations, rule.line, rule.column)
# FIXME: Should return HTTPResponse wrapper which handles decoding
def decode_response(response):
    """Returns a file-like object of the content data in an HTTPResponse

    The Content-Encoding header decides how the body is exposed:

    * "gzip" / "x-gzip": a GzipFile which decompresses the body as it is read
    * "identity" or no header: the response object itself

    The header value is compared case-insensitively, since HTTP content
    coding names are case-insensitive.

    Raises RuntimeError for any other content encoding.
    """
    raw_encoding = response.getheader("Content-Encoding")
    encoding = raw_encoding.strip().lower() if raw_encoding else ""

    if encoding in ("gzip", "x-gzip"):
        if sys.version_info < (3, 2):
            # Before Python 3.2 the whole body is buffered in memory first
            # (presumably the response object cannot be read by GzipFile
            # directly on those versions -- TODO confirm)
            return gzip.GzipFile(fileobj=io.BytesIO(response.read()))
        return gzip.GzipFile(fileobj=response)

    if encoding in ("", "identity"):
        return response

    raise RuntimeError(
        "Server used unsupported content encoding '{0}'".format(raw_encoding))
def download_file(url, filename):
    """
    Downloads a given URL and save it with a given filename if that file
    does not exist

    Returns True if the file was downloaded and written completely, False
    otherwise (file already exists, file could not be created, the server
    returned a non-2xx status, or the transfer failed).  Whenever a download
    that was started does not complete, the partially written (or empty)
    file is removed so that a later run can retry it.
    """
    try:
        # O_EXCL makes creation fail if the file exists, so an existing file
        # is never overwritten
        fd = os.open(filename, os.O_WRONLY | os.O_CREAT | os.O_EXCL, 0o666)
    except OSError as e:
        if e.errno == errno.EEXIST:
            logger.warning("File '%s' already exists, skipping", filename)
        else:
            logger.error("Unable to open '%s': %s", filename, e, exc_info=True)
        return False

    downloaded = False
    urlparts = None
    try:
        with os.fdopen(fd, "wb") as outfile:
            logger.info("Downloading '%s' as '%s'", url, filename)
            urlparts = urlparse.urlsplit(url)
            urlpath = urlparts.path
            if urlparts.query:
                urlpath += "?" + urlparts.query
            conn = connection_pool.get(urlparts.scheme, urlparts.netloc)
            headers = {
                "Accept-Encoding": "gzip",
                "Connection": "keep-alive",
                "User-Agent": default_user_agent,
            }
            conn.request(method="GET", url=urlpath, headers=headers)
            response = conn.getresponse()
            status_ok = response.status // 100 == 2
            logger.log(
                logging.DEBUG if status_ok else logging.ERROR,
                "Server returned status %d (%s) for %s",
                response.status, response.reason, url)
            if status_ok:
                shutil.copyfileobj(decode_response(response), outfile)
                downloaded = True
            else:
                # Discard response body so the connection can be reused
                response.read()
    except Exception as exc:
        logger.error("Error downloading %s: %s", url, exc, exc_info=True)
        # The pooled connection may be left part-way through a response;
        # drop it so the next request starts with a fresh connection.
        if urlparts is not None:
            try:
                connection_pool.close(urlparts.scheme, urlparts.netloc)
            except (ValueError, OSError):
                # Unsupported scheme (nothing was pooled) or already broken
                pass

    if not downloaded:
        # Remove the empty/partial file created above; leaving it behind
        # would make later runs skip this font as "already exists".
        try:
            os.unlink(filename)
        except OSError as e:
            logger.error("Unable to remove incomplete file '%s': %s",
                         filename, e)
    return downloaded
def choose_font_name(names):
    """Choose the "best" filename for a font from a given set

    Selection rules, in order:

    1. Names containing "/" are never used as-is, since the result becomes
       part of a file name.  If every name contains "/", the separators
       are replaced with "_" instead.
    2. Names without spaces are preferred over names with spaces.
    3. Among the remaining names the longest wins; for names of equal
       length the one appearing last in names wins.

    Raises ValueError if names is empty.
    """
    if not names:
        raise ValueError("No font names to choose from")

    safe = [n for n in names if "/" not in n]
    if not safe:
        safe = [n.replace("/", "_") for n in names]

    # Prefer names without spaces; fall back to any safe name
    candidates = [n for n in safe if " " not in n] or safe

    # max() returns the first maximal item, so scan in reverse to let the
    # last of several equally long names win.  (The builtin reduce() is not
    # available on Python 3.)
    choice = max(reversed(candidates), key=len)
    logger.debug("Chose name '{0}' from {1}".format(choice, names))
    return choice
def extract_font_names(srctokens):
    """Collect the font names given by local() functions in a list of CSS tokens

    A local() whose content is a single string token yields that string's
    value; any other content (e.g. an unquoted name) yields the
    concatenated CSS text of the content tokens.
    """
    def name_of(local_token):
        parts = local_token.content
        if len(parts) == 1 and parts[0].type == "STRING":
            return parts[0].value
        return "".join([part.as_css() for part in parts])

    return [
        name_of(token)
        for token in srctokens
        if token.type == "FUNCTION" and token.function_name == "local"
    ]
def extract_font_urls(fontfmt, srctokens):
    """Returns any URLs matching a given format in a given list of CSS tokens

    The tokens are those of a "src" declaration value.  A URL is returned
    only if it is followed by a format() function which lists fontfmt;
    URLs without a format() and URLs for other formats are ignored (and
    reported at debug level).
    """
    url = None      # Last URL token parsed (cleared by non-S token)
    urls = []       # All URLs parsed
    for token in srctokens:
        if token.type == "URI":
            url = token.value
        elif token.type == "FUNCTION" and token.function_name == "format":
            if not url:
                logger.warning("CSS warning: Ignoring format() without associated url()")
            else:
                # CSS3 spec says format can be list of format strings
                # FIXME: Should warn about non-STRING tokens and 0 STRINGs
                urlfontfmts = [t.value for t in token.content if t.type == "STRING"]
                if fontfmt in urlfontfmts:
                    urls.append(url)
                else:
                    logger.debug("Ignoring URL {0} with format({1}) while fetching format({2})".format(
                        url, "".join([c.as_css() for c in token.content]), fontfmt))
            # The format() has been consumed together with its URL
            url = None
        elif token.type == "FUNCTION" and token.function_name == "local":
            # Ignore local name here
            pass
        elif token.type == "S":
            # Ignore space
            pass
        else:
            if url:
                logger.debug("Ignoring URL without format(): {0}".format(url))
                url = None
            if token.type != "DELIM" or token.value != ",":
                logger.warning("CSS warning: Ignoring unexpected token {0}".format(token))
    if url:
        # A URL at the very end of the value never reaches the branch above;
        # report it the same way as any other URL lacking a format()
        logger.debug("Ignoring URL without format(): {0}".format(url))
    return urls
def extract_font_downloads(fontfmt, rule):
    """Returns any font downloads for the specified format in the given CSS rule

    Every "src" declaration of the rule is examined.  The result is a list
    holding at most one DownloadInfo: the first URL declared for fontfmt,
    saved under a name chosen from the rule's local() names (or taken
    from the URL when there are none) plus the URL's extension (or the
    format's usual extension when the URL has none).
    """
    names = []
    urls = []
    for declaration in rule.declarations:
        if declaration.name == "src":
            names.extend(extract_font_names(declaration.value))
            urls.extend(extract_font_urls(fontfmt, declaration.value))

    if not urls:
        logger.warning("Ignoring @font-face without src")
        return []

    if names:
        name = choose_font_name(names)
    else:
        name = urls[0].rsplit("/", 1)[-1].rsplit(".", 1)[0]
        logger.warning("No name found for {0}, using name from URL".format(urls[0]))

    # Ensure urls are unique, keeping declaration order so that the URL
    # which gets downloaded is always the first one declared (taking an
    # arbitrary element of a set would vary between runs)
    unique_urls = []
    for candidate in urls:
        if candidate not in unique_urls:
            unique_urls.append(candidate)
    if len(unique_urls) > 1:
        logger.warning("Ignoring additional URLs for same format: {0}".format(unique_urls[1:]))
    url = unique_urls[0]

    fontfmt_ext = fontfmt_extensions[fontfmt]
    ext = urlparse.urlsplit(url).path.rsplit(".", 1)[-1]
    if "/" in ext:
        # No "." after the last "/": the path has no file extension
        logger.debug("No extension for '{0}', using '{1}' from format".format(url, fontfmt_ext))
        ext = fontfmt_ext
    elif ext != fontfmt_ext:
        logger.warning("URL extension '{0}' does not match format extension '{1}'".format(ext, fontfmt_ext))

    return [DownloadInfo(url=url, filename=name + "." + ext)]
def fetch_fonts_from_css(fontfmt, stylesheet):
    """Downloads any fonts for the given format in the given CSS stylesheet

    Returns the number of files which were downloaded successfully.  A
    stylesheet without any @font-face rule is reported with a warning and
    yields 0.
    """
    downloads = []
    haveff = False
    for rule in stylesheet.rules:
        if rule.at_keyword == "@font-face":
            haveff = True
            downloads.extend(extract_font_downloads(fontfmt, rule))

    if not haveff:
        logger.warning("No @font-face rules found in stylesheet")
        return 0

    # Count only the downloads which actually succeeded
    return sum(
        1 for download in downloads
        if download_file(download.url, download.filename))
def make_css_path(subsets, fonts):
    """Build the request path of the CSS file for the given subsets and fonts

    Each font contributes "Family+Name" (spaces replaced by "+"),
    followed by ":" and its comma-separated variants when it has any.
    Fonts are joined with "|".  A "&subset=" parameter listing the
    comma-separated subsets is appended when subsets is non-empty.
    """
    def family_spec(font):
        spec = font.family.replace(" ", "+")
        if font.variants:
            spec = spec + ":" + ",".join(font.variants)
        return spec

    path = "/css?family=" + "|".join([family_spec(font) for font in fonts])
    if subsets:
        path = path + "&subset=" + ",".join(subsets)
    return path
def fetch_css(fontfmt, subsets, fonts):
    """Downloads CSS files with the given formats, subsets, and fonts

    The request is sent with the User-Agent registered for fontfmt, which
    is what makes the server describe fonts in that format.

    Returns the HTTPResponse for the stylesheet request; the caller is
    responsible for checking its status and reading its body.
    Raises KeyError if no User-Agent is known for fontfmt.
    """
    path = make_css_path(subsets, fonts)
    headers = {
        "Accept": "text/css",
        "Accept-Encoding": "gzip",
        "Connection": "keep-alive",
        "User-Agent": fontfmt_user_agent[fontfmt],
    }
    logger.info("Downloading {0} for {1} format".format(path, fontfmt))
    # Use HTTPS: the stylesheet decides which URLs get downloaded and saved
    # to disk, so it should not be modifiable in transit
    conn = connection_pool.get("https", "fonts.googleapis.com")
    conn.request(method="GET", url=path, headers=headers)
    return conn.getresponse()
def parse_css(response):
    """Converts a CSS HTTPResponse into a tinycss stylesheet

    The character set is taken from the charset parameter of a text/css
    Content-Type header when there is one; otherwise None is passed to the
    parser as the charset.
    """
    content_type = response.getheader("Content-Type", "text/css")
    # Raw string: "\s" in an ordinary string literal is an invalid escape
    # sequence, which newer Python versions warn about
    css_charset_re = r"text/css\s*;\s*charset\s*=\s*([^\s;]+)\s*(?:;|$)"
    css_charset_match = re.match(css_charset_re, content_type, re.I)
    charset = css_charset_match.group(1) if css_charset_match else None
    parser = tinycss.make_parser(CSSFontFace3Parser)
    cssdata = decode_response(response)
    logger.debug("Parsing CSS response with charset '{0}'".format(charset))
    return parser.parse_stylesheet_bytes(cssdata.read(), charset)
def fetch_fonts_format(fontfmt, subsets, fonts):
    """Fetch the stylesheet for one format and download the fonts it lists

    Returns the number of font files downloaded, which is 0 when the
    stylesheet request does not succeed.
    """
    response = fetch_css(fontfmt, subsets, fonts)
    if response.status // 100 == 2:
        return fetch_fonts_from_css(fontfmt, parse_css(response))

    logger.error(
        "Server returned status %d (%s) for CSS file. Incorrect font name?",
        response.status, response.reason)
    # Discard response body
    response.read()
    return 0
def fetch_fonts(fontfmts, subsets, fonts):
    """Download the font files for every requested format

    Formats listed in fontfmt_serialize are requested one variant at a
    time: pass i asks for variant i of each font that has one.  All other
    formats are requested once with every variant of every font.

    Returns the total number of files downloaded.
    """
    total = 0
    for fontfmt in fontfmts:
        if fontfmt not in fontfmt_serialize:
            total += fetch_fonts_format(fontfmt, subsets, fonts)
            continue

        index = 0
        while True:
            # Fonts narrowed to the variant at this index.  A font with no
            # variants at all is included (with none) on the first pass only.
            single_variant_fonts = [
                font._replace(variants=font.variants[index:index + 1])
                for font in fonts
                if len(font.variants) > index
                or (index == 0 and len(font.variants) == 0)
            ]
            if not single_variant_fonts:
                # All variants of all fonts have been fetched
                break
            total += fetch_fonts_format(fontfmt, subsets, single_variant_fonts)
            index += 1
    return total
def parse_font_arg(arg):
    """Turn a "Family" or "Family:variant,variant" argument into a FontArgument

    Everything before the first ":" is the family; the rest is split on
    "," into the list of variants.  Without a ":" the variant list is
    empty.
    """
    family, colon, variant_spec = arg.partition(":")
    # split() already returns a one-element list when there is no comma
    variants = variant_spec.split(",") if colon else []
    return FontArgument(family=family, variants=variants)
def main(*argv):
    """Command-line entry point

    argv is the complete argument vector including the program name, as
    in sys.argv.  Returns the process exit status: 0 when the run
    finished, 1 when an unexpected error occurred.
    """
    parser = argparse.ArgumentParser(description="Download Google Web Fonts")
    parser.add_argument(
        '-f', '--format', action="append", help="Format to download (may appear multiple times)", choices=sorted(fontfmt_user_agent.keys()))
    parser.add_argument(
        '-q', '--quiet', action="count", help="Decrease verbosity (make quieter)")
    parser.add_argument(
        '-s', '--subset', action="append", help="Subset to download (may appear multiple times)")
    parser.add_argument(
        '-v', '--verbose', action="count", help="Increase verbosity")
    parser.add_argument(
        '-V', '--version', action="version",
        version="%(prog)s " + __version__)
    parser.add_argument(
        'font', nargs="+", type=parse_font_arg,
        help="Font to download (in same format as CSS URL)")
    args = parser.parse_args(args=argv[1:])

    # By default, download all formats
    if not args.format:
        args.format = fontfmt_user_agent.keys()

    # Set log level based on verbosity requested (default of INFO).
    # Log levels are 10 apart, so each -q raises and each -v lowers the
    # threshold by one level.
    verbosity = (args.quiet or 0) - (args.verbose or 0)
    logger.setLevel(logging.INFO + verbosity * 10)

    try:
        fetched = fetch_fonts(frozenset(args.format), args.subset, args.font)
        logger.info("Finished downloading {0} font files".format(fetched))
        return 0
    except Exception as e:
        logger.error("Unexpected internal error: {0}".format(str(e)), exc_info=True)
        return 1
    finally:
        try:
            connection_pool.close_all()
        except Exception as e:
            # Closing is best-effort and must not change the exit status,
            # but the failure is recorded rather than silently dropped
            logger.debug("Error closing connections: %s", e)


if __name__ == "__main__":
    sys.exit(main(*sys.argv))
@fedir

This comment has been minimized.

fedir commented Mar 6, 2013

Cool ! Thanks for sharing !

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment