Skip to content

Instantly share code, notes, and snippets.

@pilt
Created June 20, 2012 09:40
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pilt/2959085 to your computer and use it in GitHub Desktop.
Save pilt/2959085 to your computer and use it in GitHub Desktop.
Scrape aws.amazon.com for Amazon Linux AMI ids
{('ebs', 'cluster_compute', 'us-east-1'): 'ami-e965ba80',
('ebs', 'cluster_gpu', 'us-east-1'): 'ami-fd65ba94',
('ebs', 'i386', 'ap-northeast-1'): 'ami-087acb09',
('ebs', 'i386', 'ap-southeast-1'): 'ami-b83374ea',
('ebs', 'i386', 'eu-west-1'): 'ami-fd231b89',
('ebs', 'i386', 'sa-east-1'): 'ami-aa855bb7',
('ebs', 'i386', 'us-east-1'): 'ami-ed65ba84',
('ebs', 'i386', 'us-west-1'): 'ami-978cd4d2',
('ebs', 'i386', 'us-west-2'): 'ami-38c64a08',
('ebs', 'x86_64', 'ap-northeast-1'): 'ami-e47acbe5',
('ebs', 'x86_64', 'ap-southeast-1'): 'ami-be3374ec',
('ebs', 'x86_64', 'eu-west-1'): 'ami-f9231b8d',
('ebs', 'x86_64', 'sa-east-1'): 'ami-a6855bbb',
('ebs', 'x86_64', 'us-east-1'): 'ami-e565ba8c',
('ebs', 'x86_64', 'us-west-1'): 'ami-e78cd4a2',
('ebs', 'x86_64', 'us-west-2'): 'ami-3ac64a0a',
('instance', 'i386', 'ap-northeast-1'): 'ami-087bca09',
('instance', 'i386', 'ap-southeast-1'): 'ami-b43374e6',
('instance', 'i386', 'eu-west-1'): 'ami-fb231b8f',
('instance', 'i386', 'sa-east-1'): 'ami-a8855bb5',
('instance', 'i386', 'us-east-1'): 'ami-db65bab2',
('instance', 'i386', 'us-west-1'): 'ami-e58cd4a0',
('instance', 'i386', 'us-west-2'): 'ami-36c64a06',
('instance', 'x86_64', 'ap-northeast-1'): 'ami-047bca05',
('instance', 'x86_64', 'ap-southeast-1'): 'ami-b23374e0',
('instance', 'x86_64', 'eu-west-1'): 'ami-ff231b8b',
('instance', 'x86_64', 'sa-east-1'): 'ami-ae855bb3',
('instance', 'x86_64', 'us-east-1'): 'ami-f565ba9c',
('instance', 'x86_64', 'us-west-1'): 'ami-d98cd49c',
('instance', 'x86_64', 'us-west-2'): 'ami-30c64a00'}
import re
import requests
from pyquery import PyQuery
def amazon_linux_images():
    """Get AMIs for Amazon Linux.

    Keys are three-tuples like ``(STORAGE, ARCH, REGION)`` and values are AMI
    ids.

    Example::

        {('ebs', 'cluster_compute', 'us-east-1'): 'ami-e965ba80',
         ('ebs', 'cluster_gpu', 'us-east-1'): 'ami-fd65ba94',
         ('ebs', 'i386', 'ap-northeast-1'): 'ami-087acb09',
         ...
         ('instance', 'x86_64', 'us-west-1'): 'ami-d98cd49c',
         ('instance', 'x86_64', 'us-west-2'): 'ami-30c64a00'}

    AMIs are fetched remotely by scraping aws.amazon.com.

    Returns:
        A dict mapping ``(storage, arch, region)`` string tuples to AMI id
        strings. Table cells without a link are omitted.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched
            (connection problem, timeout, or an HTTP error status).
        AssertionError: If the page no longer has the expected layout (table
            count, header cells, cell count, or launch-link format).
    """
    # Expected text of the header cells, in column order.
    expected_headers = [
        "Region",
        "EBS-Backed32-bit",
        "EBS-Backed64-bit",
        "Instance Store32-bit",
        "Instance Store64-bit",
        "Cluster ComputeEBS-Backed64-bit",
        "Cluster GPUEBS-Backed64-bit",
    ]
    # (STORAGE, ARCH) for each column; None marks the region-name column,
    # which carries no AMI link.
    column_info = [
        None,
        ("ebs", "i386"),
        ("ebs", "x86_64"),
        ("instance", "i386"),
        ("instance", "x86_64"),
        ("ebs", "cluster_compute"),
        ("ebs", "cluster_gpu"),
    ]
    # The region and the AMI id are both read from the launch link's URL.
    url_regex = re.compile(r".*region=(.+)#launchAmi=(.+)")

    # Create PyQuery object to work with.
    scrape_url = "http://aws.amazon.com/amazon-linux-ami/"
    # NOTE(review): presumably requested in English so the header texts match
    # ``expected_headers`` -- confirm.
    request_headers = {
        "Accept-Language": "en-us",
    }
    # A timeout keeps an unresponsive server from hanging the caller forever.
    response = requests.get(scrape_url, headers=request_headers, timeout=30)
    response.raise_for_status()
    pq = PyQuery(response.content)

    # Layout checks raise AssertionError explicitly (instead of using
    # ``assert``) so they still run under ``python -O`` while callers keep
    # seeing the same exception type.

    # Get table with AMIs.
    tables = pq("table")
    if len(tables) != 1:
        raise AssertionError(
            "expected exactly one table, found %d" % len(tables))
    rows = PyQuery(tables[0]).find("tr")
    if len(rows) == 0:
        raise AssertionError("AMI table has no rows")
    header_row, ami_rows = rows[0], rows[1:]

    # Verify that the header row looks as expected.
    actual_headers = [
        cell.text_content() for cell in header_row.findall("td")]
    if len(actual_headers) != len(expected_headers):
        raise AssertionError("wrong number of cells")
    for actual, expected in zip(actual_headers, expected_headers):
        if actual != expected:
            raise AssertionError(
                "unexpected header %r, expected %r" % (actual, expected))

    amis = {}
    for row in ami_rows:
        cells = row.findall("td")
        if len(cells) > len(column_info):
            raise AssertionError("wrong number of cells")
        for info, cell in zip(column_info, cells):
            link = cell.find("a")
            # Compare against None explicitly: an element without children
            # is falsy, so ``not link`` would also skip valid links.
            if not info or link is None:
                continue
            href = link.get("href")
            match = url_regex.match(href) if href else None
            if match is None:
                raise AssertionError("unexpected launch link: %r" % (href,))
            region, ami_id = match.groups()
            amis[info + (region,)] = ami_id
    return amis
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment