Skip to content

Instantly share code, notes, and snippets.

@tofumatt
Created June 14, 2011 22:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tofumatt/0d22ab43994e5f536acb to your computer and use it in GitHub Desktop.
Save tofumatt/0d22ab43994e5f536acb to your computer and use it in GitHub Desktop.
Nightly Scrape
#!/usr/bin/python
from optparse import OptionParser
import os
from sgmllib import SGMLParser
import shutil
import simplejson as json
import urllib2
# Resolved filesystem path of this script; copy_file() uses its directory to
# locate the static assets that get copied next to the generated pages.
path_to_this_script = os.path.realpath(__file__)
# Usage string handed to OptionParser in main().
usage_example = "%prog --output-dir=/tmp/path/example"
# Query string appended to every listing URL that is fetched. The listings
# are mod_autoindex generated HTML containing builds; presumably "C=M;O=D"
# requests sorting by last-modified date, newest first -- confirm against
# the Apache mod_autoindex documentation.
apache_query_string = "?C=M;O=D"
def _desktop_builds(url):
    """
    Return a fresh list of build descriptors for the desktop platforms.

    Every descriptor points at the same listing directory ``url`` (relative
    to the page's ``base_url``). New dictionaries are created on each call
    so pages never share build dictionaries with one another.
    """
    platforms = (
        # (css_class, extension, name, suffix)
        ('windows exe', 'exe', 'Windows', '.win32.installer'),
        ('windows x64 exe', 'exe', 'Windows 64-bit',
         '.win64-x86_64.installer'),
        ('mac dmg', 'dmg', 'Mac', '.mac'),
        ('linux bz2', 'tar.bz2', 'Linux Intel', '.linux-i686'),
        ('linux bz2 x64', 'tar.bz2', 'Linux 64-bit Intel', '.linux-x86_64'),
    )
    return [
        {
            'css_class': css_class,
            'extension': extension,
            'name': name,
            'suffix': suffix,
            'url': url,
        }
        for css_class, extension, name, suffix in platforms
    ]


# Every entry describes one generated page:
#   base_url  - prefix shared by all of the page's listing directories
#   builds    - build descriptors (css_class, extension, name, suffix, url)
#   file      - base name of the generated .html / .json files
#   more_url  - target of the "more stuff" link in the page footer
#   title     - heading shown on the page and in links from other pages
PAGES = [
    # Desktop Site
    {
        'base_url': 'http://ftp.mozilla.org/pub/mozilla.org/',
        'builds': _desktop_builds('firefox/nightly/latest-trunk/'),
        'file': 'index',
        'more_url': 'http://ftp.mozilla.org/pub/mozilla.org/firefox/nightly/latest-trunk/',
        'title': 'Firefox Nightly Builds',
    },
    # Mobile Site
    {
        'base_url': 'http://ftp.mozilla.org/pub/mozilla.org/mobile/nightly/',
        'builds': [
            {
                'css_class': 'android',
                'extension': 'apk',
                'name': 'Android',
                'suffix': '.multi.eabi-arm',
                'url': 'latest-mozilla-central-android/',
            },
            {
                'css_class': 'maemo maemo-gtk',
                'extension': 'tar.bz2',
                'name': 'Maemo GTK',
                'suffix': '.multi.linux-gnueabi-arm',
                'url': 'latest-mozilla-central-maemo5-gtk/',
            },
            {
                'css_class': 'maemo maemo-qt',
                'extension': 'tar.bz2',
                'name': 'Maemo QT',
                'suffix': '.multi.linux-gnueabi-arm',
                'url': 'latest-mozilla-central-maemo5-qt/',
            },
            {
                'css_class': 'windows zip',
                'extension': 'zip',
                'name': 'Windows',
                'suffix': '.win32',
                'url': 'latest-mozilla-central-win32/',
            },
            {
                'css_class': 'mac dmg',
                'extension': 'dmg',
                'name': 'Mac',
                'suffix': '.mac',
                'url': 'latest-mozilla-central-macosx/',
            },
            {
                'css_class': 'linux bz2',
                'extension': 'tar.bz2',
                'name': 'Linux Intel',
                'suffix': '.linux-i686',
                'url': 'latest-mozilla-central-linux/',
            },
        ],
        'file': 'mobile',
        'more_url': 'http://ftp.mozilla.org/pub/mozilla.org/mobile/nightly/',
        'title': 'Mobile Nightly Builds',
    },
    # JS Previews
    {
        'base_url': 'http://ftp.mozilla.org/pub/mozilla.org/',
        'builds': _desktop_builds('firefox/nightly/latest-tracemonkey/'),
        'file': 'js-preview',
        'more_url': 'http://ftp.mozilla.org/pub/mozilla.org/firefox/nightly/latest-tracemonkey',
        'title': 'Firefox JS Preview Builds',
    },
]
class URLLister(SGMLParser):
    """
    Extend SGML Parser to look through FTP listing pages on ftp.mozilla.org
    and get relevant file URLs, datestamps, and sizes.

    The parser inspects the text of every ``<td>`` cell. The first cell whose
    text ends with ``<suffix>.<extension>`` of the wanted build supplies the
    link; the next two cells supply the date and the size. After parsing, the
    results are available as the ``link``, ``date`` and ``size`` attributes,
    each of which stays ``None`` when nothing was found.

    Arguments:
    parse_url -- URL of the listing directory; prepended to the file name
    name      -- display name of the build (stored, not used by the parser)
    build     -- build dictionary providing 'suffix' and 'extension'
    """

    def __init__(self, parse_url, name, build):
        SGMLParser.__init__(self)
        self.build = build
        self.date = None
        self.link = None
        self.parse_url = parse_url
        self.name = name
        self.size = None

    def reset(self):
        """Reset the parser state, including the cell text buffer."""
        SGMLParser.reset(self)
        self.textData = ""

    def start_td(self, attrs):
        """Start collecting text for a new table cell."""
        self.textData = ""

    def end_td(self):
        """Classify the finished cell as file name, date or size."""
        text = self.textData.strip()
        wanted_ending = '%s.%s' % (self.build['suffix'],
                                   self.build['extension'])
        # Match on the stripped text so trailing whitespace inside the cell
        # cannot hide a file name.
        if text.endswith(wanted_ending):
            # Keep only the first match: overwriting the link with a later
            # row would pair it with the date and size of the first row.
            if self.link is None:
                self.link = self.parse_url + text
            return
        if self.link is None:
            return
        if self.date is None:
            # Keep only the first word of the cell; an empty cell yields an
            # empty date instead of raising IndexError.
            words = text.split()
            if words:
                self.date = words[0]
            else:
                self.date = ''
        elif self.size is None:
            self.size = text

    def handle_data(self, text):
        """Accumulate character data belonging to the current cell."""
        self.textData += text
def buildJSON(builds):
    """
    Serialize a list of build dictionaries to a JSON string.

    Each build is copied before it is changed, so the caller's dictionaries
    are left untouched. In the copy the 'link' value is stored under 'url'
    (replacing any existing 'url' value) and the 'link' key is removed; the
    URL-over-link naming is kept for historical reasons.
    """
    def as_json_entry(build):
        entry = dict(build)
        entry['url'] = entry.pop('link')
        return entry

    return json.dumps([as_json_entry(build) for build in builds], indent=0)
def buildHTML(page, other_pages=None):
    """
    Build and return an HTML string with all builds supplied, a "get more"
    builds link, and the title of the page.

    Arguments:
    page        -- page dictionary; 'title', 'more_url', 'file' and 'builds'
                   are read. Every build needs 'link', 'css_class',
                   'extension', 'name', 'size' and 'date'; builds whose
                   'link' is None are left out of the page.
    other_pages -- optional iterable of page dictionaries to link to in the
                   "Other Nightly Builds" list. The entry whose 'file' equals
                   this page's 'file' is skipped. Omitting it (or passing
                   None) produces an empty list.

    Values are inserted into the markup as-is, without HTML escaping.
    """
    header = """
<!DOCTYPE html>
<html>
<head>
<title>Firefox Nightly Builds</title>
<link rel="stylesheet" type="text/css" href="http://www.mozilla.com/style/tignish/content.css" />
<link rel="stylesheet" type="text/css" href="nightly.css" />
</head>
<body>
<div id="main-feature">
<h1>%s</h1>
<p>These builds are for testing purposes only.</p>
</div>
<div id="builds">
<ul>\n""" % page['title']
    footer_start = """
</ul>
<h3>Other Nightly Builds</h3>
<ul id="more-urls">"""
    footer_end = """</ul>
<h3>Can't Find What You Want Here?</h3>
<p id="ftp">We have <a href="%s">more stuff</a> if you don't see what you're looking for.</p>
</div>
</body>
</html>""" % page['more_url']

    # The default used to be iterated directly, which raised TypeError
    # whenever the argument was omitted.
    if other_pages is None:
        other_pages = []

    extension = ""
    items = []
    for build in page['builds']:
        if build['link'] is None:
            continue
        item = '\n<li class="' + build['css_class'] + '"'
        # Start a new row whenever the file extension differs from the
        # previously rendered build.
        if build['extension'] != extension:
            item += ' style="clear: both;"'
            extension = build['extension']
        item += '>\n'
        item += '<a href="' + build['link'] + '">' + build['name'] + '</a>'
        item += build['size'] + 'B'
        item += ' ' + build['extension']
        item += '<br>\n'
        item += '<small>Built on ' + build['date'] + '</small>\n'
        item += '</li>'
        items.append(item)
    middle = ''.join(items)

    more_urls = ''.join([
        '<li><a href="%s">%s</a></li>' % (p['file'] + '.html', p['title'])
        for p in other_pages
        # Don't link to this page.
        if p['file'] != page['file']
    ])

    return header + middle + footer_start + more_urls + footer_end
def copy_file(output_dir, fileName):
    """
    Copy fileName from the directory containing this script to output_dir.

    The file keeps its name at the destination.
    """
    script_dir = os.path.dirname(path_to_this_script)
    source = os.path.join(script_dir, fileName)
    destination = os.path.join(output_dir, fileName)
    shutil.copyfile(source, destination)
def generate_build_files(page, output_path):
    """
    Generate an HTML file and JSON file using the page info provided.

    For every build of the page, the listing at base_url + build url is
    fetched and parsed, and the build dictionary is updated in place with
    the 'date', 'link' and 'size' found (each None when not found). The
    results are then written to '<file>.html' and '<file>.json' inside
    output_path.
    """
    for build in page['builds']:
        listing_url = page['base_url'] + build['url']
        parser = URLLister(listing_url, build['name'], build)
        response = urllib2.urlopen(listing_url + apache_query_string)
        try:
            parser.feed(response.read())
        finally:
            # Close the connection even when reading or parsing fails.
            response.close()
        parser.close()
        build['date'] = parser.date
        build['link'] = parser.link
        build['size'] = parser.size
    # Create the HTML page for this page.
    write_output(output_path, '%s.html' % page['file'],
                 buildHTML(page, PAGES))
    # Create the JSON file afterward.
    write_output(output_path, '%s.json' % page['file'],
                 buildJSON(page['builds']))
def write_output(output_dir, filename, text):
    """
    Helper function that writes a string out to a file.

    The file output_dir/filename is created or truncated and text is written
    to it. The file is closed even when writing fails.
    """
    f = open(os.path.join(output_dir, filename), 'w')
    try:
        f.write(text)
    finally:
        f.close()
def main():
    """
    Command line entry point.

    Requires the --output-dir option, generates the HTML and JSON files for
    every entry of PAGES into that directory, and finally copies the static
    image and stylesheet assets there.
    """
    cli = OptionParser(usage=usage_example)
    cli.add_option("--output-dir", action="store", dest="output_path",
                   help="[Required] specify the output directory")
    options, _ = cli.parse_args()
    output_path = options.output_path
    if output_path is None:
        cli.error("You must specify --output-dir")

    for page in PAGES:
        generate_build_files(page, output_path)

    static_assets = (
        "android.png",
        "blueGradient.png",
        "maemo.png",
        "nightly.css",
        "nightly.png",
    )
    for asset in static_assets:
        copy_file(output_path, asset)


# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment