Skip to content

Instantly share code, notes, and snippets.

@ThomasG77
Last active August 29, 2015 14:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ThomasG77/4a2a21c09769d29597bb to your computer and use it in GitHub Desktop.
Save ThomasG77/4a2a21c09769d29597bb to your computer and use it in GitHub Desktop.
# -*- coding: utf8 -*-
import lxml.html
from lxml.cssselect import CSSSelector
# get some html
import requests
def scrapeAirport(page):
    """Fetch one page of FAA search results and print the link URLs it lists.

    Requests result page ``page`` from ``aeronav.faa.gov/new_afd.asp`` (the
    effective dates are fixed in the query string), selects every ``<a>``
    found inside an element with class ``bordered`` and prints
    ``http://aeronav.faa.gov`` + ``href`` for each one, one URL per line.

    Args:
        page: Page number substituted into the ``page=`` query parameter.

    Returns:
        None. The URLs are written to standard output.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an HTTP error status.
    """
    url = (
        'http://aeronav.faa.gov/new_afd.asp'
        '?effDate=24JUL2014&eff=07-24-2014&end=09-18-2014'
        '&search=&submit1=Search&select=&StateSearch=ALL&CitySearch='
        '&ChartSearch=&volsearch=&navaidSearch=&page=%s#results'
    ) % page

    # Without an explicit timeout, requests waits indefinitely on a stalled
    # server, which would freeze the whole multi-page run.
    response = requests.get(url, timeout=30)
    # Fail loudly instead of parsing an error page as if it were results.
    response.raise_for_status()

    # Build the DOM tree and pick the anchors of the results table.
    tree = lxml.html.fromstring(response.text)
    anchors = CSSSelector('.bordered a')(tree)

    # Anchors without an href make .get() return None; skip them rather than
    # crashing on str + None.
    links = [
        'http://aeronav.faa.gov' + anchor.get('href')
        for anchor in anchors
        if anchor.get('href') is not None
    ]
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("\n".join(links))

    # NOTE: to keep each link together with the rest of its table content,
    # select '.bordered' itself and serialise the matches with
    # lxml.html.tostring() instead of extracting only the hrefs.
# Scrape result pages 1 through 234 (the page count is hard-coded).
# range() iterates the same values as xrange() and exists on both
# Python 2 and Python 3.
for indice in range(1, 235):
    scrapeAirport(indice)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment