Last active
August 29, 2015 14:05
-
-
Save ThomasG77/4a2a21c09769d29597bb to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf8 -*-
import lxml.html | |
from lxml.cssselect import CSSSelector | |
# Used to fetch the HTML pages over HTTP
import requests | |
def scrapeAirport(page, timeout=30):
    """Print the absolute URL of every link on one search result page.

    Downloads result page ``page`` of the ``new_afd.asp`` search on
    aeronav.faa.gov (the query string pins the 07-24-2014 to 09-18-2014
    edition and searches all states), selects every ``<a>`` located inside
    an element with class ``bordered`` and prints one URL per line, each
    prefixed with ``http://aeronav.faa.gov``.

    Args:
        page: Page number substituted into the ``page=`` query parameter.
        timeout: Seconds handed to ``requests.get`` so that a stalled
            connection fails instead of blocking the whole run.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    response = requests.get(
        'http://aeronav.faa.gov/new_afd.asp?effDate=24JUL2014&eff=07-24-2014&end=09-18-2014&search=&submit1=Search&select=&StateSearch=ALL&CitySearch=&ChartSearch=&volsearch=&navaidSearch=&page=%s#results' % page,
        timeout=timeout,
    )

    # Build the DOM tree. To inspect it while debugging, print
    # lxml.html.tostring(tree).
    tree = lxml.html.fromstring(response.text)

    # Every link nested inside the ".bordered" results table.
    select_links = CSSSelector('.bordered a')
    hrefs = [anchor.get('href') for anchor in select_links(tree)]

    # Anchors without an href attribute yield None; skip them rather than
    # fail on the string concatenation.
    # The single-argument parenthesised print works on Python 2 and 3 alike.
    print("\n".join('http://aeronav.faa.gov' + href
                    for href in hrefs if href is not None))

    # Alternative: to keep each link together with the rest of its table
    # content, select '.bordered' itself and serialise every match with
    # lxml.html.tostring instead of extracting the hrefs.
# Scrape result pages 1 through 234 (the upper bound 235 is exclusive).
# range() is used rather than xrange() so the script also runs on Python 3;
# for 234 items the list built on Python 2 is negligible.
# NOTE(review): the page count is hard-coded - presumably what the search
# returned when this was written; confirm before reusing.
for indice in range(1, 235):
    scrapeAirport(indice)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment