# ThomasG77/extract_airports_pdf.py

## extract_airports_pdf.py
# -*- coding: utf8 -*-

import lxml.html
from lxml.cssselect import CSSSelector

# get some html
import requests

def scrapeAirport(page, timeout=30):
    """Print the absolute URL of every link found in one search-result page.

    Downloads result page number ``page`` of the ``new_afd.asp`` search on
    aeronav.faa.gov (effective dates 07-24-2014 to 09-18-2014, all states),
    collects every ``<a>`` nested inside an element carrying the ``bordered``
    class and prints one ``http://aeronav.faa.gov``-prefixed URL per line.
    Nothing is printed when the page holds no such link.

    Args:
        page: Number of the result page to fetch; substituted into the
            ``page=`` query parameter.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the whole run.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get('http://aeronav.faa.gov/new_afd.asp?effDate=24JUL2014&eff=07-24-2014&end=09-18-2014&search=&submit1=Search&select=&StateSearch=ALL&CitySearch=&ChartSearch=&volsearch=&navaidSearch=&page=%s#results' % page, timeout=timeout)

    # Fail loudly instead of parsing an error page and silently yielding
    # an incomplete list of URLs.
    response.raise_for_status()

    # Build the DOM tree.
    tree = lxml.html.fromstring(response.text)

    # Select the anchors inside the bordered result elements.
    # NOTE: to keep each link together with the rest of its table, select
    # '.bordered' instead and serialise the matches with lxml.html.tostring.
    select_links = CSSSelector('.bordered a')

    # Anchors without an href attribute return None and are skipped; they
    # would otherwise break the string concatenation below.
    hrefs = [link.get('href') for link in select_links(tree)]
    urls = ['http://aeronav.faa.gov' + href for href in hrefs if href]

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    if urls:
        print("\n".join(urls))


# Scrape result pages 1 through 234 (the upper bound of range() is exclusive).
# NOTE(review): 235 is hard-coded -- presumably the number of result pages of
# this search plus one; confirm before reusing with other dates.
# range() is used instead of xrange() so the loop runs on Python 2 and 3.
for indice in range(1, 235):
    scrapeAirport(indice)
	# -*- coding: utf8 -*-

	import lxml.html
	from lxml.cssselect import CSSSelector

	# get some html
	import requests

	def scrapeAirport(page, timeout=30):
	    """Print the absolute URL of every link found in one search-result page.

	    Downloads result page number ``page`` of the ``new_afd.asp`` search on
	    aeronav.faa.gov (effective dates 07-24-2014 to 09-18-2014, all states),
	    collects every ``<a>`` nested inside an element carrying the
	    ``bordered`` class and prints one ``http://aeronav.faa.gov``-prefixed
	    URL per line. Nothing is printed when the page holds no such link.

	    Args:
	        page: Number of the result page to fetch; substituted into the
	            ``page=`` query parameter.
	        timeout: Seconds to wait for the server before giving up, so that
	            a stalled connection cannot hang the whole run.

	    Raises:
	        requests.RequestException: If the request fails, times out, or the
	            server answers with an HTTP error status.
	    """
	    response = requests.get('http://aeronav.faa.gov/new_afd.asp?effDate=24JUL2014&eff=07-24-2014&end=09-18-2014&search=&submit1=Search&select=&StateSearch=ALL&CitySearch=&ChartSearch=&volsearch=&navaidSearch=&page=%s#results' % page, timeout=timeout)

	    # Fail loudly instead of parsing an error page and silently yielding
	    # an incomplete list of URLs.
	    response.raise_for_status()

	    # Build the DOM tree.
	    tree = lxml.html.fromstring(response.text)

	    # Select the anchors inside the bordered result elements.
	    # NOTE: to keep each link together with the rest of its table, select
	    # '.bordered' instead and serialise the matches with
	    # lxml.html.tostring.
	    select_links = CSSSelector('.bordered a')

	    # Anchors without an href attribute return None and are skipped; they
	    # would otherwise break the string concatenation below.
	    hrefs = [link.get('href') for link in select_links(tree)]
	    urls = ['http://aeronav.faa.gov' + href for href in hrefs if href]

	    # Single-argument print(...) behaves the same on Python 2 and Python 3.
	    if urls:
	        print("\n".join(urls))


	# Scrape result pages 1 through 234 (the upper bound of range() is exclusive).
	# NOTE(review): 235 is hard-coded -- presumably the number of result pages
	# of this search plus one; confirm before reusing with other dates.
	# range() is used instead of xrange() so the loop runs on Python 2 and 3.
	for indice in range(1, 235):
	    scrapeAirport(indice)