Created
September 28, 2011 19:43
-
-
Save lukaszb/1249033 to your computer and use it in GitHub Desktop.
Top domains fetcher
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
This script fetches the "List of Internet top-level domains" page from
Wikipedia, parses it, and outputs the top-level domain names in JSON format.
TODO: Add Unicode versions of the internationalized codes.
""" | |
import json | |
import urllib2 | |
from lxml.html import fromstring | |
# Wikipedia page that lists the top-level domains.
URL = 'http://en.wikipedia.org/wiki/List_of_Internet_top-level_domains'
# Local copy of the page, as saved by ``wget URL`` (named after the last
# path segment of the URL).
FILE = URL.rsplit('/', 1)[-1]
def get_data(from_wikipedia=False):
    """Return the raw HTML of the top-level domains list page.

    When ``from_wikipedia`` is true the page is downloaded from ``URL``;
    otherwise it is read from the local file named by ``FILE``.
    """
    if from_wikipedia:
        opener = urllib2.build_opener()
        # Identify as a browser through the User-agent request header.
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        response = opener.open(URL)
        try:
            return response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()
    # Close the file deterministically instead of leaking the handle.
    with open(FILE) as source:
        return source.read()
def parse_wiki_page(data):
    """Extract top-level domain codes from the Wikipedia page markup.

    ``data`` is the HTML of the page. Elements matching ``.wikitable`` are
    picked by position: index 1 holds the main codes, index 2 the
    internationalized ones, index 3 the proposed ones and index 4 the ones
    being tested.

    Returns a list that starts with the main codes (leading and trailing
    dots removed), followed by the first-column text of the other three
    tables in the order given above. Anchors and cells that carry no direct
    text are skipped.

    Raises IndexError when the page has fewer than five matching tables.
    """
    tree = fromstring(data)
    tables = tree.cssselect('.wikitable')
    main = tables[1]
    internationalized = tables[2]
    proposed = tables[3]
    intest = tables[4]

    # Main: codes are the anchor texts in the first column. ``.text`` is
    # None when an element has no text before its first child, so skip
    # those anchors instead of failing on ``None.strip``.
    codes = [
        anchor.text.strip('.')
        for anchor in main.cssselect('td:first-child a')
        if anchor.text is not None
    ]

    # Internationalized, proposed and being tested: take the first-column
    # cell text as is, leaving out cells without direct text so that no
    # None entries end up in the result.
    # Also should add unicoded but leave it for now...
    for table in (internationalized, proposed, intest):
        codes += [
            cell.text
            for cell in table.cssselect('td:first-child')
            if cell.text is not None
        ]
    return codes
def main():
    """Download the Wikipedia page and print its domain codes as JSON."""
    page = get_data(from_wikipedia=True)
    # Single-argument parenthesized print emits the same output as the
    # print statement.
    print(json.dumps(parse_wiki_page(page), indent=2))
# Run the fetcher only when the file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment