Skip to content

Instantly share code, notes, and snippets.

Created September 28, 2011 19:43
Show Gist options
  • Save lukaszb/1249033 to your computer and use it in GitHub Desktop.
Save lukaszb/1249033 to your computer and use it in GitHub Desktop.
Top domains fetcher
This script fetches a page from Wikipedia, parses it, and outputs
top-level domain names in JSON format.
TODO: Add the Unicode versions of the internationalized codes.
import json
import urllib2
from lxml.html import fromstring
# Wikipedia page listing the top-level domains. Its last path segment is the
# file name ``wget`` gives to the downloaded copy (see FILE below).
URL = 'https://en.wikipedia.org/wiki/List_of_Internet_top-level_domains'
# Local copy of the page, as retrieved by ``wget URL``
FILE = 'List_of_Internet_top-level_domains'
def get_data(from_wikipedia=False):
    """Return the raw markup of the top-level domains page.

    If ``from_wikipedia`` is true the page is downloaded from ``URL``;
    otherwise the local copy named by ``FILE`` is read.

    Errors raised by ``urllib2`` or by ``open`` propagate to the caller.
    """
    if from_wikipedia:
        opener = urllib2.build_opener()
        # Send a browser-like User-agent instead of urllib2's default one
        # (presumably because the server rejects the default -- confirm).
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        response = opener.open(URL)
        try:
            return response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()
    with open(FILE) as page:
        return page.read()
def parse_wiki_page(data):
    """Extract top-level domain codes from the page markup.

    ``data`` is the HTML of the page as returned by ``get_data``.

    Returns a flat list built, in order, from the main table (text of the
    anchors inside first-column cells) followed by the internationalized,
    proposed and being-tested tables (text of the first-column cells).
    Entries are taken from ``.text`` as-is, so an element with no leading
    text contributes ``None``.

    Raises ``IndexError`` when the page has fewer than five ``.wikitable``
    elements.
    """
    tree = fromstring(data)
    # The second table holds the main codes, the third the internationalized
    # ones, the fourth the proposed ones and the fifth those being tested.
    tables = tree.cssselect('.wikitable')
    # Named *_table so the module-level main() function is not shadowed.
    main_table = tables[1]
    internationalized_table = tables[2]
    proposed_table = tables[3]
    in_test_table = tables[4]

    # Main: the code is the text of the link in each row's first cell.
    codes = [
        anchor.text for anchor in main_table.cssselect('td:first-child a')
    ]

    # Internationalized, proposed and being tested: the code is the text of
    # the first cell itself.
    # TODO: also add the Unicode forms of the internationalized codes.
    for table in (internationalized_table, proposed_table, in_test_table):
        codes += [cell.text for cell in table.cssselect('td:first-child')]
    return codes
def main():
    """Download the page, extract the codes and print them as JSON."""
    page = get_data(from_wikipedia=True)
    # Two-space indentation keeps the printed list readable.
    print(json.dumps(parse_wiki_page(page), indent=2))
# Run the fetcher only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment