Skip to content

Instantly share code, notes, and snippets.

@lukaszb
Created September 28, 2011 19:43
Show Gist options
  • Save lukaszb/1249033 to your computer and use it in GitHub Desktop.
Save lukaszb/1249033 to your computer and use it in GitHub Desktop.
Top domains fetcher
"""
This script fetches a page from Wikipedia, parses it, and outputs
top-level domain names in JSON format.
TODO: Add the Unicode versions of the internationalized codes.
"""
import json
import urllib2
from lxml.html import fromstring
# Address of the Wikipedia page listing the top-level domains; downloaded by
# get_data() when it is called with from_wikipedia=True.
URL = 'http://en.wikipedia.org/wiki/List_of_Internet_top-level_domains'
# Name of a local copy of that page (e.g. retrieved with ``wget URL``); read
# by get_data() when from_wikipedia is false.
FILE = 'List_of_Internet_top-level_domains'
def get_data(from_wikipedia=False):
    """
    Return the raw contents of the top-level domains page.

    If ``from_wikipedia`` is true the page is downloaded from ``URL``,
    otherwise it is read from the local file named by ``FILE``.

    Errors raised by ``urllib2`` or ``open`` are not caught here and
    propagate to the caller.
    """
    if from_wikipedia:
        opener = urllib2.build_opener()
        # Replace urllib2's default User-agent header with a browser-like one.
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        response = opener.open(URL)
        # Close the response explicitly once it has been read (or if reading
        # fails) so the connection is not left open.
        try:
            return response.read()
        finally:
            response.close()
    # Close the file deterministically instead of relying on garbage
    # collection of the anonymous file object.
    with open(FILE) as page:
        return page.read()
def _first_column_texts(table):
    """Return the leading text of every first-column cell of ``table``.

    Cells whose ``text`` is ``None`` (no text before the first child element)
    are skipped so that the result contains only strings.
    """
    return [
        cell.text
        for cell in table.cssselect('td:first-child')
        if cell.text is not None
    ]


def parse_wiki_page(data):
    """
    Extract the domain codes from the markup of the Wikipedia page.

    ``data`` is the page markup as accepted by ``lxml.html.fromstring``.

    Returns a list of strings in this order: codes from the main table
    (anchor texts with leading/trailing dots stripped), then the first-column
    texts of the internationalized, proposed and being-tested tables, taken
    as they appear in the page.

    Raises ``IndexError`` if the page contains fewer than five elements with
    the ``wikitable`` class.
    """
    tree = fromstring(data)
    tables = tree.cssselect('.wikitable')
    if len(tables) < 5:
        raise IndexError(
            "expected at least 5 '.wikitable' tables, found %d" % len(tables)
        )
    # The first table is not used. According to the page layout this script
    # was written for, the second table holds the main codes, the third the
    # internationalized ones, the fourth the proposed ones and the fifth the
    # ones being tested.
    main_table, internationalized, proposed, in_test = tables[1:5]

    codes = []
    # Main: the code is the text of the anchors in the first column.
    for anchor in main_table.cssselect('td:first-child a'):
        # Anchors without leading text (text is None) carry no code.
        if anchor.text is not None:
            codes.append(anchor.text.strip('.'))
    # Internationalized
    # Also should add the Unicode versions, but leave that for now...
    codes += _first_column_texts(internationalized)
    # Proposed
    codes += _first_column_texts(proposed)
    # Being tested
    codes += _first_column_texts(in_test)
    return codes
def main():
    """Download the page, extract the domain codes and print them as JSON."""
    page = get_data(from_wikipedia=True)
    # A single parenthesised argument prints the same text as the statement
    # form.
    print(json.dumps(parse_wiki_page(page), indent=2))


# Run the fetcher only when the file is executed as a script.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment