Skip to content

Instantly share code, notes, and snippets.

@starenka
Created August 30, 2012 22:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save starenka/3542460 to your computer and use it in GitHub Desktop.
Save starenka/3542460 to your computer and use it in GitHub Desktop.
Gets "top" .cz/sk sites as measured by TOPList.cz/sk
.py[co]
.idea
#!/usr/bin/env python
# pip install pyquery tablib
'''
Gets "top" .cz/sk sites as measured by TOPList.cz/sk
'''
from __future__ import division
import math
from pyquery import PyQuery as pq
import tablib
def get_sites(count=1, tld='cz'):
    '''
    Scrape the TOPList ranking and return the first ``count`` sites.

    Pages are requested from ``http://toplist.<tld>/all/<offset>`` with the
    offset advancing in steps of PER_PAGE; only as many pages as are needed
    to cover ``count`` are fetched.

    :param count: number of sites wanted; values above 1000 are capped at
        1000, values below 1 produce an empty dataset
    :param tld: top-level domain inserted into the TOPList host name
    :returns: ``tablib.Dataset`` with headers ``url``, ``name`` and
        ``avg_hits``, holding at most ``count`` rows
    '''
    PER_PAGE = 50
    MAX_SITES = 1000

    count = max(0, min(count, MAX_SITES))
    lines = []

    # Round up so that a partially needed page is still fetched.
    pages = int(math.ceil(count / PER_PAGE))

    # range() instead of xrange(): available on both Python 2 and 3, and the
    # sequence is tiny (at most MAX_SITES / PER_PAGE items).
    for page_no in range(pages):
        url = 'http://toplist.%(tld)s/all/%(offset)d' % dict(tld=tld, offset=page_no * PER_PAGE)
        page = pq(url)
        # The table's first row is skipped (presumably the header row --
        # confirm against the page markup).
        for line in page.find('table.tabulka tr')[1:]:
            # Of the cells whose class starts with "s", the 4th and 5th are
            # read as the site link and its hit count.
            tds = pq(line).find('td[class^="s"]')[3:5]
            if len(tds) != 2:
                # Rows lacking either cell cannot be unpacked; skip them
                # rather than abort the whole scrape with a ValueError.
                continue
            web, hits = map(pq, tds)
            lines.append((web.find('a').attr('href'), web.text(), hits.text()))

    # The last page may overshoot, so trim to exactly ``count`` rows.
    return tablib.Dataset(*lines[:count], headers=('url', 'name', 'avg_hits'))
if __name__ == '__main__':
    # Command-line entry point: parse the options, fetch the ranking and
    # write it to stdout in the requested format.
    import argparse

    parser = argparse.ArgumentParser(description='Get top sites from TOPList.cz or TOPList.sk')
    parser.add_argument('-c', '--count', action='store', default=50, type=int,
                        help='number of sites (max 1000)')
    parser.add_argument('-o', '--output_format', action='store', default='csv',
                        choices=('json', 'yaml', 'csv', 'xls'),
                        help='format to render data')
    parser.add_argument('-t', '--tld', action='store', default='cz', type=str, choices=('cz', 'sk'),
                        help='use TopList.cz or Toplist.sk')
    args = parser.parse_args()

    sites = get_sites(count=args.count, tld=args.tld)
    # The chosen format name doubles as the name of the dataset attribute
    # that is read to render the data.
    # Single-argument print(...) produces the same output on Python 2 and
    # is also valid syntax on Python 3, unlike the bare print statement.
    print(getattr(sites, args.output_format))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment