Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@sanand0
Created December 9, 2012 13:31
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save sanand0/4244899 to your computer and use it in GitHub Desktop.
Save sanand0/4244899 to your computer and use it in GitHub Desktop.
Scrapes pypi.python.org modules into a CSV file
*.csv
.cache
"""
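Scrape <http://pypi.python.org/> and write CSV rows to stdout, one row per
package: the package link, followed by the maximum value found on the
package's page for each of
- updated date
- package size
- downloads
By default, it scrapes the Scientific packages.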
"""
import os
import sys
import csv
import urllib
import hashlib
import lxml.html
# Downloaded pages are cached under ./.cache so repeated runs do not re-fetch.
# Create the directory first and inspect the failure afterwards: a separate
# "exists?" check followed by makedirs() can race with another process.
try:
    os.makedirs('.cache')
except OSError:
    # An existing directory is fine; anything else (permissions, a regular
    # file named .cache, ...) is a real error and must propagate.
    if not os.path.isdir('.cache'):
        raise
def get(url):
    """Return the parsed HTML tree for *url*, downloading it at most once.

    The page is stored in ``.cache/<sha1 of url>``; when that file already
    exists it is parsed directly and no network request is made.

    :param url: address of the page to fetch.
    :returns: the result of ``lxml.html.parse`` on the cached file.
    :raises: whatever the download or the parser raises; a failed download
        leaves no cache entry behind.
    """
    # hashlib only accepts bytes. A Python 2 ``str`` is already bytes (so
    # existing cache entries keep their names); text strings are encoded.
    key = url if isinstance(url, bytes) else url.encode('utf-8')
    filename = '.cache/' + hashlib.sha1(key).hexdigest()
    if not os.path.exists(filename):
        try:
            from urllib.request import urlretrieve  # Python 3
        except ImportError:
            urlretrieve = urllib.urlretrieve  # Python 2
        # Download to a side file and rename on success, so an interrupted
        # transfer can never be mistaken for a complete cached page.
        partial = filename + '.part'
        try:
            urlretrieve(url, partial)
        except Exception:
            if os.path.exists(partial):
                os.remove(partial)
            raise
        os.rename(partial, filename)
    return lxml.html.parse(filename)
URL = 'http://pypi.python.org/pypi?:action=browse&show=all&c=385' # Scientific packages
# URL = 'http://pypi.python.org/pypi?:action=index' # All packages


def _parse_size(text):
    """Convert a size label such as '12KB', '1.5MB' or '512B' to an integer.

    Units are decimal (KB = 1000, MB = 1000000). A label without a unit
    suffix is parsed as a plain integer.
    """
    text = text.strip()
    # Longest suffixes first: every unit label also ends in 'B'.
    for suffix, factor in (('MB', 1000000), ('KB', 1000), ('B', 1)):
        if text.endswith(suffix):
            return int(round(float(text[:-len(suffix)]) * factor))
    return int(text)


tree = get(URL)
# The first and last rows of the listing table are dropped, and the first link
# in each remaining row is taken as the package link.
# NOTE(review): presumably those two rows are a header and a footer - confirm
# against the live page.
packages = [tr.find('.//a') for tr in tree.findall('.//table[@class="list"]//tr')[1:-1]]
out = csv.writer(sys.stdout, lineterminator='\n')
for package in packages:
    # A row without a link (or a link without href) cannot be followed.
    href = package.get('href') if package is not None else None
    if not href:
        continue
    page = get('http://pypi.python.org' + href)
    updated, size, downloads = [], [], []
    for row in page.findall('.//table[@class="list"]//tr')[1:-1]:
        cells = row.findall('.//td')
        # The last three cells are read as updated date, size and downloads;
        # rows that lack them or have empty cells are skipped.
        if len(cells) < 3:
            continue
        updated_text, size_text, downloads_text = [cell.text for cell in cells[-3:]]
        if updated_text is None or size_text is None or downloads_text is None:
            continue
        updated.append(updated_text)
        size.append(_parse_size(size_text))
        downloads.append(int(downloads_text))
    if updated:
        out.writerow([href, max(updated), max(size), max(downloads)])
        # Flush per package so results stream out during a long scrape.
        sys.stdout.flush()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment